3450 lines
121 KiB
C
3450 lines
121 KiB
C
// The basics of the 16-bit Galois Field arithmetic are based on Galois.c by James S. Plank.
|
||
// Modified by Yutaka Sawada to support MMX, SSE2, and SSSE3.
|
||
|
||
/* Galois.c
|
||
* James S. Plank
|
||
|
||
Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure Coding Techniques
|
||
|
||
Revision 1.2A
|
||
May 24, 2011
|
||
|
||
James S. Plank
|
||
Department of Electrical Engineering and Computer Science
|
||
University of Tennessee
|
||
Knoxville, TN 37996
|
||
plank@cs.utk.edu
|
||
|
||
Copyright (c) 2011, James S. Plank
|
||
All rights reserved.
|
||
|
||
Redistribution and use in source and binary forms, with or without
|
||
modification, are permitted provided that the following conditions
|
||
are met:
|
||
|
||
- Redistributions of source code must retain the above copyright
|
||
notice, this list of conditions and the following disclaimer.
|
||
|
||
- Redistributions in binary form must reproduce the above copyright
|
||
notice, this list of conditions and the following disclaimer in
|
||
the documentation and/or other materials provided with the
|
||
distribution.
|
||
|
||
- Neither the name of the University of Tennessee nor the names of its
|
||
contributors may be used to endorse or promote products derived
|
||
from this software without specific prior written permission.
|
||
|
||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
|
||
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
|
||
AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
|
||
WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||
POSSIBILITY OF SUCH DAMAGE.
|
||
|
||
*/
|
||
|
||
#define _WIN32_WINNT 0x0601 // Windows 7 or later
|
||
|
||
#include <stdlib.h>
|
||
#include <stdio.h>
|
||
|
||
#include <windows.h>
|
||
#include <intrin.h> // 組み込み関数(intrinsic)を使用する場合インクルード
|
||
|
||
#include "gf16.h"
|
||
#include "gf_jit.h" // ParPar の JIT コード用
|
||
|
||
extern unsigned int cpu_flag; // declared in common2.h
|
||
|
||
#ifndef _WIN64 // 32-bit 版なら
|
||
#pragma warning(disable:4731) // inhibit VC's "ebp modified" warning
|
||
#pragma warning(disable:4799) // inhibit VC's "missing emms" warning
|
||
#endif
|
||
|
||
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
|
||
// Forward declarations of the implementations that are switched
// depending on the CPU (the selection is made in galois_create_table).

//#define NO_SIMD // define this to build without SIMD

int sse_unit;

void galois_align16_multiply(
	unsigned char *r1, unsigned char *r2, unsigned int len, int factor);
void galois_align32_multiply(
	unsigned char *r1, unsigned char *r2, unsigned int len, int factor);
void galois_align32avx_multiply(
	unsigned char *r1, unsigned char *r2, unsigned int len, int factor);
void galois_align256_multiply(
	unsigned char *r1, unsigned char *r2, unsigned int len, int factor);

void galois_align32_multiply2(
	unsigned char *src1, unsigned char *src2, unsigned char *dst,
	unsigned int len, int factor1, int factor2);
void galois_align32avx_multiply2(
	unsigned char *src1, unsigned char *src2, unsigned char *dst,
	unsigned int len, int factor1, int factor2);

void galois_altmap_none(unsigned char *data, unsigned int bsize);

// ALTMAP for AVX2 and SSSE3 works in units of 32 bytes
void galois_altmap32_change(unsigned char *data, unsigned int bsize);
void galois_altmap32_return(unsigned char *data, unsigned int bsize);
void checksum16_altmap32(unsigned char *data, unsigned char *hash, int byte_size);
void checksum16_return32(unsigned char *data, unsigned char *hash, int byte_size);

// JIT(SSE2) calculates in units of 256 bytes
void galois_altmap256_change(unsigned char *data, unsigned int bsize);
void galois_altmap256_return(unsigned char *data, unsigned int bsize);
void checksum16_altmap256(unsigned char *data, unsigned char *hash, int byte_size);
void checksum16_return256(unsigned char *data, unsigned char *hash, int byte_size);
|
||
|
||
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
|
||
|
||
#define NW 65536
|
||
#define NWM1 65535
|
||
#define PRIM_POLY 0x1100B
|
||
|
||
// なぜかテーブルは 2バイト整数を使った方が速い
|
||
static unsigned short *galois_log_table = NULL;
|
||
static unsigned short *galois_exp_table;
|
||
|
||
int galois_create_table(void)
|
||
{
|
||
unsigned int j, b;
|
||
|
||
if (galois_log_table != NULL) return 0;
|
||
galois_log_table = _aligned_malloc(sizeof(unsigned short) * NW * 2, 64);
|
||
if (galois_log_table == NULL) return -1;
|
||
galois_exp_table = galois_log_table + NW; // 要素数は 65536個
|
||
|
||
b = 1;
|
||
for (j = 0; j < NWM1; j++){
|
||
galois_log_table[b] = (unsigned short)j;
|
||
galois_exp_table[j] = (unsigned short)b;
|
||
b = b << 1;
|
||
if (b & NW) b ^= PRIM_POLY;
|
||
}
|
||
galois_exp_table[NWM1] = galois_exp_table[0]; // copy for reduction (? mod NWM1)
|
||
|
||
// CPU によって使う関数を変更する
|
||
sse_unit = 16; // 16, 32, 64, 128 のどれでもいい (32のSSSE3は少し速い、GPUが識別するのに注意)
|
||
galois_align_multiply = galois_align16_multiply;
|
||
galois_align_multiply2 = NULL;
|
||
galois_altmap_change = galois_altmap_none;
|
||
galois_altmap_return = galois_altmap_none;
|
||
checksum16_altmap = checksum16;
|
||
checksum16_return = checksum16;
|
||
#ifndef NO_SIMD
|
||
if (cpu_flag & 256){ // AVX2, SSSE3, JIT(SSE2) の並び替えを使わない場合
|
||
// 将来的には AVX-512 などの命令に対応してもいい
|
||
//printf("\nWithout ALTMAP\n");
|
||
//sse_unit = 32;
|
||
} else if (cpu_flag & 16){ // AVX2 対応なら
|
||
//printf("\nUse AVX2 & ALTMAP\n");
|
||
sse_unit = 32; // 32, 64, 128 のどれでもいい
|
||
galois_align_multiply = galois_align32avx_multiply;
|
||
galois_align_multiply2 = galois_align32avx_multiply2;
|
||
galois_altmap_change = galois_altmap32_change;
|
||
galois_altmap_return = galois_altmap32_return;
|
||
checksum16_altmap = checksum16_altmap32;
|
||
checksum16_return = checksum16_return32;
|
||
} else if (cpu_flag & 1){ // SSSE3 対応なら
|
||
//printf("\nUse SSSE3 & ALTMAP\n");
|
||
sse_unit = 32; // 32, 64, 128 のどれでもいい
|
||
galois_align_multiply = galois_align32_multiply;
|
||
galois_align_multiply2 = galois_align32_multiply2;
|
||
galois_altmap_change = galois_altmap32_change;
|
||
galois_altmap_return = galois_altmap32_return;
|
||
checksum16_altmap = checksum16_altmap32;
|
||
checksum16_return = checksum16_return32;
|
||
} else { // SSSE3 が利用できない場合
|
||
if ((cpu_flag & 128) && (jit_alloc() == 0)){ // JIT(SSE2) を使う
|
||
//printf("\nUse JIT(SSE2) & ALTMAP\n");
|
||
sse_unit = 256;
|
||
galois_align_multiply = galois_align256_multiply;
|
||
galois_align_multiply2 = NULL;
|
||
galois_altmap_change = galois_altmap256_change;
|
||
galois_altmap_return = galois_altmap256_return;
|
||
checksum16_altmap = checksum16_altmap256;
|
||
checksum16_return = checksum16_return256;
|
||
}
|
||
}
|
||
#endif
|
||
|
||
return 0;
|
||
}
|
||
|
||
unsigned short galois_multiply(int x, int y)
|
||
{
|
||
int sum;
|
||
|
||
if ((x == 0) || (y == 0)) return 0;
|
||
|
||
sum = galois_log_table[x] + galois_log_table[y]; // result is from 2 to NWM1 * 2
|
||
//if (sum >= NWM1) sum -= NWM1;
|
||
sum = (sum >> 16) + (sum & NWM1); // result is from 0 to NWM1
|
||
return galois_exp_table[sum];
|
||
}
|
||
|
||
// multiply when "y" is a fixed value
|
||
unsigned short galois_multiply_fix(int x, int log_y)
|
||
{
|
||
int sum;
|
||
|
||
if (x == 0) return 0;
|
||
|
||
sum = galois_log_table[x] + log_y; // result is from 2 to NWM1 * 2
|
||
sum = (sum >> 16) + (sum & NWM1); // result is from 0 to NWM1
|
||
return galois_exp_table[sum];
|
||
}
|
||
|
||
unsigned short galois_divide(int x, int y)
|
||
{
|
||
int sum;
|
||
|
||
if (y == 0) return NWM1; // 除算エラー
|
||
if (x == 0) return 0;
|
||
|
||
sum = galois_log_table[x] - galois_log_table[y];
|
||
if (sum < 0) sum += NWM1;
|
||
return galois_exp_table[sum];
|
||
}
|
||
|
||
// ガロア体上での乗数計算、x の y 乗
|
||
unsigned short galois_power(int x, int y)
|
||
{
|
||
unsigned int sum;
|
||
|
||
if (x == 0) return 0; // 0**y = 0
|
||
if (y == 0) return 1; // x**0 = 1
|
||
if (y == 1) return (unsigned short)x; // x**1 = x
|
||
|
||
sum = (unsigned int)(galois_log_table[x]) * (unsigned int)y; // result is from 1 to NWM1 * NWM1
|
||
//sum = sum % NWM1;
|
||
sum = (sum >> 16) + (sum & NWM1); // result is from 1 to NWM1 * 2
|
||
sum = (sum >> 16) + (sum & NWM1); // result is from 0 to NWM1
|
||
return galois_exp_table[sum];
|
||
}
|
||
|
||
// ガロア体上での逆数、1 / x
|
||
unsigned short galois_reciprocal(int x)
|
||
{
|
||
if (x == 0) return NWM1; // 除算エラー
|
||
return galois_exp_table[NWM1 - (int)(galois_log_table[x])];
|
||
}
|
||
|
||
void galois_free_table(void) // テーブルを解放するために追加
|
||
{
|
||
if (galois_log_table != NULL){
|
||
_aligned_free(galois_log_table);
|
||
galois_log_table = NULL;
|
||
#ifndef _WIN64 // 32-bit 版ならインライン・アセンブラを使う
|
||
if (((cpu_flag & 1) == 0) && ((cpu_flag & 128) == 0)) // SSSE3 を使わない場合、MMX の終了処理
|
||
_mm_empty();
|
||
#endif
|
||
// SSSE3 を使わない場合で、JIT(SSE2) を使った場合の終了処理
|
||
if (((cpu_flag & 1) == 0) && ((cpu_flag & 128) != 0))
|
||
jit_free();
|
||
}
|
||
}
|
||
|
||
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
|
||
// MMX functions are based on code by Paul Houle (paulhoule.com) March 22, 2008
|
||
|
||
#ifndef _WIN64 // 32-bit 版ならインライン・アセンブラを使う
|
||
|
||
// Processes block of data a multiple of 8 bytes long using SIMD (mmx) opcodes.
|
||
// The amount of data to process (bsize) must be a non-zero multiple of 8.
|
||
// Paul's original code was modified to calculate each 8-bytes and removed last shift by Yutaka Sawada
|
||
static void DoBlock8(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned int *pMtab)
|
||
{
|
||
__asm {
|
||
push ebp
|
||
mov ebx,bsize ;bytes to process (multiple of 8)
|
||
mov esi,input ;source
|
||
mov edi,output ;destination
|
||
mov ebp,pMtab ;combined multiplication table
|
||
|
||
mov eax,[esi] ;load 1st 8 source bytes
|
||
movd mm4,[esi+4]
|
||
|
||
sub ebx,8 ;reduce last 8-bytes from loop
|
||
add esi,ebx ;point to end of input/output
|
||
add edi,ebx
|
||
neg ebx ;convert byte size to count-up
|
||
|
||
lp8:
|
||
movzx edx,al
|
||
movzx ecx,ah
|
||
shr eax,16
|
||
movd mm0,[ebp+edx*4] ;order is [_][_][_][0]
|
||
movd mm1,[ebp+400h+ecx*4]
|
||
movzx edx,al
|
||
movzx ecx,ah
|
||
movd eax,mm4
|
||
movq mm4,[esi+ebx+8] ;read-ahead next 8 source bytes
|
||
movd mm2,[ebp+edx*4] ;order is [_][_][_][1]
|
||
movzx edx,al
|
||
movq mm5,[edi+ebx]
|
||
movd mm3,[ebp+400h+ecx*4]
|
||
movzx ecx,ah
|
||
shr eax,16
|
||
punpcklwd mm0,[ebp+edx*4] ;order is [_][_][2][0]
|
||
movzx edx,al
|
||
punpcklwd mm1,[ebp+400h+ecx*4]
|
||
movzx ecx,ah
|
||
punpcklwd mm2,[ebp+edx*4] ;order is [_][_][3][1]
|
||
pxor mm1,mm0
|
||
punpcklwd mm3,[ebp+400h+ecx*4]
|
||
pxor mm3,mm2
|
||
movd eax,mm4 ;prepare src bytes 3-0 for next loop
|
||
punpcklwd mm1,mm3 ;order is [3][2][1][0]
|
||
psrlq mm4,32 ;align src bytes 7-4 for next loop
|
||
pxor mm1,mm5
|
||
movq [edi+ebx],mm1
|
||
add ebx,8
|
||
jnz lp8
|
||
|
||
;no need to pre-read in last 8-bytes
|
||
movzx edx,al
|
||
movzx ecx,ah
|
||
shr eax,16
|
||
movd mm0,[ebp+edx*4] ;order is [_][_][_][0]
|
||
movd mm1,[ebp+400h+ecx*4]
|
||
movzx edx,al
|
||
movzx ecx,ah
|
||
movd eax,mm4
|
||
movd mm2,[ebp+edx*4] ;order is [_][_][_][1]
|
||
movzx edx,al
|
||
movq mm5,[edi+ebx]
|
||
movd mm3,[ebp+400h+ecx*4]
|
||
movzx ecx,ah
|
||
shr eax,16
|
||
punpcklwd mm0,[ebp+edx*4] ;order is [_][_][2][0]
|
||
movzx edx,al
|
||
punpcklwd mm1,[ebp+400h+ecx*4]
|
||
movzx ecx,ah
|
||
punpcklwd mm2,[ebp+edx*4] ;order is [_][_][3][1]
|
||
pxor mm1,mm0
|
||
punpcklwd mm3,[ebp+400h+ecx*4]
|
||
pxor mm3,mm2
|
||
punpcklwd mm1,mm3 ;order is [3][2][1][0]
|
||
pxor mm1,mm5
|
||
movq [edi+ebx],mm1
|
||
|
||
pop ebp
|
||
}
|
||
}
|
||
|
||
#endif
|
||
|
||
// calculate multiplication tables instantly
// mtab receives 512 entries:
//   mtab[b]       = factor * b         for a low byte b  (0..255)
//   mtab[256 + b] = factor * (b << 8)  for a high byte b (0..255)
// so the product of factor and a 16-bit word is mtab[low] ^ mtab[256 + high].
// factor is expected to be a field element (0..65535).
// All arithmetic is unsigned: the earlier "(value << n) >> 31" masks shifted
// signed integers into the sign bit, which is undefined behaviour in C.
static void create_two_table(unsigned int *mtab, int factor){
	unsigned int shift_table[8], f, span, m;
	int j, k;

	f = (unsigned int)factor;
	for (j = 0; j < 2; j++){
		// shift_table[k] = factor * 2 ** (8 * j + k)
		for (k = 0; k < 8; k++){
			shift_table[k] = f;
			// multiply by 2; reduce with the polynomial when bit 15 was set
			f = (f << 1) ^ ((0u - ((f >> 15) & 1u)) & 0x1100Bu);
		}

		// The product is linear over XOR: the entries that have bit k as their
		// highest bit are the first 2 ** k entries XORed with shift_table[k].
		mtab[0] = 0;
		for (k = 0; k < 8; k++){
			span = 1u << k;
			for (m = 0; m < span; m++)
				mtab[span + m] = mtab[m] ^ shift_table[k];
		}

		mtab += 256;	// second pass fills the table of the high byte
	}
}
|
||
|
||
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
|
||
// The method of using SSSE3 is based on Plank's paper;
|
||
// "Screaming Fast Galois Field Arithmetic Using Intel SIMD Instructions".
|
||
|
||
/*
|
||
static void create_eight_table_c(unsigned char *mtab, int factor)
|
||
{
|
||
int shift_table[16], i, sum;
|
||
|
||
// factor * 2の乗数を計算する
|
||
shift_table[0] = factor; // factor * 1
|
||
for (i = 1; i < 16; i++){
|
||
// if (factor & 0x8000){
|
||
// factor <<= 1;
|
||
// factor ^= 0x1100B;
|
||
// } else {
|
||
// factor <<= 1;
|
||
// }
|
||
factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B);
|
||
shift_table[i] = factor; // factor * (2**i)
|
||
}
|
||
|
||
for (i = 0; i < 16; i += 4){ // 4-bit ごとに計算する
|
||
mtab[i * 8 ] = 0;
|
||
mtab[i * 8 + 16] = 0;
|
||
sum = shift_table[i];
|
||
mtab[i * 8 + 1] = (unsigned char)sum; // lower 8-bit
|
||
mtab[i * 8 + 17] = (unsigned char)(sum >> 8); // higher 8-bit
|
||
sum = shift_table[i + 1];
|
||
mtab[i * 8 + 2] = (unsigned char)sum;
|
||
mtab[i * 8 + 18] = (unsigned char)(sum >> 8);
|
||
sum = sum ^ shift_table[i];
|
||
mtab[i * 8 + 3] = (unsigned char)sum;
|
||
mtab[i * 8 + 19] = (unsigned char)(sum >> 8);
|
||
sum = shift_table[i + 2];
|
||
mtab[i * 8 + 4] = (unsigned char)sum;
|
||
mtab[i * 8 + 20] = (unsigned char)(sum >> 8);
|
||
sum = sum ^ shift_table[i];
|
||
mtab[i * 8 + 5] = (unsigned char)sum;
|
||
mtab[i * 8 + 21] = (unsigned char)(sum >> 8);
|
||
sum = shift_table[i + 2] ^ shift_table[i + 1];
|
||
mtab[i * 8 + 6] = (unsigned char)sum;
|
||
mtab[i * 8 + 22] = (unsigned char)(sum >> 8);
|
||
sum = sum ^ shift_table[i];
|
||
mtab[i * 8 + 7] = (unsigned char)sum;
|
||
mtab[i * 8 + 23] = (unsigned char)(sum >> 8);
|
||
sum = shift_table[i + 3];
|
||
mtab[i * 8 + 8] = (unsigned char)sum;
|
||
mtab[i * 8 + 24] = (unsigned char)(sum >> 8);
|
||
sum = sum ^ shift_table[i];
|
||
mtab[i * 8 + 9] = (unsigned char)sum;
|
||
mtab[i * 8 + 25] = (unsigned char)(sum >> 8);
|
||
sum = shift_table[i + 3] ^ shift_table[i + 1];
|
||
mtab[i * 8 + 10] = (unsigned char)sum;
|
||
mtab[i * 8 + 26] = (unsigned char)(sum >> 8);
|
||
sum = sum ^ shift_table[i];
|
||
mtab[i * 8 + 11] = (unsigned char)sum;
|
||
mtab[i * 8 + 27] = (unsigned char)(sum >> 8);
|
||
sum = shift_table[i + 3] ^ shift_table[i + 2];
|
||
mtab[i * 8 + 12] = (unsigned char)sum;
|
||
mtab[i * 8 + 28] = (unsigned char)(sum >> 8);
|
||
sum = sum ^ shift_table[i];
|
||
mtab[i * 8 + 13] = (unsigned char)sum;
|
||
mtab[i * 8 + 29] = (unsigned char)(sum >> 8);
|
||
sum = shift_table[i + 3] ^ shift_table[i + 2] ^ shift_table[i + 1];
|
||
mtab[i * 8 + 14] = (unsigned char)sum;
|
||
mtab[i * 8 + 30] = (unsigned char)(sum >> 8);
|
||
sum = sum ^ shift_table[i];
|
||
mtab[i * 8 + 15] = (unsigned char)sum;
|
||
mtab[i * 8 + 31] = (unsigned char)(sum >> 8);
|
||
}
|
||
}
|
||
*/
|
||
|
||
#ifndef _WIN64 // 32-bit 版ならインライン・アセンブラを使う
|
||
|
||
// Tables for split four combined multiplication (32-bit build, inline assembler).
// Writes eight 16-byte tables (128 bytes) to mtab with aligned stores (movdqa),
// two tables per pass of the loop, four passes:
//   table 2n     : lower 8 bits of the 16 products built in pass n
//   table 2n + 1 : upper 8 bits of the same 16 products
// Within a pass entry v (0..15) is the XOR of factor*1, *2, *4, *8 selected by
// the bits of v; between passes factor is doubled four times in total, so
// pass n works on factor * 16**n.
// Doubling: eax is shifted left by one and XORed with 0x1100B when bit 15 of
// ax was set before the shift.
|
||
static void create_eight_table(unsigned char *mtab, int factor)
|
||
{
|
||
__asm { // This implementation requires SSE2
|
||
mov eax, factor
|
||
mov edx, mtab
|
||
// ecx counts from -128 up to 0 in steps of 32: four passes
mov ecx, -128
|
||
|
||
; create mask for 8-bit
|
||
pcmpeqw xmm7, xmm7 ; 0xFFFF *8
|
||
psrlw xmm7, 8 ; 0x00FF *8
|
||
|
||
lp32:
|
||
; factor * 1, *2, *4, *8
|
||
movd xmm0, eax ; [_][_][_][_][_][_][_][1]
|
||
pxor xmm1, xmm1
|
||
|
||
// double eax: ebx becomes all ones when bit 15 of ax is set, else zero
movsx ebx, ax
|
||
sar ebx, 31
|
||
shl eax, 1
|
||
and ebx, 0x1100B
|
||
xor eax, ebx
|
||
pinsrw xmm1, eax, 1 ; [_][_][_][_][_][_][2][_]
|
||
pxor xmm2, xmm2
|
||
|
||
movsx ebx, ax
|
||
sar ebx, 31
|
||
shl eax, 1
|
||
and ebx, 0x1100B
|
||
xor eax, ebx
|
||
pinsrw xmm2, eax, 4 ; [_][_][_][4][_][_][_][_]
|
||
punpcklwd xmm1, xmm1 ; [_][_][_][_][2][2][_][_]
|
||
|
||
movsx ebx, ax
|
||
sar ebx, 31
|
||
shl eax, 1
|
||
and ebx, 0x1100B
|
||
xor eax, ebx
|
||
movd xmm3, eax ; [_][_][_][_][_][_][_][8]
|
||
|
||
// combine the four multiples into the 16 products (one 16-bit word each)
pshuflw xmm0, xmm0, 17 ; [_][_][_][_][1][_][1][_]
|
||
punpcklwd xmm3, xmm3 ; [_][_][_][_][_][_][8][8]
|
||
pxor xmm0, xmm1 ; [_][_][_][_][3][2][1][_]
|
||
pshufhw xmm2, xmm2, 0 ; [4][4][4][4][_][_][_][_]
|
||
punpcklqdq xmm0, xmm0 ; [3][2][1][_][3][2][1][_]
|
||
pshufd xmm3, xmm3, 0 ; [8][8][8][8][8][8][8][8]
|
||
pxor xmm2, xmm0 ; [7][6][5][4][3][2][1][_]
|
||
pxor xmm3, xmm2 ; [15][14][13][12][11][10][9][8]
|
||
|
||
// split the 16 words into a table of low bytes and a table of high bytes
movdqa xmm0, xmm2
|
||
movdqa xmm1, xmm3
|
||
pand xmm0, xmm7
|
||
pand xmm1, xmm7
|
||
packuswb xmm0, xmm1 ; lower 8-bit * 16
|
||
psrlw xmm2, 8
|
||
psrlw xmm3, 8
|
||
packuswb xmm2, xmm3 ; higher 8-bit * 16
|
||
|
||
// aligned stores: byte offsets 32 * pass and 32 * pass + 16 of mtab
movdqa [edx+128+ecx], xmm0
|
||
movdqa [edx+144+ecx], xmm2
|
||
|
||
; for next loop
|
||
movsx ebx, ax ; move with sign of word
|
||
sar ebx, 31
|
||
shl eax, 1
|
||
and ebx, 0x1100B
|
||
xor eax, ebx
|
||
|
||
add ecx, 32
|
||
jnz lp32
|
||
}
|
||
}
|
||
|
||
// VC2008 は SSSE3 をインライン・アセンブラで使える
|
||
// Address (input) does not need be 16-byte aligned
|
||
static void gf16_ssse3_block16u(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
|
||
{
|
||
__asm {
|
||
mov ecx, input ; source
|
||
mov edx, output ; destination
|
||
mov eax, bsize ; bytes to process (multiple of 16)
|
||
mov ebx, table ; multiplication table
|
||
|
||
; create mask for 8 entries
|
||
pcmpeqw xmm7, xmm7 ; 0xFFFF *8
|
||
psrlw xmm7, 12 ; 0x000F *8
|
||
|
||
add ecx, eax ; point to end of input/output
|
||
add edx, eax
|
||
neg eax ; convert byte size to count-up
|
||
|
||
lp16:
|
||
movdqu xmm0, [ecx+eax] ; read source 16-bytes
|
||
movdqa xmm2, [edx+eax]
|
||
|
||
movdqa xmm3, [ebx] ; low table
|
||
movdqa xmm4, [ebx+16] ; high table
|
||
movdqa xmm1, xmm0 ; copy source
|
||
psrlw xmm0, 4 ; prepare next 4-bit
|
||
pand xmm1, xmm7 ; src & 0x000F
|
||
pshufb xmm3, xmm1 ; table look-up
|
||
psllw xmm1, 8 ; shift 8-bit for higher table
|
||
pshufb xmm4, xmm1
|
||
movdqa xmm5, [ebx+32] ; low table
|
||
movdqa xmm6, [ebx+48] ; high table
|
||
pxor xmm3, xmm4 ; combine high and low
|
||
pxor xmm2, xmm3
|
||
|
||
movdqa xmm1, xmm0 ; copy source
|
||
psrlw xmm0, 4 ; prepare next 4-bit
|
||
pand xmm1, xmm7 ; (src >> 4) & 0x000F
|
||
pshufb xmm5, xmm1 ; table look-up
|
||
psllw xmm1, 8 ; shift 8-bit for higher table
|
||
pshufb xmm6, xmm1
|
||
movdqa xmm3, [ebx+64] ; low table
|
||
movdqa xmm4, [ebx+80] ; high table
|
||
pxor xmm5, xmm6 ; combine high and low
|
||
pxor xmm2, xmm5
|
||
|
||
movdqa xmm1, xmm0 ; copy source
|
||
psrlw xmm0, 4 ; prepare next 4-bit
|
||
pand xmm1, xmm7 ; (src >> 8) & 0x000F
|
||
pshufb xmm3, xmm1 ; table look-up
|
||
psllw xmm1, 8 ; shift 8-bit for higher table
|
||
pshufb xmm4, xmm1
|
||
movdqa xmm5, [ebx+96] ; low table
|
||
movdqa xmm6, [ebx+112] ; high table
|
||
pxor xmm3, xmm4 ; combine high and low
|
||
pxor xmm2, xmm3
|
||
|
||
pshufb xmm5, xmm0 ; table look-up
|
||
psllw xmm0, 8 ; shift 8-bit for higher table
|
||
pshufb xmm6, xmm0
|
||
pxor xmm5, xmm6 ; combine high and low
|
||
pxor xmm2, xmm5
|
||
|
||
movdqa [edx+eax], xmm2
|
||
|
||
add eax, 16
|
||
jnz lp16
|
||
}
|
||
}
|
||
|
||
// その場で 32バイトごとに並び替えて、計算後に戻す方法(50% faster than 16-byte version)
|
||
// Address (input) does not need be 16-byte aligned
|
||
static void gf16_ssse3_block32u(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
|
||
{
|
||
__asm {
|
||
mov ecx, input ; source
|
||
mov edx, output ; destination
|
||
mov eax, bsize ; bytes to process (multiple of 32)
|
||
mov ebx, table ; multiplication table
|
||
|
||
; create mask for 16 entries
|
||
pcmpeqw xmm7, xmm7 ; 0xFFFF *8
|
||
pcmpeqw xmm6, xmm6 ; 0xFFFF *8
|
||
psrlw xmm7, 12 ; 0x000F *8
|
||
psrlw xmm6, 8 ; 0x00FF *8
|
||
packuswb xmm7, xmm7 ; 0x0F *16
|
||
|
||
add ecx, eax ; point to end of input/output
|
||
add edx, eax
|
||
neg eax ; convert byte size to count-up
|
||
|
||
lp32:
|
||
movdqu xmm0, [ecx+eax ] ; read source 32-bytes
|
||
movdqu xmm2, [ecx+eax+16]
|
||
movdqa xmm1, xmm0 ; copy source
|
||
movdqa xmm3, xmm2
|
||
pand xmm0, xmm6 ; erase higher byte
|
||
pand xmm2, xmm6
|
||
psrlw xmm1, 8 ; move higher byte to lower
|
||
psrlw xmm3, 8
|
||
packuswb xmm0, xmm2 ; select lower byte of each word
|
||
packuswb xmm1, xmm3 ; select higher byte of each word
|
||
|
||
movdqa xmm4, [ebx] ; low table
|
||
movdqa xmm5, [ebx+16] ; high table
|
||
movdqa xmm3, xmm0 ; copy source
|
||
psrlw xmm0, 4 ; prepare next 4-bit
|
||
pand xmm3, xmm7 ; src & 0x0F
|
||
pand xmm0, xmm7 ; (src >> 4) & 0x0F
|
||
pshufb xmm4, xmm3 ; table look-up
|
||
pshufb xmm5, xmm3
|
||
|
||
movdqa xmm2, [ebx+32] ; low table
|
||
movdqa xmm3, [ebx+48] ; high table
|
||
pshufb xmm2, xmm0 ; table look-up
|
||
pshufb xmm3, xmm0
|
||
pxor xmm4, xmm2 ; combine result
|
||
pxor xmm5, xmm3
|
||
|
||
movdqa xmm2, [ebx+64] ; low table
|
||
movdqa xmm3, [ebx+80] ; high table
|
||
movdqa xmm0, xmm1 ; copy source
|
||
psrlw xmm1, 4 ; prepare next 4-bit
|
||
pand xmm0, xmm7 ; src & 0x0F
|
||
pand xmm1, xmm7 ; (src >> 4) & 0x0F
|
||
pshufb xmm2, xmm0 ; table look-up
|
||
pshufb xmm3, xmm0
|
||
pxor xmm4, xmm2 ; combine result
|
||
pxor xmm5, xmm3
|
||
|
||
movdqa xmm2, [ebx+96] ; low table
|
||
movdqa xmm3, [ebx+112] ; high table
|
||
pshufb xmm2, xmm1 ; table look-up
|
||
pshufb xmm3, xmm1
|
||
pxor xmm4, xmm2 ; combine result
|
||
pxor xmm5, xmm3
|
||
|
||
movdqa xmm0, [edx+eax] ; read dest 32-bytes
|
||
movdqa xmm1, [edx+eax+16]
|
||
movdqa xmm3, xmm4 ; copy result
|
||
punpcklbw xmm3, xmm5 ; interleave lower and higher bytes
|
||
punpckhbw xmm4, xmm5
|
||
pxor xmm0, xmm3
|
||
pxor xmm1, xmm4
|
||
movdqa [edx+eax], xmm0 ; write dest 32-bytes
|
||
movdqa [edx+eax+16], xmm1
|
||
|
||
add eax, 32
|
||
jnz lp32
|
||
}
|
||
}
|
||
|
||
// 先に 32バイトごとに並び替えてあるデータを扱う方法
|
||
static void gf16_ssse3_block32_altmap(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
|
||
{
|
||
__asm {
|
||
mov ecx, input ; source
|
||
mov edx, output ; destination
|
||
mov eax, bsize ; bytes to process (multiple of 32)
|
||
mov ebx, table ; multiplication table
|
||
|
||
; create mask for 16 entries
|
||
pcmpeqw xmm7, xmm7 ; 0xFFFF *8
|
||
psrlw xmm7, 12 ; 0x000F *8
|
||
packuswb xmm7, xmm7 ; 0x0F *16
|
||
|
||
add ecx, eax ; point to end of input/output
|
||
add edx, eax
|
||
neg eax ; convert byte size to count-up
|
||
|
||
lp32:
|
||
movdqa xmm0, [ecx+eax] ; read source 32-bytes
|
||
movdqa xmm1, [ecx+eax+16]
|
||
|
||
movdqa xmm4, [ebx] ; low table
|
||
movdqa xmm5, [ebx+16] ; high table
|
||
movdqa xmm3, xmm0 ; copy source
|
||
psrlw xmm0, 4 ; prepare next 4-bit
|
||
pand xmm3, xmm7 ; src & 0x0F
|
||
pand xmm0, xmm7 ; (src >> 4) & 0x0F
|
||
pshufb xmm4, xmm3 ; table look-up
|
||
pshufb xmm5, xmm3
|
||
|
||
movdqa xmm2, [ebx+32] ; low table
|
||
movdqa xmm3, [ebx+48] ; high table
|
||
pshufb xmm2, xmm0 ; table look-up
|
||
pshufb xmm3, xmm0
|
||
pxor xmm2, xmm4 ; combine result
|
||
pxor xmm3, xmm5
|
||
|
||
movdqa xmm4, [ebx+64] ; low table
|
||
movdqa xmm5, [ebx+80] ; high table
|
||
movdqa xmm0, xmm1 ; copy source
|
||
psrlw xmm0, 4 ; prepare next 4-bit
|
||
pand xmm1, xmm7 ; src & 0x0F
|
||
pand xmm0, xmm7 ; (src >> 4) & 0x0F
|
||
pshufb xmm4, xmm1 ; table look-up
|
||
pshufb xmm5, xmm1
|
||
pxor xmm4, xmm2 ; combine result
|
||
pxor xmm5, xmm3
|
||
|
||
movdqa xmm2, [ebx+96] ; low table
|
||
movdqa xmm3, [ebx+112] ; high table
|
||
pshufb xmm2, xmm0 ; table look-up
|
||
pshufb xmm3, xmm0
|
||
|
||
movdqa xmm0, [edx+eax] ; read dest 32-bytes
|
||
movdqa xmm1, [edx+eax+16]
|
||
pxor xmm4, xmm2 ; combine result
|
||
pxor xmm5, xmm3
|
||
pxor xmm4, xmm0
|
||
pxor xmm5, xmm1
|
||
movdqa [edx+eax], xmm4 ; write dest 32-bytes
|
||
movdqa [edx+eax+16], xmm5
|
||
|
||
add eax, 32
|
||
jnz lp32
|
||
}
|
||
}
|
||
|
||
#else // 64-bit 版ではインライン・アセンブラを使えない
|
||
// (__m128i *) で逐次ポインターをキャスト変換するよりも、
|
||
// 先に __m128i* で定義しておいた方が、連続した領域へのアクセス最適化がうまくいく?
|
||
// ほとんど変わらない気がする(むしろ遅い?)・・・コンパイラ次第なのかも
|
||
|
||
// Multiply a field element by 2: shift left and reduce with the polynomial
// 0x1100B when bit 15 was set. Unsigned arithmetic only; the earlier
// "(factor << 16) >> 31" mask shifted a signed int into its sign bit, which is
// undefined behaviour in C.
static unsigned int gf16_mul2(unsigned int f)
{
	return (f << 1) ^ ((0u - ((f >> 15) & 1u)) & 0x1100Bu);
}

// tables for split four combined multiplication (intrinsics version, requires SSE2)
// Writes eight 16-byte tables (128 bytes) to mtab, which must be 16-byte aligned.
// For each 4-bit group n (0 = lowest) of a 16-bit word:
//   table 2n     : lower 8 bits of factor * (v << (4 * n)) for v = 0..15
//   table 2n + 1 : upper 8 bits of the same products
// factor is expected to be a field element (0..65535).
static void create_eight_table(unsigned char *mtab, int factor)
{
	__m128i *tbl = (__m128i *)mtab;
	__m128i mask, first8, last8, low_bytes, high_bytes;
	unsigned int f1, f2, f4, f8;
	int group;

	mask = _mm_set1_epi16(0x00FF);	// keeps the lower byte of each word

	f1 = (unsigned int)factor;
	for (group = 0; group < 4; group++){
		// factor * 1, * 2, * 4, * 8 for this 4-bit group
		f2 = gf16_mul2(f1);
		f4 = gf16_mul2(f2);
		f8 = gf16_mul2(f4);

		// products of 0..7, one 16-bit word each (last argument is word 0)
		first8 = _mm_set_epi16((short)(f4 ^ f2 ^ f1), (short)(f4 ^ f2), (short)(f4 ^ f1), (short)f4,
				(short)(f2 ^ f1), (short)f2, (short)f1, 0);
		// products of 8..15 are the same with factor * 8 added
		last8 = _mm_xor_si128(first8, _mm_set1_epi16((short)f8));

		// every word is 0..255 after the mask / shift, so the pack never saturates
		low_bytes = _mm_packus_epi16(_mm_and_si128(first8, mask), _mm_and_si128(last8, mask));
		high_bytes = _mm_packus_epi16(_mm_srli_epi16(first8, 8), _mm_srli_epi16(last8, 8));

		_mm_store_si128(tbl, low_bytes);	// lower 8-bit * 16
		_mm_store_si128(tbl + 1, high_bytes);	// higher 8-bit * 16
		tbl += 2;

		f1 = gf16_mul2(f8);	// factor * 16 for the next group
	}
}
|
||
|
||
// Calculates 16 bytes at a time; this uses _mm_shuffle_epi8 inefficiently.
// Multiplies every 16-bit word of input by the constant described by table and
// XORs the product into output.
//   input  : source, does not need to be 16-byte aligned
//   output : destination, must be 16-byte aligned
//   bsize  : bytes to process, a multiple of 16 (0 does nothing)
//   table  : eight 16-byte aligned tables; for 4-bit group k (lowest first)
//            table 2k gives the low byte and table 2k + 1 the high byte of the
//            partial product. Entry 0 of every table must be 0 because the
//            unused byte of each word looks up entry 0.
//            Presumably the table comes from create_eight_table.
// The target attribute lets GCC / Clang compile only this function for SSSE3,
// so the file does not need a global SSSE3 switch (which would also affect the
// code paths that are selected at run time for older CPUs). MSVC ignores it.
#if defined(__GNUC__) || defined(__clang__)
__attribute__((target("ssse3")))
#endif
static void gf16_ssse3_block16u(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
{
	const __m128i *src = (const __m128i *)input;
	const __m128i *tbl = (const __m128i *)table;
	__m128i *dst = (__m128i *)output;
	__m128i nibble_mask, words, acc, index, low, high;
	int k;

	nibble_mask = _mm_set1_epi16(0x000F);	// lowest 4 bits of each word

	for (; bsize != 0; bsize -= 16){
		words = _mm_loadu_si128(src);	// read source 16-bytes
		acc = _mm_load_si128(dst);

		for (k = 0; k < 4; k++){
			// low byte of each word: 4-bit value, high byte: 0
			index = _mm_and_si128(words, nibble_mask);
			low = _mm_shuffle_epi8(_mm_load_si128(tbl + 2 * k), index);
			// move the 4-bit value to the high byte for the higher table
			high = _mm_shuffle_epi8(_mm_load_si128(tbl + 2 * k + 1), _mm_slli_epi16(index, 8));
			acc = _mm_xor_si128(acc, _mm_xor_si128(low, high));	// combine high and low
			words = _mm_srli_epi16(words, 4);	// prepare next 4-bit
		}

		_mm_store_si128(dst, acc);
		src++;
		dst++;
	}
}
|
||
|
||
// Multiply-accumulate for plain (non-ALTMAP) 16-bit words: the table product
// of every source word is XORed into output.
// Each 32-byte chunk is split in registers into 16 low bytes and 16 high bytes,
// looked up one nibble at a time, and interleaved back before the XOR
// (50% faster than the 16-byte version).
// For some reason this is faster without asm!? It works in both 32-bit and 64-bit builds.
// Using no more than 8 xmm registers is faster and more stable in the 32-bit build.
//
// input  : source bytes; does not need to be 16-byte aligned
// output : destination, read-modify-written with aligned accesses (must be 16-byte aligned)
// bsize  : number of bytes; expected to be a multiple of 32, a shorter tail is left untouched
// table  : eight 16-byte look-up tables, 16-byte aligned. Tables 2k and 2k+1 are indexed
//          by nibble k of each source word and supply the low / high byte of the result
//          (presumably the products of one GF(2^16) constant with i << 4k).
static void gf16_ssse3_block32u(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
{
	__m128i tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7;
	__m128i byte_mask, nib_mask;
	__m128i src_a, src_b, lo, hi, idx, res_lo, res_hi, dst_a, dst_b;

	// copy tables to local
	tbl0 = _mm_load_si128((__m128i *)table);
	tbl1 = _mm_load_si128((__m128i *)table + 1);
	tbl2 = _mm_load_si128((__m128i *)table + 2);
	tbl3 = _mm_load_si128((__m128i *)table + 3);
	tbl4 = _mm_load_si128((__m128i *)table + 4);
	tbl5 = _mm_load_si128((__m128i *)table + 5);
	tbl6 = _mm_load_si128((__m128i *)table + 6);
	tbl7 = _mm_load_si128((__m128i *)table + 7);

	byte_mask = _mm_set1_epi16(0x00FF);	// 0x00FF *8
	nib_mask = _mm_set1_epi8(0x0F);	// 0x0F *16

	// ">= 32" instead of "!= 0": the counter is unsigned, so a size that is not a
	// multiple of 32 would otherwise wrap around and run past both buffers.
	while (bsize >= 32){
		src_a = _mm_loadu_si128((__m128i *)input);	// read source 32-bytes (unaligned)
		src_b = _mm_loadu_si128((__m128i *)input + 1);

		// gather the low bytes and the high bytes of the 16 words
		lo = _mm_packus_epi16(_mm_and_si128(src_a, byte_mask), _mm_and_si128(src_b, byte_mask));
		hi = _mm_packus_epi16(_mm_srli_epi16(src_a, 8), _mm_srli_epi16(src_b, 8));

		// nibble 0: low byte & 0x0F
		idx = _mm_and_si128(lo, nib_mask);
		res_lo = _mm_shuffle_epi8(tbl0, idx);
		res_hi = _mm_shuffle_epi8(tbl1, idx);

		// nibble 1: (low byte >> 4) & 0x0F
		idx = _mm_and_si128(_mm_srli_epi16(lo, 4), nib_mask);
		res_lo = _mm_xor_si128(res_lo, _mm_shuffle_epi8(tbl2, idx));
		res_hi = _mm_xor_si128(res_hi, _mm_shuffle_epi8(tbl3, idx));

		// nibble 2: high byte & 0x0F
		idx = _mm_and_si128(hi, nib_mask);
		res_lo = _mm_xor_si128(res_lo, _mm_shuffle_epi8(tbl4, idx));
		res_hi = _mm_xor_si128(res_hi, _mm_shuffle_epi8(tbl5, idx));

		// nibble 3: (high byte >> 4) & 0x0F
		idx = _mm_and_si128(_mm_srli_epi16(hi, 4), nib_mask);
		res_lo = _mm_xor_si128(res_lo, _mm_shuffle_epi8(tbl6, idx));
		res_hi = _mm_xor_si128(res_hi, _mm_shuffle_epi8(tbl7, idx));

		// interleave low and high result bytes back into words and accumulate
		dst_a = _mm_load_si128((__m128i *)output);	// read dest 32-bytes
		dst_b = _mm_load_si128((__m128i *)output + 1);
		dst_a = _mm_xor_si128(dst_a, _mm_unpacklo_epi8(res_lo, res_hi));
		dst_b = _mm_xor_si128(dst_b, _mm_unpackhi_epi8(res_lo, res_hi));
		_mm_store_si128((__m128i *)output, dst_a);	// write dest 32-bytes
		_mm_store_si128((__m128i *)output + 1, dst_b);

		input += 32;
		output += 32;
		bsize -= 32;
	}
}
|
||
|
||
// Multiply-accumulate for ALTMAP-ed data: the table product of the source is
// XORed into output, 32 bytes per iteration.
// Loading the tables into xmm registers is slightly faster in the 64-bit build.
//
// In every 32-byte chunk the first 16 source bytes index tables 0-3 (low nibble,
// then high nibble) and the second 16 bytes index tables 4-7; presumably the
// chunk holds the low bytes of 16 words followed by their high bytes.
// Even-numbered tables accumulate into the first 16 output bytes, odd-numbered
// tables into the second 16.
//
// input  : source, read with aligned loads (must be 16-byte aligned)
// output : destination, read-modify-written with aligned accesses (must be 16-byte aligned)
// bsize  : number of bytes; expected to be a multiple of 32, a shorter tail is left untouched
// table  : eight 16-byte look-up tables, 16-byte aligned
static void gf16_ssse3_block32_altmap(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
{
	__m128i tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7;
	__m128i nib_mask, lo, hi, idx, res_lo, res_hi;

	// copy tables to local
	tbl0 = _mm_load_si128((__m128i *)table);
	tbl1 = _mm_load_si128((__m128i *)table + 1);
	tbl2 = _mm_load_si128((__m128i *)table + 2);
	tbl3 = _mm_load_si128((__m128i *)table + 3);
	tbl4 = _mm_load_si128((__m128i *)table + 4);
	tbl5 = _mm_load_si128((__m128i *)table + 5);
	tbl6 = _mm_load_si128((__m128i *)table + 6);
	tbl7 = _mm_load_si128((__m128i *)table + 7);

	nib_mask = _mm_set1_epi8(0x0F);	// 0x0F *16

	// ">= 32" instead of "!= 0": the counter is unsigned, so a size that is not a
	// multiple of 32 would otherwise wrap around and run past both buffers.
	while (bsize >= 32){
		lo = _mm_load_si128((__m128i *)input);	// read source 32-bytes
		hi = _mm_load_si128((__m128i *)input + 1);

		// first 16 bytes, low nibble
		idx = _mm_and_si128(lo, nib_mask);
		res_lo = _mm_shuffle_epi8(tbl0, idx);
		res_hi = _mm_shuffle_epi8(tbl1, idx);

		// first 16 bytes, high nibble
		idx = _mm_and_si128(_mm_srli_epi16(lo, 4), nib_mask);
		res_lo = _mm_xor_si128(res_lo, _mm_shuffle_epi8(tbl2, idx));
		res_hi = _mm_xor_si128(res_hi, _mm_shuffle_epi8(tbl3, idx));

		// second 16 bytes, low nibble
		idx = _mm_and_si128(hi, nib_mask);
		res_lo = _mm_xor_si128(res_lo, _mm_shuffle_epi8(tbl4, idx));
		res_hi = _mm_xor_si128(res_hi, _mm_shuffle_epi8(tbl5, idx));

		// second 16 bytes, high nibble
		idx = _mm_and_si128(_mm_srli_epi16(hi, 4), nib_mask);
		res_lo = _mm_xor_si128(res_lo, _mm_shuffle_epi8(tbl6, idx));
		res_hi = _mm_xor_si128(res_hi, _mm_shuffle_epi8(tbl7, idx));

		// accumulate into the destination
		res_lo = _mm_xor_si128(res_lo, _mm_load_si128((__m128i *)output));
		res_hi = _mm_xor_si128(res_hi, _mm_load_si128((__m128i *)output + 1));
		_mm_store_si128((__m128i *)output, res_lo);	// write dest 32-bytes
		_mm_store_si128((__m128i *)output + 1, res_hi);

		input += 32;
		output += 32;
		bsize -= 32;
	}
}
|
||
|
||
/*
|
||
static void gf16_ssse3_block32_altmap(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
|
||
{
|
||
__m128i *src, *dst, *tbl;
|
||
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7;
|
||
|
||
src = (__m128i *)input;
|
||
dst = (__m128i *)output;
|
||
tbl = (__m128i *)table;
|
||
|
||
// create mask for 16 entries
|
||
xmm7 = _mm_setzero_si128();
|
||
xmm7 = _mm_cmpeq_epi16(xmm7, xmm7); // 0xFFFF *8
|
||
xmm7 = _mm_srli_epi16(xmm7, 12); // 0x000F *8
|
||
xmm7 = _mm_packus_epi16(xmm7, xmm7); // 0x0F *16
|
||
|
||
while (bsize != 0){
|
||
xmm0 = _mm_load_si128(src); // read source 32-bytes
|
||
xmm1 = _mm_load_si128(src + 1);
|
||
|
||
xmm3 = _mm_load_si128(&xmm0); // copy source
|
||
xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit
|
||
xmm3 = _mm_and_si128(xmm3, xmm7); // src & 0x0F
|
||
xmm0 = _mm_and_si128(xmm0, xmm7); // (src >> 4) & 0x0F
|
||
|
||
xmm4 = _mm_load_si128(tbl); // load tables
|
||
xmm5 = _mm_load_si128(tbl + 1);
|
||
xmm4 = _mm_shuffle_epi8(xmm4, xmm3); // table look-up
|
||
xmm5 = _mm_shuffle_epi8(xmm5, xmm3);
|
||
|
||
xmm2 = _mm_load_si128(tbl + 2); // load tables
|
||
xmm3 = _mm_load_si128(tbl + 3);
|
||
xmm2 = _mm_shuffle_epi8(xmm2, xmm0); // table look-up
|
||
xmm3 = _mm_shuffle_epi8(xmm3, xmm0);
|
||
xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result
|
||
xmm5 = _mm_xor_si128(xmm5, xmm3);
|
||
|
||
xmm0 = _mm_load_si128(&xmm1); // copy source
|
||
xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit
|
||
xmm1 = _mm_and_si128(xmm1, xmm7); // src & 0x0F
|
||
xmm0 = _mm_and_si128(xmm0, xmm7); // (src >> 4) & 0x0F
|
||
|
||
xmm2 = _mm_load_si128(tbl + 4); // load tables
|
||
xmm3 = _mm_load_si128(tbl + 5);
|
||
xmm2 = _mm_shuffle_epi8(xmm2, xmm1); // table look-up
|
||
xmm3 = _mm_shuffle_epi8(xmm3, xmm1);
|
||
xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result
|
||
xmm5 = _mm_xor_si128(xmm5, xmm3);
|
||
|
||
xmm2 = _mm_load_si128(tbl + 6); // load tables
|
||
xmm3 = _mm_load_si128(tbl + 7);
|
||
xmm2 = _mm_shuffle_epi8(xmm2, xmm0); // table look-up
|
||
xmm3 = _mm_shuffle_epi8(xmm3, xmm0);
|
||
|
||
xmm0 = _mm_load_si128(dst); // read dest 32-bytes
|
||
xmm1 = _mm_load_si128(dst + 1);
|
||
xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result
|
||
xmm5 = _mm_xor_si128(xmm5, xmm3);
|
||
xmm4 = _mm_xor_si128(xmm4, xmm0);
|
||
xmm5 = _mm_xor_si128(xmm5, xmm1);
|
||
_mm_store_si128(dst, xmm4); // write dest 32-bytes
|
||
_mm_store_si128(dst + 1, xmm5);
|
||
|
||
src += 2;
|
||
dst += 2;
|
||
bsize -= 32;
|
||
}
|
||
}
|
||
*/
|
||
|
||
#endif
|
||
|
||
// Multiply only, for inverse-matrix computation (the result overwrites the
// data, it is not XOR-accumulated).
// Works on plain 16-bit words, 16 bytes (8 words) per iteration.
//
// data  : buffer updated in place with aligned accesses (must be 16-byte aligned)
// bsize : number of bytes; expected to be a multiple of 16, a shorter tail is left untouched
// table : eight 16-byte look-up tables, 16-byte aligned. Tables 2k / 2k+1 give the
//         low / high result byte for nibble k of each word. The byte lane that is
//         not being looked up always selects entry 0, so entry 0 of every table
//         must be zero for the result to be a pure product.
static void gf16_ssse3_block16s(unsigned char *data, unsigned int bsize, unsigned char *table)
{
	__m128i tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7;
	__m128i mask, src, idx, dest;

	// copy tables to local
	tbl0 = _mm_load_si128((__m128i *)table);
	tbl1 = _mm_load_si128((__m128i *)table + 1);
	tbl2 = _mm_load_si128((__m128i *)table + 2);
	tbl3 = _mm_load_si128((__m128i *)table + 3);
	tbl4 = _mm_load_si128((__m128i *)table + 4);
	tbl5 = _mm_load_si128((__m128i *)table + 5);
	tbl6 = _mm_load_si128((__m128i *)table + 6);
	tbl7 = _mm_load_si128((__m128i *)table + 7);

	mask = _mm_set1_epi16(0x000F);	// 0x000F *8

	// ">= 16" instead of "!= 0": the counter is unsigned, so a size that is not a
	// multiple of 16 would otherwise wrap around and run past the buffer.
	while (bsize >= 16){
		src = _mm_load_si128((__m128i *)data);	// read source 16-bytes

		// nibble 0: index sits in the low byte of each word for the low table,
		// and is shifted into the high byte for the high table
		idx = _mm_and_si128(src, mask);
		dest = _mm_shuffle_epi8(tbl0, idx);
		dest = _mm_xor_si128(dest, _mm_shuffle_epi8(tbl1, _mm_slli_epi16(idx, 8)));
		src = _mm_srli_epi16(src, 4);	// prepare next 4-bit

		// nibble 1
		idx = _mm_and_si128(src, mask);
		dest = _mm_xor_si128(dest, _mm_shuffle_epi8(tbl2, idx));
		dest = _mm_xor_si128(dest, _mm_shuffle_epi8(tbl3, _mm_slli_epi16(idx, 8)));
		src = _mm_srli_epi16(src, 4);

		// nibble 2
		idx = _mm_and_si128(src, mask);
		dest = _mm_xor_si128(dest, _mm_shuffle_epi8(tbl4, idx));
		dest = _mm_xor_si128(dest, _mm_shuffle_epi8(tbl5, _mm_slli_epi16(idx, 8)));
		src = _mm_srli_epi16(src, 4);

		// nibble 3: after three shifts only the top nibble is left, no mask needed
		dest = _mm_xor_si128(dest, _mm_shuffle_epi8(tbl6, src));
		dest = _mm_xor_si128(dest, _mm_shuffle_epi8(tbl7, _mm_slli_epi16(src, 8)));

		_mm_store_si128((__m128i *)data, dest);

		data += 16;
		bsize -= 16;
	}
}
|
||
|
||
// Multiply-accumulate two ALTMAP-ed sources into one destination in a single
// pass, which reduces the number of memory accesses to output.
// Two 128-byte tables must be prepared back to back: tables 0-7 are applied to
// input1 and tables 8-15 to input2.
// There are not enough xmm registers to keep them all, so the tables are
// loaded from memory on every iteration.
//
// In every 32-byte chunk the first 16 source bytes index the first four tables
// of a set (low nibble, then high nibble) and the second 16 bytes the last four.
// Even-numbered tables accumulate into the first 16 output bytes, odd-numbered
// tables into the second 16.
//
// input1, input2 : sources, read with aligned loads (must be 16-byte aligned)
// output : destination, read-modify-written with aligned accesses (must be 16-byte aligned)
// bsize  : number of bytes; expected to be a multiple of 32, a shorter tail is left untouched
// table  : sixteen 16-byte look-up tables (256 bytes), 16-byte aligned
static void gf16_ssse3_block32_altmap2(unsigned char *input1, unsigned char *input2, unsigned char *output, unsigned int bsize, unsigned char *table)
{
	__m128i *tbl;
	__m128i mask, lo, hi, idx, res_lo, res_hi;

	tbl = (__m128i *)table;

	mask = _mm_set1_epi8(0x0F);	// 0x0F *16

	// ">= 32" instead of "!= 0": the counter is unsigned, so a size that is not a
	// multiple of 32 would otherwise wrap around and run past the buffers.
	while (bsize >= 32){
		// ---- first source, tables 0-7 ----
		lo = _mm_load_si128((__m128i *)input1);	// read source 32-bytes
		hi = _mm_load_si128((__m128i *)input1 + 1);

		idx = _mm_and_si128(lo, mask);	// src & 0x0F
		res_lo = _mm_shuffle_epi8(_mm_load_si128(tbl), idx);
		res_hi = _mm_shuffle_epi8(_mm_load_si128(tbl + 1), idx);

		idx = _mm_and_si128(_mm_srli_epi16(lo, 4), mask);	// (src >> 4) & 0x0F
		res_lo = _mm_xor_si128(res_lo, _mm_shuffle_epi8(_mm_load_si128(tbl + 2), idx));
		res_hi = _mm_xor_si128(res_hi, _mm_shuffle_epi8(_mm_load_si128(tbl + 3), idx));

		idx = _mm_and_si128(hi, mask);
		res_lo = _mm_xor_si128(res_lo, _mm_shuffle_epi8(_mm_load_si128(tbl + 4), idx));
		res_hi = _mm_xor_si128(res_hi, _mm_shuffle_epi8(_mm_load_si128(tbl + 5), idx));

		idx = _mm_and_si128(_mm_srli_epi16(hi, 4), mask);
		res_lo = _mm_xor_si128(res_lo, _mm_shuffle_epi8(_mm_load_si128(tbl + 6), idx));
		res_hi = _mm_xor_si128(res_hi, _mm_shuffle_epi8(_mm_load_si128(tbl + 7), idx));

		// ---- second source, tables 8-15 ----
		lo = _mm_load_si128((__m128i *)input2);	// read source 32-bytes
		hi = _mm_load_si128((__m128i *)input2 + 1);

		idx = _mm_and_si128(lo, mask);
		res_lo = _mm_xor_si128(res_lo, _mm_shuffle_epi8(_mm_load_si128(tbl + 8), idx));
		res_hi = _mm_xor_si128(res_hi, _mm_shuffle_epi8(_mm_load_si128(tbl + 9), idx));

		idx = _mm_and_si128(_mm_srli_epi16(lo, 4), mask);
		res_lo = _mm_xor_si128(res_lo, _mm_shuffle_epi8(_mm_load_si128(tbl + 10), idx));
		res_hi = _mm_xor_si128(res_hi, _mm_shuffle_epi8(_mm_load_si128(tbl + 11), idx));

		idx = _mm_and_si128(hi, mask);
		res_lo = _mm_xor_si128(res_lo, _mm_shuffle_epi8(_mm_load_si128(tbl + 12), idx));
		res_hi = _mm_xor_si128(res_hi, _mm_shuffle_epi8(_mm_load_si128(tbl + 13), idx));

		idx = _mm_and_si128(_mm_srli_epi16(hi, 4), mask);
		res_lo = _mm_xor_si128(res_lo, _mm_shuffle_epi8(_mm_load_si128(tbl + 14), idx));
		res_hi = _mm_xor_si128(res_hi, _mm_shuffle_epi8(_mm_load_si128(tbl + 15), idx));

		// ---- accumulate both products into the destination ----
		res_lo = _mm_xor_si128(res_lo, _mm_load_si128((__m128i *)output));
		res_hi = _mm_xor_si128(res_hi, _mm_load_si128((__m128i *)output + 1));
		_mm_store_si128((__m128i *)output, res_lo);	// write dest 32-bytes
		_mm_store_si128((__m128i *)output + 1, res_hi);

		input1 += 32;
		input2 += 32;
		output += 32;
		bsize -= 32;
	}
}
|
||
|
||
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
|
||
// AVX2 命令を使うには Windows 7 以降じゃないといけない
|
||
|
||
// _mm256_permute2x128_si256 の control の意味は以下を参照
|
||
// http://www.felixcloutier.com/x86/VPERM2I128.html
|
||
|
||
// Build the eight 16-entry nibble tables for multiplying by "factor" in
// GF(2^16) (PRIM_POLY = 0x1100B), doubling all 16 multiples at once with AVX2.
// Doubling everything with AVX2 is about 13% faster, but even though table
// creation gets a little faster, the overall speed hardly changes...
//
// mtab   : receives 128 bytes and must be 32-byte aligned. For k = 0..3:
//            mtab[32k    .. 32k+15] = low  bytes of factor * (i << 4k), i = 0..15
//            mtab[32k+16 .. 32k+31] = high bytes of the same products
// factor : multiplier; it is expected to fit in 16 bits
static void create_eight_table_avx2(unsigned char *mtab, int factor)
{
	unsigned int f2, f4, f8;
	int count, step;
	__m128i xmm0, xmm1, xmm2, xmm3, mask8;
	__m256i ymm0, ymm1, ymm2, ymm3, base, poly, mask16;

	// factor * 2, * 4 and * 8, each reduced by the polynomial when bit 15 was set.
	// The reduction mask is derived with unsigned arithmetic: shifting bit 15 of a
	// signed int into the sign bit and back ((x << 16) >> 31) is undefined /
	// implementation-defined behaviour in C.
	f2 = (unsigned int)factor;
	f2 = (f2 << 1) ^ ((0u - ((f2 >> 15) & 1u)) & 0x1100Bu);
	f4 = (f2 << 1) ^ ((0u - ((f2 >> 15) & 1u)) & 0x1100Bu);
	f8 = (f4 << 1) ^ ((0u - ((f4 >> 15) & 1u)) & 0x1100Bu);

	// place the four basis values (word 7 is written leftmost in the diagrams)
	xmm0 = _mm_cvtsi32_si128(factor);	// [_][_][_][_][_][_][_][1]
	xmm1 = _mm_insert_epi16(_mm_setzero_si128(), (int)f2, 1);	// [_][_][_][_][_][_][2][_]
	xmm2 = _mm_insert_epi16(_mm_setzero_si128(), (int)f4, 4);	// [_][_][_][4][_][_][_][_]
	xmm3 = _mm_cvtsi32_si128((int)f8);	// [_][_][_][_][_][_][_][8]

	// combine them by XOR into factor * 0..7 and factor * 8..15
	xmm1 = _mm_unpacklo_epi16(xmm1, xmm1);	// [_][_][_][_][2][2][_][_]
	xmm0 = _mm_shufflelo_epi16(xmm0, _MM_SHUFFLE(0, 1, 0, 1));	// [_][_][_][_][1][_][1][_]
	xmm3 = _mm_unpacklo_epi16(xmm3, xmm3);	// [_][_][_][_][_][_][8][8]
	xmm0 = _mm_xor_si128(xmm0, xmm1);	// [_][_][_][_][3][2][1][_]
	xmm2 = _mm_shufflehi_epi16(xmm2, _MM_SHUFFLE(0, 0, 0, 0));	// [4][4][4][4][_][_][_][_]
	xmm0 = _mm_unpacklo_epi64(xmm0, xmm0);	// [3][2][1][_][3][2][1][_]
	xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0, 0, 0, 0));	// [8][8][8][8][8][8][8][8]
	xmm2 = _mm_xor_si128(xmm2, xmm0);	// [7][6][5][4][3][2][1][_]
	xmm3 = _mm_xor_si128(xmm3, xmm2);	// [15][14][13][12][11][10][9][8]

	// It seems that switching to AVX2 instructions midway does not slow things down
	poly = _mm256_set1_epi16(0x100B);	// low 16 bits of PRIM_POLY = 0x1100B, *16
	mask16 = _mm256_set1_epi16(0x00FF);	// 0x00FF *16
	base = _mm256_setzero_si256();
	base = _mm256_inserti128_si256(base, xmm2, 0);
	base = _mm256_inserti128_si256(base, xmm3, 1);

	// Writing the first table pair to memory while the ymm register is being
	// set up is faster
	mask8 = _mm_set1_epi16(0x00FF);	// 0x00FF *8
	xmm0 = _mm_packus_epi16(_mm_and_si128(xmm2, mask8), _mm_and_si128(xmm3, mask8));	// lower 8-bit * 16
	xmm1 = _mm_packus_epi16(_mm_srli_epi16(xmm2, 8), _mm_srli_epi16(xmm3, 8));	// higher 8-bit * 16
	_mm_store_si128((__m128i *)mtab    , xmm0);
	_mm_store_si128((__m128i *)mtab + 1, xmm1);

	for (count = 1; count < 4; count++){
		// multiply all 16 entries by 16: four doublings with polynomial reduction
		for (step = 0; step < 4; step++){
			ymm0 = _mm256_slli_epi16(base, 1);
			ymm1 = _mm256_srai_epi16(base, 15);	// all ones where bit 15 was set
			ymm1 = _mm256_and_si256(ymm1, poly);
			base = _mm256_xor_si256(ymm1, ymm0);
		}

		// rearrange into 16 low bytes followed by 16 high bytes and store
		ymm0 = _mm256_and_si256(base, mask16);	// lower 8-bit * 16
		ymm1 = _mm256_srli_epi16(base, 8);	// higher 8-bit * 16
		ymm2 = _mm256_permute2x128_si256(ymm0, ymm1, 0x20);
		ymm3 = _mm256_permute2x128_si256(ymm0, ymm1, 0x31);
		ymm0 = _mm256_packus_epi16(ymm2, ymm3);
		_mm256_store_si256((__m256i *)mtab + count, ymm0);
	}

	// avoid the AVX-SSE transition penalty
	_mm256_zeroupper();
}
|
||
|
||
// Multiply only, for inverse-matrix computation (the result overwrites the
// data, it is not XOR-accumulated).
// Works on plain 16-bit words, 32 bytes (16 words) per iteration.
//
// data  : buffer updated in place with aligned accesses (must be 32-byte aligned)
// bsize : number of bytes; expected to be a multiple of 32, a shorter tail is left untouched
// table : four 32-byte entries, 32-byte aligned. Entry k holds the 16-byte table of
//         low result bytes followed by the 16-byte table of high result bytes for
//         nibble k of each word. The byte lane that is not being looked up always
//         selects entry 0, so entry 0 of every table must be zero for the result
//         to be a pure product.
static void gf16_avx2_block32s(unsigned char *data, unsigned int bsize, unsigned char *table)
{
	__m256i low0, low1, low2, low3, high0, high1, high2, high3;
	__m256i ent0, ent1, ent2, ent3, mask, src, idx, dest;

	// load the four [low][high] table entries
	ent0 = _mm256_load_si256((__m256i *)table);	// nibble 0: 0x000f
	ent1 = _mm256_load_si256((__m256i *)table + 1);	// nibble 1: 0x00f0
	ent2 = _mm256_load_si256((__m256i *)table + 2);	// nibble 2: 0x0f00
	ent3 = _mm256_load_si256((__m256i *)table + 3);	// nibble 3: 0xf000

	// split to 8 tables, each duplicated into both 128-bit lanes
	low0 = _mm256_permute2x128_si256(ent0, ent0, 0x00);	// [low0][low0]
	low1 = _mm256_permute2x128_si256(ent1, ent1, 0x00);	// [low1][low1]
	low2 = _mm256_permute2x128_si256(ent2, ent2, 0x00);	// [low2][low2]
	low3 = _mm256_permute2x128_si256(ent3, ent3, 0x00);	// [low3][low3]
	high0 = _mm256_permute2x128_si256(ent0, ent0, 0x11);	// [high0][high0]
	high1 = _mm256_permute2x128_si256(ent1, ent1, 0x11);	// [high1][high1]
	high2 = _mm256_permute2x128_si256(ent2, ent2, 0x11);	// [high2][high2]
	high3 = _mm256_permute2x128_si256(ent3, ent3, 0x11);	// [high3][high3]

	mask = _mm256_set1_epi16(0x000F);	// 0x000F *16

	// ">= 32" instead of "!= 0": the counter is unsigned, so a size that is not a
	// multiple of 32 would otherwise wrap around and run past the buffer.
	while (bsize >= 32){
		src = _mm256_load_si256((__m256i *)data);	// read source 32-bytes

		// nibble 0: the index sits in the low byte of each word for the low
		// table and is shifted into the high byte for the high table
		idx = _mm256_and_si256(src, mask);
		dest = _mm256_shuffle_epi8(low0, idx);
		dest = _mm256_xor_si256(dest, _mm256_shuffle_epi8(high0, _mm256_slli_epi16(idx, 8)));
		src = _mm256_srli_epi16(src, 4);	// prepare next 4-bit

		// nibble 1
		idx = _mm256_and_si256(src, mask);
		dest = _mm256_xor_si256(dest, _mm256_shuffle_epi8(low1, idx));
		dest = _mm256_xor_si256(dest, _mm256_shuffle_epi8(high1, _mm256_slli_epi16(idx, 8)));
		src = _mm256_srli_epi16(src, 4);

		// nibble 2
		idx = _mm256_and_si256(src, mask);
		dest = _mm256_xor_si256(dest, _mm256_shuffle_epi8(low2, idx));
		dest = _mm256_xor_si256(dest, _mm256_shuffle_epi8(high2, _mm256_slli_epi16(idx, 8)));
		src = _mm256_srli_epi16(src, 4);

		// nibble 3
		idx = _mm256_and_si256(src, mask);
		dest = _mm256_xor_si256(dest, _mm256_shuffle_epi8(low3, idx));
		dest = _mm256_xor_si256(dest, _mm256_shuffle_epi8(high3, _mm256_slli_epi16(idx, 8)));

		_mm256_store_si256((__m256i *)data, dest);	// write dest 32-bytes

		data += 32;
		bsize -= 32;
	}

	// avoid the AVX-SSE transition penalty
	_mm256_zeroupper();
}
|
||
|
||
// Multiply-accumulate for sources that are not ALTMAP-ed (kept for the
// inverse-matrix computation): the table product of every plain 16-bit source
// word is XORed into output, 32 bytes (16 words) per iteration.
//
// input  : source bytes; does not need to be 32-byte aligned
// output : destination, read-modify-written with aligned accesses (must be 32-byte aligned)
// bsize  : number of bytes; expected to be a multiple of 32, a shorter tail is left untouched
// table  : four 32-byte entries, 32-byte aligned. Entry k holds the 16-byte table of
//          low result bytes followed by the 16-byte table of high result bytes for
//          nibble k of each word. The byte lane that is not being looked up always
//          selects entry 0, so entry 0 of every table must be zero for the result
//          to be a pure product.
static void gf16_avx2_block32u(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
{
	__m256i low0, low1, low2, low3, high0, high1, high2, high3;
	__m256i ent0, ent1, ent2, ent3, mask, src, idx, dest;

	// load the four [low][high] table entries
	ent0 = _mm256_load_si256((__m256i *)table);	// nibble 0: 0x000f
	ent1 = _mm256_load_si256((__m256i *)table + 1);	// nibble 1: 0x00f0
	ent2 = _mm256_load_si256((__m256i *)table + 2);	// nibble 2: 0x0f00
	ent3 = _mm256_load_si256((__m256i *)table + 3);	// nibble 3: 0xf000

	// split to 8 tables, each duplicated into both 128-bit lanes
	low0 = _mm256_permute2x128_si256(ent0, ent0, 0x00);	// [low0][low0]
	low1 = _mm256_permute2x128_si256(ent1, ent1, 0x00);	// [low1][low1]
	low2 = _mm256_permute2x128_si256(ent2, ent2, 0x00);	// [low2][low2]
	low3 = _mm256_permute2x128_si256(ent3, ent3, 0x00);	// [low3][low3]
	high0 = _mm256_permute2x128_si256(ent0, ent0, 0x11);	// [high0][high0]
	high1 = _mm256_permute2x128_si256(ent1, ent1, 0x11);	// [high1][high1]
	high2 = _mm256_permute2x128_si256(ent2, ent2, 0x11);	// [high2][high2]
	high3 = _mm256_permute2x128_si256(ent3, ent3, 0x11);	// [high3][high3]

	mask = _mm256_set1_epi16(0x000F);	// 0x000F *16

	// ">= 32" instead of "!= 0": the counter is unsigned, so a size that is not a
	// multiple of 32 would otherwise wrap around and run past both buffers.
	while (bsize >= 32){
		src = _mm256_loadu_si256((__m256i *)input);	// read source 32-bytes (unaligned)
		dest = _mm256_load_si256((__m256i *)output);	// read dest 32-bytes

		// nibble 0: the index sits in the low byte of each word for the low
		// table and is shifted into the high byte for the high table
		idx = _mm256_and_si256(src, mask);
		dest = _mm256_xor_si256(dest, _mm256_shuffle_epi8(low0, idx));
		dest = _mm256_xor_si256(dest, _mm256_shuffle_epi8(high0, _mm256_slli_epi16(idx, 8)));
		src = _mm256_srli_epi16(src, 4);	// prepare next 4-bit

		// nibble 1
		idx = _mm256_and_si256(src, mask);
		dest = _mm256_xor_si256(dest, _mm256_shuffle_epi8(low1, idx));
		dest = _mm256_xor_si256(dest, _mm256_shuffle_epi8(high1, _mm256_slli_epi16(idx, 8)));
		src = _mm256_srli_epi16(src, 4);

		// nibble 2
		idx = _mm256_and_si256(src, mask);
		dest = _mm256_xor_si256(dest, _mm256_shuffle_epi8(low2, idx));
		dest = _mm256_xor_si256(dest, _mm256_shuffle_epi8(high2, _mm256_slli_epi16(idx, 8)));
		src = _mm256_srli_epi16(src, 4);

		// nibble 3
		idx = _mm256_and_si256(src, mask);
		dest = _mm256_xor_si256(dest, _mm256_shuffle_epi8(low3, idx));
		dest = _mm256_xor_si256(dest, _mm256_shuffle_epi8(high3, _mm256_slli_epi16(idx, 8)));

		_mm256_store_si256((__m256i *)output, dest);	// write dest 32-bytes

		input += 32;
		output += 32;
		bsize -= 32;
	}

	// avoid the AVX-SSE transition penalty
	_mm256_zeroupper();
}
|
||
|
||
// Multiply-accumulate for ALTMAP-ed data: the table product of the source is
// XORed into output, 32 bytes per iteration.
// By rearranging the tables up front, the number of lane permutes inside the
// loop is reduced to one.
//
// In every 32-byte chunk the low 128-bit lane is looked up with the tables of
// nibbles 0 and 1 and the high lane with the tables of nibbles 2 and 3;
// presumably the chunk holds the low bytes of 16 words followed by their high
// bytes. Low result bytes end up in the low lane of output, high result bytes
// in the high lane.
//
// input  : source, read with aligned loads (must be 32-byte aligned)
// output : destination, read-modify-written with aligned accesses (must be 32-byte aligned)
// bsize  : number of bytes; expected to be a multiple of 32, a shorter tail is left untouched
// table  : four 32-byte entries, 32-byte aligned. Entry k holds the 16-byte table of
//          low result bytes followed by the 16-byte table of high result bytes for nibble k.
static void gf16_avx2_block32(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
{
	__m256i ent0, ent1, ent2, ent3;
	__m256i straight_lo, straight_hi, crossed_lo, crossed_hi;
	__m256i mask, nib_lo, nib_hi, same_lane, other_lane, dest;

	// load the four [low][high] table entries
	ent0 = _mm256_load_si256((__m256i *)table);	// [low0][high0]
	ent1 = _mm256_load_si256((__m256i *)table + 1);	// [low1][high1]
	ent2 = _mm256_load_si256((__m256i *)table + 2);	// [low2][high2]
	ent3 = _mm256_load_si256((__m256i *)table + 3);	// [low3][high3]

	// re-arrange table order (blend is said to be faster than permute)
	// "straight" tables produce a result byte that stays in the lane of its source byte,
	// "crossed" tables produce the byte that belongs in the opposite lane
	straight_lo = _mm256_blend_epi32(ent0, ent2, 0xF0);	// [low0][high2], indexed by src & 0x0F
	straight_hi = _mm256_blend_epi32(ent1, ent3, 0xF0);	// [low1][high3], indexed by src >> 4
	crossed_lo = _mm256_permute2x128_si256(ent2, ent0, 0x03);	// [high0][low2], indexed by src & 0x0F
	crossed_hi = _mm256_permute2x128_si256(ent3, ent1, 0x03);	// [high1][low3], indexed by src >> 4

	mask = _mm256_set1_epi8(0x0F);	// 0x0F *32

	// ">= 32" instead of "!= 0": the counter is unsigned, so a size that is not a
	// multiple of 32 would otherwise wrap around and run past both buffers.
	while (bsize >= 32){
		nib_lo = _mm256_load_si256((__m256i *)input);	// read source 32-bytes
		nib_hi = _mm256_and_si256(_mm256_srli_epi16(nib_lo, 4), mask);	// (src >> 4) & 0x0F
		nib_lo = _mm256_and_si256(nib_lo, mask);	// src & 0x0F

		// table look-up and combine
		same_lane = _mm256_xor_si256(_mm256_shuffle_epi8(straight_lo, nib_lo), _mm256_shuffle_epi8(straight_hi, nib_hi));
		other_lane = _mm256_xor_si256(_mm256_shuffle_epi8(crossed_lo, nib_lo), _mm256_shuffle_epi8(crossed_hi, nib_hi));
		other_lane = _mm256_permute2x128_si256(other_lane, other_lane, 0x01);	// exchange low & high 128-bit

		dest = _mm256_load_si256((__m256i *)output);	// read dest 32-bytes
		dest = _mm256_xor_si256(dest, _mm256_xor_si256(same_lane, other_lane));
		_mm256_store_si256((__m256i *)output, dest);	// write dest 32-bytes

		input += 32;
		output += 32;
		bsize -= 32;
	}

	// avoid the AVX-SSE transition penalty
	_mm256_zeroupper();
}
|
||
|
||
/*
|
||
// テーブルを並び替えて使えば、ループ内の並び替え回数を減らせる
|
||
static void gf16_avx2_block32(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
|
||
{
|
||
__m256i tbl0, tbl1, tbl2, tbl3, mask, dest, src0, src1, src2, src3;
|
||
|
||
// copy tables to local
|
||
src0 = _mm256_load_si256((__m256i *)table); // tbl0[low0][high0] <- 0x0f[lo][lo]
|
||
src1 = _mm256_load_si256((__m256i *)table + 1); // tbl1[low1][high1] <- 0xf0[lo][lo]
|
||
src2 = _mm256_load_si256((__m256i *)table + 2); // tbl2[low2][high2] <- 0x0f[hi][hi]
|
||
src3 = _mm256_load_si256((__m256i *)table + 3); // tbl3[low3][high3] <- 0xf0[hi][hi]
|
||
|
||
// re-arrange table order
|
||
tbl0 = _mm256_permute2x128_si256(src0, src2, 0x30); // tblA[low0][high2] <- 0x0f[lo][hi]
|
||
tbl1 = _mm256_permute2x128_si256(src1, src3, 0x30); // tblB[low1][high3] <- 0xf0[lo][hi]
|
||
tbl2 = _mm256_permute2x128_si256(src2, src0, 0x30); // tblC[low2][high0] <- 0x0f[hi][lo]
|
||
tbl3 = _mm256_permute2x128_si256(src3, src1, 0x30); // tblD[low3][high1] <- 0xf0[hi][lo]
|
||
|
||
// create mask for 32 entries
|
||
mask = _mm256_set1_epi8(0x0F); // 0x0F *32
|
||
|
||
while (bsize != 0){
|
||
src0 = _mm256_load_si256((__m256i *)input); // read source 32-bytes
|
||
dest = _mm256_load_si256((__m256i *)output); // read dest 32-bytes
|
||
|
||
src1 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit
|
||
src0 = _mm256_and_si256(src0, mask); // src & 0x0F
|
||
src1 = _mm256_and_si256(src1, mask); // (src >> 4) & 0x0F
|
||
|
||
src2 = _mm256_permute2x128_si256(src0, src0, 0x01); // exchange low & high 128-bit of "src & 0x0F"
|
||
src3 = _mm256_permute2x128_si256(src1, src1, 0x01); // exchange low & high 128-bit of "(src >> 4) & 0x0F"
|
||
|
||
src0 = _mm256_shuffle_epi8(tbl0, src0); // table look-up
|
||
src1 = _mm256_shuffle_epi8(tbl1, src1);
|
||
src2 = _mm256_shuffle_epi8(tbl2, src2);
|
||
src3 = _mm256_shuffle_epi8(tbl3, src3);
|
||
|
||
src0 = _mm256_xor_si256(src0, src1); // combine result
|
||
src2 = _mm256_xor_si256(src2, src3);
|
||
dest = _mm256_xor_si256(dest, src0);
|
||
dest = _mm256_xor_si256(dest, src2);
|
||
_mm256_store_si256((__m256i *)output, dest); // write dest 32-bytes
|
||
|
||
input += 32;
|
||
output += 32;
|
||
bsize -= 32;
|
||
}
|
||
|
||
// AVX-SSE 切り替えの回避
|
||
_mm256_zeroupper();
|
||
}
|
||
*/
|
||
|
||
/*
|
||
// レジスタを大量に使って依存関係をなくせば並列処理できるかも?
|
||
static void gf16_avx2_block32(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
|
||
{
|
||
__m256i tbl0, tbl1, tbl2, tbl3, mask, dest, src0, src1, tmp0, tmp1, tmp2, tmp3;
|
||
|
||
// copy tables to local
|
||
tbl0 = _mm256_load_si256((__m256i *)table);
|
||
tbl1 = _mm256_load_si256((__m256i *)table + 1);
|
||
tbl2 = _mm256_load_si256((__m256i *)table + 2);
|
||
tbl3 = _mm256_load_si256((__m256i *)table + 3);
|
||
|
||
// create mask for 32 entries
|
||
mask = _mm256_set1_epi8(0x0F); // 0x0F *32
|
||
|
||
while (bsize != 0){
|
||
src0 = _mm256_load_si256((__m256i *)input); // read source 32-bytes
|
||
dest = _mm256_load_si256((__m256i *)output); // read dest 32-bytes
|
||
|
||
src1 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit
|
||
src0 = _mm256_and_si256(src0, mask); // src & 0x0F
|
||
src1 = _mm256_and_si256(src1, mask); // (src >> 4) & 0x0F
|
||
|
||
tmp0 = _mm256_permute2x128_si256(src0, src0, 0x00); // copy low 128-bit to high from "src & 0x0F"
|
||
tmp1 = _mm256_permute2x128_si256(src1, src1, 0x00); // copy low 128-bit to high from "(src >> 4) & 0x0F"
|
||
tmp2 = _mm256_permute2x128_si256(src0, src0, 0x11); // copy high 128-bit to low from "src & 0x0F"
|
||
tmp3 = _mm256_permute2x128_si256(src1, src1, 0x11); // copy high 128-bit to low from "(src >> 4) & 0x0F"
|
||
|
||
tmp0 = _mm256_shuffle_epi8(tbl0, tmp0); // table look-up
|
||
tmp1 = _mm256_shuffle_epi8(tbl1, tmp1);
|
||
tmp2 = _mm256_shuffle_epi8(tbl2, tmp2);
|
||
tmp3 = _mm256_shuffle_epi8(tbl3, tmp3);
|
||
|
||
tmp0 = _mm256_xor_si256(tmp0, tmp1); // combine result
|
||
tmp2 = _mm256_xor_si256(tmp2, tmp3);
|
||
dest = _mm256_xor_si256(dest, tmp0);
|
||
dest = _mm256_xor_si256(dest, tmp2);
|
||
_mm256_store_si256((__m256i *)output, dest); // write dest 32-bytes
|
||
|
||
input += 32;
|
||
output += 32;
|
||
bsize -= 32;
|
||
}
|
||
|
||
// AVX-SSE 切り替えの回避
|
||
_mm256_zeroupper();
|
||
}
|
||
*/
|
||
|
||
// Multiply-accumulate two source regions into one destination in a single
// pass, so that the destination is read and written only once per 32 bytes:
//
//   output ^= lookup(input1, table[0..127]) ^ lookup(input2, table[128..255])
//
// input1, input2, output : 32-byte aligned regions (aligned loads/stores are used).
//                          Each 32-byte unit is treated as two 128-bit lanes;
//                          presumably this is the layout written by
//                          galois_altmap32_change (low bytes of 16 words in the
//                          lower lane, high bytes in the upper lane).
// bsize                  : number of bytes to process; only whole 32-byte units
//                          are processed, a shorter tail is left untouched.
// table                  : 32-byte aligned, 256 bytes = two 128-byte tables.
//                          Each 128-byte table is four 32-byte vectors:
//                            vector 0 : indexed by the low  nibble of lower-lane bytes
//                            vector 1 : indexed by the high nibble of lower-lane bytes
//                            vector 2 : indexed by the low  nibble of upper-lane bytes
//                            vector 3 : indexed by the high nibble of upper-lane bytes
//                          In every vector the first 16 bytes contribute to the
//                          lower output lane and the last 16 bytes to the upper one.
static void gf16_avx2_block32_2(unsigned char *input1, unsigned char *input2, unsigned char *output, unsigned int bsize, unsigned char *table)
{
	__m256i mask, src0, src1, tmp0, tmp1, tmp2, tmp3;
	__m256i tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7;

	// load the first table (applied to input1)
	tmp0 = _mm256_load_si256((__m256i *)table);
	tmp1 = _mm256_load_si256((__m256i *)table + 1);
	tmp2 = _mm256_load_si256((__m256i *)table + 2);
	tmp3 = _mm256_load_si256((__m256i *)table + 3);

	// Re-arrange the halves so that one in-lane shuffle serves both lanes.
	// tbl0/tbl1 produce results that are already in their final lane
	// (blend is reportedly faster than permute, so it is used where possible).
	tbl0 = _mm256_blend_epi32(tmp0, tmp2, 0xF0);		// [low half of vec0][high half of vec2]
	tbl1 = _mm256_blend_epi32(tmp1, tmp3, 0xF0);		// [low half of vec1][high half of vec3]
	// tbl2/tbl3 produce results for the opposite lane; they are swapped back
	// once per iteration just before being merged into the destination.
	tbl2 = _mm256_permute2x128_si256(tmp2, tmp0, 0x03);	// [high half of vec0][low half of vec2]
	tbl3 = _mm256_permute2x128_si256(tmp3, tmp1, 0x03);	// [high half of vec1][low half of vec3]

	// load the second table (applied to input2) and re-arrange it the same way
	tmp0 = _mm256_load_si256((__m256i *)table + 4);
	tmp1 = _mm256_load_si256((__m256i *)table + 5);
	tmp2 = _mm256_load_si256((__m256i *)table + 6);
	tmp3 = _mm256_load_si256((__m256i *)table + 7);
	tbl4 = _mm256_blend_epi32(tmp0, tmp2, 0xF0);
	tbl5 = _mm256_blend_epi32(tmp1, tmp3, 0xF0);
	tbl6 = _mm256_permute2x128_si256(tmp2, tmp0, 0x03);
	tbl7 = _mm256_permute2x128_si256(tmp3, tmp1, 0x03);

	// nibble mask: 0x0F in each of the 32 bytes
	mask = _mm256_set1_epi8(0x0F);

	// ">= 32" instead of "!= 0": a size that is not a multiple of 32 must not
	// wrap the unsigned counter and run off the end of the buffers.
	while (bsize >= 32){
		// first source: split every byte into its two nibbles
		src0 = _mm256_load_si256((__m256i *)input1);
		src1 = _mm256_and_si256(_mm256_srli_epi16(src0, 4), mask);	// (src >> 4) & 0x0F
		src0 = _mm256_and_si256(src0, mask);				// src & 0x0F

		tmp0 = _mm256_shuffle_epi8(tbl0, src0);	// table look-up
		tmp1 = _mm256_shuffle_epi8(tbl1, src1);
		tmp2 = _mm256_shuffle_epi8(tbl2, src0);
		tmp3 = _mm256_shuffle_epi8(tbl3, src1);
		tmp0 = _mm256_xor_si256(tmp0, tmp1);	// same-lane partial product
		tmp2 = _mm256_xor_si256(tmp2, tmp3);	// cross-lane partial product

		// second source: same procedure with the second table
		src0 = _mm256_load_si256((__m256i *)input2);
		src1 = _mm256_and_si256(_mm256_srli_epi16(src0, 4), mask);
		src0 = _mm256_and_si256(src0, mask);

		tmp1 = _mm256_shuffle_epi8(tbl4, src0);
		tmp3 = _mm256_shuffle_epi8(tbl6, src0);
		tmp0 = _mm256_xor_si256(tmp0, tmp1);
		tmp2 = _mm256_xor_si256(tmp2, tmp3);

		tmp1 = _mm256_shuffle_epi8(tbl5, src1);
		tmp3 = _mm256_shuffle_epi8(tbl7, src1);
		tmp0 = _mm256_xor_si256(tmp0, tmp1);
		tmp2 = _mm256_xor_si256(tmp2, tmp3);

		// merge both partial products into the destination
		src0 = _mm256_load_si256((__m256i *)output);
		tmp2 = _mm256_permute2x128_si256(tmp2, tmp2, 0x01);	// swap lanes of the cross-lane part
		src0 = _mm256_xor_si256(src0, tmp0);
		src0 = _mm256_xor_si256(src0, tmp2);
		_mm256_store_si256((__m256i *)output, src0);

		input1 += 32;
		input2 += 32;
		output += 32;
		bsize -= 32;
	}

	// avoid the AVX <-> SSE transition penalty in code that runs afterwards
	_mm256_zeroupper();
}
|
||
|
||
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
|
||
|
||
// Re-arrange data in place to speed up memory access in the 32-byte kernels.
// Every 32-byte unit (16 little-endian 16-bit words) is rewritten as
// 16 low bytes followed by 16 high bytes.
//
// data  : 16-byte aligned buffer (aligned loads/stores are used)
// bsize : size in bytes; only whole 32-byte units are converted,
//         a shorter tail is left untouched
void galois_altmap32_change(unsigned char *data, unsigned int bsize)
{
	__m128i lo0, lo1, hi0, hi1, mask;

	mask = _mm_set1_epi16(0x00FF);	// keeps the low byte of each word

	// ">= 32" instead of "!= 0": a size that is not a multiple of 32 must not
	// wrap the unsigned counter and run past the end of the buffer.
	while (bsize >= 32){
		hi0 = _mm_load_si128((__m128i *)data);		// words 0..7
		hi1 = _mm_load_si128((__m128i *)data + 1);	// words 8..15

		lo0 = _mm_and_si128(hi0, mask);		// erase higher byte
		lo1 = _mm_and_si128(hi1, mask);
		hi0 = _mm_srli_epi16(hi0, 8);		// move higher byte to lower
		hi1 = _mm_srli_epi16(hi1, 8);

		// every word is now <= 0xFF, so the saturating pack is a plain narrowing
		_mm_store_si128((__m128i *)data, _mm_packus_epi16(lo0, lo1));		// 16 low bytes
		_mm_store_si128((__m128i *)data + 1, _mm_packus_epi16(hi0, hi1));	// 16 high bytes

		data += 32;
		bsize -= 32;
	}
}
|
||
|
||
// Restore the original data order in place (inverse of galois_altmap32_change).
// Every 32-byte unit holding 16 low bytes followed by 16 high bytes is
// rewritten as 16 interleaved little-endian 16-bit words.
//
// data  : 16-byte aligned buffer (aligned loads/stores are used)
// bsize : size in bytes; only whole 32-byte units are converted,
//         a shorter tail is left untouched
void galois_altmap32_return(unsigned char *data, unsigned int bsize)
{
	__m128i low_bytes, high_bytes;

	// ">= 32" instead of "!= 0": a size that is not a multiple of 32 must not
	// wrap the unsigned counter and run past the end of the buffer.
	while (bsize >= 32){
		low_bytes  = _mm_load_si128((__m128i *)data);
		high_bytes = _mm_load_si128((__m128i *)data + 1);

		// interleave lower and higher bytes back into words
		_mm_store_si128((__m128i *)data, _mm_unpacklo_epi8(low_bytes, high_bytes));	// words 0..7
		_mm_store_si128((__m128i *)data + 1, _mm_unpackhi_epi8(low_bytes, high_bytes));	// words 8..15

		data += 32;
		bsize -= 32;
	}
}
|
||
|
||
// No-op counterpart of the galois_altmap*_change / _return functions,
// for the case where the data is not re-arranged.
// Both parameters are accepted only to match their signature.
void galois_altmap_none(unsigned char *data, unsigned int bsize)
{
	(void)data;	// intentionally unused
	(void)bsize;
}
|
||
|
||
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
|
||
/*
|
||
from ParPar; "gf_w16_additions.c"
|
||
gf_w16_xor_lazy_sse_altmap_multiply_region
|
||
gf_w16_xor_lazy_sse_jit_altmap_multiply_region
|
||
*/
|
||
|
||
// 256バイトごとにビット単位の XOR で計算する方法
|
||
// input と output の領域は重ならないようにすること
|
||
static void gf16_sse2_block256(unsigned char *input, unsigned char *output, unsigned int bsize, int factor)
|
||
{
|
||
unsigned int i, bit;
|
||
unsigned int counts[16];
|
||
uintptr_t deptable[16][16];
|
||
__m128i depmask1, depmask2, polymask1, polymask2, addvals1, addvals2;
|
||
unsigned short tmp_depmask[16];
|
||
|
||
// calculate dependent bits
|
||
addvals1 = _mm_set_epi16(1<< 7, 1<< 6, 1<< 5, 1<< 4, 1<< 3, 1<< 2, 1<<1, 1<<0);
|
||
addvals2 = _mm_set_epi16(1<<15, 1<<14, 1<<13, 1<<12, 1<<11, 1<<10, 1<<9, 1<<8);
|
||
|
||
// duplicate each bit in the polynomial 16 times
|
||
polymask2 = _mm_set1_epi16(PRIM_POLY & 0xFFFF); // chop off top bit, although not really necessary
|
||
polymask1 = _mm_and_si128(polymask2, _mm_set_epi16(1<< 8, 1<< 9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, 1<<15));
|
||
polymask2 = _mm_and_si128(polymask2, _mm_set_epi16(1<< 0, 1<< 1, 1<< 2, 1<< 3, 1<< 4, 1<< 5, 1<< 6, 1<< 7));
|
||
polymask1 = _mm_cmpeq_epi16(_mm_setzero_si128(), polymask1);
|
||
polymask2 = _mm_cmpeq_epi16(_mm_setzero_si128(), polymask2);
|
||
|
||
if (factor & (1<<15)){
|
||
// XOR
|
||
depmask1 = addvals1;
|
||
depmask2 = addvals2;
|
||
} else {
|
||
depmask1 = _mm_setzero_si128();
|
||
depmask2 = _mm_setzero_si128();
|
||
}
|
||
for (i = (1<<14); i; i >>= 1){
|
||
// rotate
|
||
__m128i last = _mm_shuffle_epi32(_mm_shufflelo_epi16(depmask1, 0), 0);
|
||
depmask1 = _mm_insert_epi16(
|
||
_mm_srli_si128(depmask1, 2),
|
||
_mm_extract_epi16(depmask2, 0),
|
||
7
|
||
);
|
||
depmask2 = _mm_srli_si128(depmask2, 2);
|
||
|
||
// XOR poly
|
||
depmask1 = _mm_xor_si128(depmask1, _mm_andnot_si128(polymask1, last));
|
||
depmask2 = _mm_xor_si128(depmask2, _mm_andnot_si128(polymask2, last));
|
||
|
||
if (factor & i){
|
||
// XOR
|
||
depmask1 = _mm_xor_si128(depmask1, addvals1);
|
||
depmask2 = _mm_xor_si128(depmask2, addvals2);
|
||
}
|
||
}
|
||
|
||
// generate needed tables
|
||
_mm_storeu_si128((__m128i*)(tmp_depmask), depmask1);
|
||
_mm_storeu_si128((__m128i*)(tmp_depmask + 8), depmask2);
|
||
for (bit = 0; bit < 16; bit++){
|
||
unsigned int cnt = 0;
|
||
for (i = 0; i < 16; i++){
|
||
if (tmp_depmask[bit] & (1<<i)){
|
||
deptable[bit][cnt++] = i<<4; // pre-multiply because x86 addressing can't do a x16; this saves a shift operation later
|
||
}
|
||
}
|
||
counts[bit] = cnt;
|
||
}
|
||
|
||
while (bsize != 0){
|
||
#define STEP(bit) { \
|
||
uintptr_t* deps = deptable[bit]; \
|
||
__m128i tmp = _mm_load_si128((__m128i *)output + bit); \
|
||
switch (counts[bit]){ \
|
||
case 16: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[15])); \
|
||
case 15: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[14])); \
|
||
case 14: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[13])); \
|
||
case 13: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[12])); \
|
||
case 12: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[11])); \
|
||
case 11: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[10])); \
|
||
case 10: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[ 9])); \
|
||
case 9: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[ 8])); \
|
||
case 8: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[ 7])); \
|
||
case 7: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[ 6])); \
|
||
case 6: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[ 5])); \
|
||
case 5: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[ 4])); \
|
||
case 4: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[ 3])); \
|
||
case 3: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[ 2])); \
|
||
case 2: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[ 1])); \
|
||
case 1: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[ 0])); \
|
||
} \
|
||
_mm_store_si128((__m128i *)output + bit, tmp); \
|
||
}
|
||
STEP( 0)
|
||
STEP( 1)
|
||
STEP( 2)
|
||
STEP( 3)
|
||
STEP( 4)
|
||
STEP( 5)
|
||
STEP( 6)
|
||
STEP( 7)
|
||
STEP( 8)
|
||
STEP( 9)
|
||
STEP(10)
|
||
STEP(11)
|
||
STEP(12)
|
||
STEP(13)
|
||
STEP(14)
|
||
STEP(15)
|
||
#undef STEP
|
||
input += 256;
|
||
output += 256;
|
||
bsize -= 256;
|
||
}
|
||
}
|
||
|
||
// bsize が 0 にならないようにすること
|
||
static void gf16_sse2_block256_jit(unsigned char *input, unsigned char *output, unsigned int bsize, int factor)
|
||
{
|
||
FAST_U32 i, bit;
|
||
long inBit;
|
||
__m128i depmask1, depmask2, polymask1, polymask2, addvals1, addvals2;
|
||
__m128i common_mask;
|
||
unsigned short tmp_depmask[16], common_depmask[8];
|
||
unsigned char * pos_startloop;
|
||
unsigned char *jit_exec, *jit_ptr;
|
||
int j, thread_id;
|
||
|
||
// calculate dependent bits
|
||
addvals1 = _mm_set_epi16(1<< 7, 1<< 6, 1<< 5, 1<< 4, 1<< 3, 1<< 2, 1<<1, 1<<0);
|
||
addvals2 = _mm_set_epi16(1<<15, 1<<14, 1<<13, 1<<12, 1<<11, 1<<10, 1<<9, 1<<8);
|
||
|
||
// duplicate each bit in the polynomial 16 times
|
||
polymask2 = _mm_set1_epi16(PRIM_POLY & 0xFFFF); // chop off top bit, although not really necessary
|
||
polymask1 = _mm_and_si128(polymask2, _mm_set_epi16(1<< 8, 1<< 9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, 1<<15));
|
||
polymask2 = _mm_and_si128(polymask2, _mm_set_epi16(1<< 0, 1<< 1, 1<< 2, 1<< 3, 1<< 4, 1<< 5, 1<< 6, 1<< 7));
|
||
polymask1 = _mm_cmpeq_epi16(_mm_setzero_si128(), polymask1);
|
||
polymask2 = _mm_cmpeq_epi16(_mm_setzero_si128(), polymask2);
|
||
|
||
if (factor & (1<<15)){
|
||
// XOR
|
||
depmask1 = addvals1;
|
||
depmask2 = addvals2;
|
||
} else {
|
||
depmask1 = _mm_setzero_si128();
|
||
depmask2 = _mm_setzero_si128();
|
||
}
|
||
for (i = (1<<14); i; i >>= 1){
|
||
// rotate
|
||
__m128i last = _mm_shuffle_epi32(_mm_shufflelo_epi16(depmask1, 0), 0);
|
||
depmask1 = _mm_insert_epi16(
|
||
_mm_srli_si128(depmask1, 2),
|
||
_mm_extract_epi16(depmask2, 0),
|
||
7
|
||
);
|
||
depmask2 = _mm_srli_si128(depmask2, 2);
|
||
|
||
// XOR poly
|
||
depmask1 = _mm_xor_si128(depmask1, _mm_andnot_si128(polymask1, last));
|
||
depmask2 = _mm_xor_si128(depmask2, _mm_andnot_si128(polymask2, last));
|
||
|
||
if (factor & i){
|
||
// XOR
|
||
depmask1 = _mm_xor_si128(depmask1, addvals1);
|
||
depmask2 = _mm_xor_si128(depmask2, addvals2);
|
||
}
|
||
}
|
||
|
||
// attempt to remove some redundant XOR ops with a simple heuristic
|
||
// heuristic: we just find common XOR elements between bit pairs
|
||
{
|
||
__m128i tmp1, tmp2;
|
||
// first, we need to re-arrange words so that we can perform bitwise AND on neighbouring pairs
|
||
// unfortunately, PACKUSDW is SSE4.1 only, so emulate it with shuffles
|
||
// 01234567 -> 02461357
|
||
tmp1 = _mm_shuffle_epi32(
|
||
_mm_shufflelo_epi16(
|
||
_mm_shufflehi_epi16(depmask1, 0xD8), /* 0xD8 == 0b11011000 */
|
||
0xD8
|
||
),
|
||
0xD8
|
||
);
|
||
tmp2 = _mm_shuffle_epi32(
|
||
_mm_shufflelo_epi16(
|
||
_mm_shufflehi_epi16(depmask2, 0xD8),
|
||
0xD8
|
||
),
|
||
0xD8
|
||
);
|
||
common_mask = _mm_and_si128(
|
||
// [02461357, 8ACE9BDF] -> [02468ACE, 13579BDF]
|
||
_mm_unpacklo_epi64(tmp1, tmp2),
|
||
_mm_unpackhi_epi64(tmp1, tmp2)
|
||
);
|
||
// we have the common elements between pairs, but it doesn't make sense to process a separate queue if there's only one common element (0 XORs), so eliminate those
|
||
common_mask = _mm_andnot_si128(_mm_cmpeq_epi16(
|
||
_mm_setzero_si128(),
|
||
// "(v & (v-1)) == 0" is true if only zero/one bit is set in each word
|
||
_mm_and_si128(common_mask, _mm_sub_epi16(common_mask, _mm_set1_epi16(1)))
|
||
), common_mask);
|
||
// we now have a common elements mask without 1-bit words, just simply merge stuff in
|
||
depmask1 = _mm_xor_si128(depmask1, _mm_unpacklo_epi16(common_mask, common_mask));
|
||
depmask2 = _mm_xor_si128(depmask2, _mm_unpackhi_epi16(common_mask, common_mask));
|
||
_mm_storeu_si128((__m128i*)common_depmask, common_mask);
|
||
}
|
||
|
||
_mm_storeu_si128((__m128i*)(tmp_depmask), depmask1);
|
||
_mm_storeu_si128((__m128i*)(tmp_depmask + 8), depmask2);
|
||
|
||
// Multi-threading だとスレッドごとに実行領域を分離しないとアクセス違反エラーが発生する
|
||
thread_id = GetCurrentThreadId(); // 自分のスレッド ID を取得する
|
||
for (j = 0; j < MAX_CPU; j++){ // 対応するスレッド個数は MAX_CPU 個まで
|
||
if (jit_id[j] == thread_id)
|
||
break;
|
||
}
|
||
if (j == MAX_CPU){ // 初期状態では jit_code 内は全て 0 なので jit_id も 0 だけ
|
||
for (j = 0; j < MAX_CPU; j++){
|
||
if (InterlockedCompareExchange(jit_id + j, thread_id, 0) == 0) // 0と置き換えたなら
|
||
break;
|
||
}
|
||
}
|
||
jit_exec = jit_code + 4096 * j;
|
||
jit_ptr = jit_exec;
|
||
|
||
#ifdef _WIN64
|
||
_jit_push(&jit_ptr, BP);
|
||
_jit_mov_r(&jit_ptr, BP, SP);
|
||
// align pointer (avoid SP because stuff is encoded differently with it)
|
||
_jit_mov_r(&jit_ptr, AX, SP);
|
||
_jit_and_i(&jit_ptr, AX, 0xF);
|
||
_jit_sub_r(&jit_ptr, BP, AX);
|
||
|
||
// make Windows happy and save XMM6-15 registers
|
||
// ideally should be done by this function, not JIT code, but MSVC has a convenient policy of no inline ASM
|
||
for (i = 6; i < 16; i++)
|
||
_jit_movaps_store(&jit_ptr, BP, -((int32_t)i-5)*16, (uint8_t)i);
|
||
#endif
|
||
|
||
// adding 128 to the destination pointer allows the register offset to be coded in 1 byte
|
||
// eg: 'movdqa xmm0, [rdx+0x90]' is 8 bytes, whilst 'movdqa xmm0, [rdx-0x60]' is 5 bytes
|
||
_jit_mov_i(&jit_ptr, AX, (intptr_t)input + 128);
|
||
_jit_mov_i(&jit_ptr, DX, (intptr_t)output + 128);
|
||
_jit_mov_i(&jit_ptr, CX, (intptr_t)output + bsize + 128);
|
||
|
||
_jit_align32(&jit_ptr);
|
||
pos_startloop = jit_ptr;
|
||
|
||
//_jit_movaps_load(reg, xreg, offs)
|
||
// (we just save a conditional by hardcoding this)
|
||
#define _LD_APS(xreg, mreg, offs) \
|
||
*(int32_t*)(jit_ptr) = 0x40280F + ((xreg) <<19) + ((mreg) <<16) + (((offs)&0xFF) <<24); \
|
||
jit_ptr += 4
|
||
#define _ST_APS(mreg, offs, xreg) \
|
||
*(int32_t*)(jit_ptr) = 0x40290F + ((xreg) <<19) + ((mreg) <<16) + (((offs)&0xFF) <<24); \
|
||
jit_ptr += 4
|
||
#define _LD_APS64(xreg, mreg, offs) \
|
||
*(int64_t*)(jit_ptr) = 0x40280F44 + ((xreg-8) <<27) + ((mreg) <<24) + ((int64_t)((offs)&0xFF) <<32); \
|
||
jit_ptr += 5
|
||
#define _ST_APS64(mreg, offs, xreg) \
|
||
*(int64_t*)(jit_ptr) = 0x40290F44 + ((xreg-8) <<27) + ((mreg) <<24) + ((int64_t)((offs)&0xFF) <<32); \
|
||
jit_ptr += 5
|
||
|
||
#ifdef _WIN64
|
||
#define _LD_DQA(xreg, mreg, offs) \
|
||
*(int64_t*)(jit_ptr) = 0x406F0F66 + ((xreg) <<27) + ((mreg) <<24) + ((int64_t)((offs)&0xFF) <<32); \
|
||
jit_ptr += 5
|
||
#define _ST_DQA(mreg, offs, xreg) \
|
||
*(int64_t*)(jit_ptr) = 0x407F0F66 + ((xreg) <<27) + ((mreg) <<24) + ((int64_t)((offs)&0xFF) <<32); \
|
||
jit_ptr += 5
|
||
#else
|
||
#define _LD_DQA(xreg, mreg, offs) \
|
||
*(int32_t*)(jit_ptr) = 0x406F0F66 + ((xreg) <<27) + ((mreg) <<24); \
|
||
*(jit_ptr +4) = (uint8_t)((offs)&0xFF); \
|
||
jit_ptr += 5
|
||
#define _ST_DQA(mreg, offs, xreg) \
|
||
*(int32_t*)(jit_ptr) = 0x407F0F66 + ((xreg) <<27) + ((mreg) <<24); \
|
||
*(jit_ptr +4) = (uint8_t)((offs)&0xFF); \
|
||
jit_ptr += 5
|
||
#endif
|
||
#define _LD_DQA64(xreg, mreg, offs) \
|
||
*(int64_t*)(jit_ptr) = 0x406F0F4466 + ((int64_t)(xreg-8) <<35) + ((int64_t)(mreg) <<32) + ((int64_t)((offs)&0xFF) <<40); \
|
||
jit_ptr += 6
|
||
#define _ST_DQA64(mreg, offs, xreg) \
|
||
*(int64_t*)(jit_ptr) = 0x407F0F4466 + ((int64_t)(xreg-8) <<35) + ((int64_t)(mreg) <<32) + ((int64_t)((offs)&0xFF) <<40); \
|
||
jit_ptr += 6
|
||
|
||
//_jit_xorps_m(reg, AX, offs<<4);
|
||
#define _XORPS_M_(reg, offs, tr) \
|
||
*(int32_t*)(jit_ptr) = (0x40570F + ((reg) << 19) + (((offs)&0xFF) <<28)) ^ (tr)
|
||
#define _C_XORPS_M(reg, offs, c) \
|
||
_XORPS_M_(reg, offs, 0); \
|
||
jit_ptr += (c)<<2
|
||
#define _XORPS_M64_(reg, offs, tr) \
|
||
*(int64_t*)(jit_ptr) = (0x40570F44 + (((reg)-8) << 27) + ((int64_t)((offs)&0xFF) <<36)) ^ ((tr)<<8)
|
||
#define _C_XORPS_M64(reg, offs, c) \
|
||
_XORPS_M64_(reg, offs, 0); \
|
||
jit_ptr += ((c)<<2)+(c)
|
||
|
||
//_jit_pxor_m(1, AX, offs<<4);
|
||
#ifdef _WIN64
|
||
#define _PXOR_M_(reg, offs, tr) \
|
||
*(int64_t*)(jit_ptr) = (0x40EF0F66 + ((reg) << 27) + ((int64_t)((offs)&0xFF) << 36)) ^ (tr)
|
||
#else
|
||
#define _PXOR_M_(reg, offs, tr) \
|
||
*(int32_t*)(jit_ptr) = (0x40EF0F66 + ((reg) << 27)) ^ (tr); \
|
||
*(jit_ptr +4) = (uint8_t)(((offs)&0xFF) << 4)
|
||
#endif
|
||
#define _PXOR_M(reg, offs) \
|
||
_PXOR_M_(reg, offs, 0); \
|
||
jit_ptr += 5
|
||
#define _C_PXOR_M(reg, offs, c) \
|
||
_PXOR_M_(reg, offs, 0); \
|
||
jit_ptr += ((c)<<2)+(c)
|
||
#define _PXOR_M64_(reg, offs, tr) \
|
||
*(int64_t*)(jit_ptr) = (0x40EF0F4466 + ((int64_t)((reg)-8) << 35) + ((int64_t)((offs)&0xFF) << 44)) ^ ((tr)<<8)
|
||
#define _C_PXOR_M64(reg, offs, c) \
|
||
_PXOR_M64_(reg, offs, 0); \
|
||
jit_ptr += ((c)<<2)+((c)<<1)
|
||
|
||
//_jit_xorps_r(r2, r1)
|
||
#define _XORPS_R_(r2, r1, tr) \
|
||
*(int32_t*)(jit_ptr) = (0xC0570F + ((r2) <<19) + ((r1) <<16)) ^ (tr)
|
||
#define _XORPS_R(r2, r1) \
|
||
_XORPS_R_(r2, r1, 0); \
|
||
jit_ptr += 3
|
||
#define _C_XORPS_R(r2, r1, c) \
|
||
_XORPS_R_(r2, r1, 0); \
|
||
jit_ptr += ((c)<<1)+(c)
|
||
// r2 is always < 8, r1 here is >= 8
|
||
#define _XORPS_R64_(r2, r1, tr) \
|
||
*(int32_t*)(jit_ptr) = (0xC0570F41 + ((r2) <<27) + ((r1) <<24)) ^ ((tr)<<8)
|
||
#define _C_XORPS_R64(r2, r1, c) \
|
||
_XORPS_R64_(r2, r1, 0); \
|
||
jit_ptr += (c)<<2
|
||
|
||
//_jit_pxor_r(r2, r1)
|
||
#define _PXOR_R_(r2, r1, tr) \
|
||
*(int32_t*)(jit_ptr) = (0xC0EF0F66 + ((r2) <<27) + ((r1) <<24)) ^ (tr)
|
||
#define _PXOR_R(r2, r1) \
|
||
_PXOR_R_(r2, r1, 0); \
|
||
jit_ptr += 4
|
||
#define _C_PXOR_R(r2, r1, c) \
|
||
_PXOR_R_(r2, r1, 0); \
|
||
jit_ptr += (c)<<2
|
||
#define _PXOR_R64_(r2, r1, tr) \
|
||
*(int64_t*)(jit_ptr) = (0xC0EF0F4166 + ((int64_t)(r2) <<35) + ((int64_t)(r1) <<32)) ^ (((int64_t)tr)<<8)
|
||
#define _C_PXOR_R64(r2, r1, c) \
|
||
_PXOR_R64_(r2, r1, 0); \
|
||
jit_ptr += ((c)<<2)+(c)
|
||
|
||
// optimised mix of xor/mov operations
|
||
#define _MOV_OR_XOR_FP_M(reg, offs, flag, c) \
|
||
_XORPS_M_(reg, offs, flag); \
|
||
flag &= (c)-1; \
|
||
jit_ptr += (c)<<2
|
||
#define _MOV_OR_XOR_FP_M64(reg, offs, flag, c) \
|
||
_XORPS_M64_(reg, offs, flag); \
|
||
flag &= (c)-1; \
|
||
jit_ptr += ((c)<<2)+(c)
|
||
#define _MOV_OR_XOR_FP_INIT (0x570F ^ 0x280F)
|
||
|
||
#define _MOV_OR_XOR_INT_M(reg, offs, flag, c) \
|
||
_PXOR_M_(reg, offs, flag); \
|
||
flag &= (c)-1; \
|
||
jit_ptr += ((c)<<2)+(c)
|
||
#define _MOV_OR_XOR_INT_M64(reg, offs, flag, c) \
|
||
_PXOR_M64_(reg, offs, flag); \
|
||
flag &= (c)-1; \
|
||
jit_ptr += ((c)<<2)+((c)<<1)
|
||
#define _MOV_OR_XOR_INT_INIT (0xEF0F00 ^ 0x6F0F00)
|
||
|
||
#define _MOV_OR_XOR_R_FP(r2, r1, flag, c) \
|
||
_XORPS_R_(r2, r1, flag); \
|
||
flag &= (c)-1; \
|
||
jit_ptr += ((c)<<1)+(c)
|
||
#define _MOV_OR_XOR_R64_FP(r2, r1, flag, c) \
|
||
_XORPS_R64_(r2, r1, flag); \
|
||
flag &= (c)-1; \
|
||
jit_ptr += (c)<<2
|
||
|
||
#define _MOV_OR_XOR_R_INT(r2, r1, flag, c) \
|
||
_PXOR_R_(r2, r1, flag); \
|
||
flag &= (c)-1; \
|
||
jit_ptr += (c)<<2
|
||
#define _MOV_OR_XOR_R64_INT(r2, r1, flag, c) \
|
||
_PXOR_R64_(r2, r1, flag); \
|
||
flag &= (c)-1; \
|
||
jit_ptr += ((c)<<2)+(c)
|
||
|
||
// generate code
|
||
#ifdef _WIN64
|
||
// preload upper 13 inputs into registers
|
||
#define _XORS_FROM_MEMORY 3
|
||
for (inBit = 3; inBit < 8; inBit++){
|
||
_LD_APS(inBit, AX, (inBit-8)<<4);
|
||
}
|
||
for (; inBit<16; inBit++){
|
||
_LD_APS64(inBit, AX, (inBit-8)<<4);
|
||
}
|
||
#else
|
||
// can only fit 5 in 32-bit mode :(
|
||
#define _XORS_FROM_MEMORY 11
|
||
for (inBit = 3; inBit < 8; inBit++){ // despite appearances, we're actually loading the top 5, not mid 5
|
||
_LD_APS(inBit, AX, inBit<<4);
|
||
}
|
||
#endif
|
||
for (bit = 0; bit < 16; bit += 2){
|
||
int destOffs = (int)((bit<<4)-128);
|
||
FAST_U32 movC = _MOV_OR_XOR_INT_INIT;
|
||
FAST_U16 mask1 = tmp_depmask[bit], mask2 = tmp_depmask[bit+1], maskC = common_depmask[bit>>1];
|
||
_LD_APS(0, DX, destOffs);
|
||
_LD_DQA(1, DX, destOffs+16);
|
||
|
||
for (inBit = -8; inBit < (_XORS_FROM_MEMORY-8); inBit++){
|
||
_MOV_OR_XOR_INT_M(2, inBit, movC, maskC & 1);
|
||
_C_XORPS_M(0, inBit, mask1 & 1);
|
||
_C_PXOR_M(1, inBit, mask2 & 1);
|
||
mask1 >>= 1;
|
||
mask2 >>= 1;
|
||
maskC >>= 1;
|
||
}
|
||
// at least 5 can come from registers
|
||
for (inBit = 3; inBit < 8; inBit++){
|
||
_MOV_OR_XOR_R_INT(2, inBit, (int32_t)movC, maskC & 1);
|
||
_C_XORPS_R(0, inBit, mask1 & 1);
|
||
_C_PXOR_R(1, inBit, mask2 & 1);
|
||
mask1 >>= 1;
|
||
mask2 >>= 1;
|
||
maskC >>= 1;
|
||
}
|
||
#ifdef _WIN64
|
||
// more XORs can come from 64-bit registers
|
||
for (inBit = 0; inBit < 8; inBit++){
|
||
_MOV_OR_XOR_R64_INT(2, inBit, movC, maskC & 1);
|
||
_C_XORPS_R64(0, inBit, mask1 & 1);
|
||
_C_PXOR_R64(1, inBit, mask2 & 1);
|
||
mask1 >>= 1;
|
||
mask2 >>= 1;
|
||
maskC >>= 1;
|
||
}
|
||
#endif
|
||
if (!movC){
|
||
_XORPS_R(0, 2);
|
||
_PXOR_R(1, 2); // penalty?
|
||
}
|
||
_ST_APS(DX, destOffs, 0);
|
||
_ST_DQA(DX, destOffs+16, 1);
|
||
}
|
||
#undef _XORS_FROM_MEMORY
|
||
|
||
_jit_add_i(&jit_ptr, AX, 256);
|
||
_jit_add_i(&jit_ptr, DX, 256);
|
||
|
||
_jit_cmp_r(&jit_ptr, DX, CX);
|
||
_jit_jcc(&jit_ptr, JL, pos_startloop);
|
||
|
||
#ifdef _WIN64
|
||
for (i = 6; i < 16; i++)
|
||
_jit_movaps_load(&jit_ptr, (uint8_t)i, BP, -((int32_t)i-5)*16);
|
||
_jit_pop(&jit_ptr, BP);
|
||
#endif
|
||
|
||
_jit_ret(&jit_ptr);
|
||
|
||
// exec
|
||
(*(void(*)(void))jit_exec)();
|
||
|
||
}
|
||
|
||
/*
|
||
XOR method is based on ParPar.
|
||
https://github.com/animetosho/ParPar/
|
||
*/
|
||
|
||
// Re-arrange data in place, 256 bytes at a time, into bit planes so that the
// multiplication can be done with bitwise XORs.
//
// A 256-byte block holds 128 little-endian 16-bit words. After the conversion
// it holds 16 planes of 16 bytes (8 shorts each): plane (15 - b) collects
// bit b of all 128 words. Inside a plane, short j covers words 16*j .. 16*j+15,
// and word 16*j + r is found at bit position (r ^ 8) of that short.
//
// data  : 16-byte aligned buffer (aligned loads are used)
// bsize : size in bytes; only whole 256-byte blocks are converted,
//         a shorter tail is left untouched
void galois_altmap256_change(unsigned char *data, unsigned int bsize)
{
	unsigned short dtmp[128];	// converted block; needed because every plane depends on the whole block
	int i, j;
	__m128i ta, tb, lmask, th, tl;

	lmask = _mm_set1_epi16(0xff);

	// ">= 256" instead of "!= 0": a size that is not a multiple of 256 must not
	// wrap the unsigned counter and run past the end of the buffer.
	while (bsize >= 256){
		for (j = 0; j < 8; j++){
			// read 32 bytes = 16 words
			ta = _mm_load_si128((__m128i *)data + j*2);
			tb = _mm_load_si128((__m128i *)data + j*2 + 1);

			// split to high/low parts
			th = _mm_packus_epi16(_mm_srli_epi16(tb, 8), _mm_srli_epi16(ta, 8));
			tl = _mm_packus_epi16(_mm_and_si128(tb, lmask), _mm_and_si128(ta, lmask));

			// Save to dest by extracting 16-bit masks: movemask collects the top
			// bit of every byte, then a shift exposes the next lower bit.
			// (A byte shift would be nicer, but the bits that cross a byte border
			// in the 16-bit shift are never sampled within these 8 steps.)
			for (i = 0; i < 8; i++){
				dtmp[i*8 + j] = (unsigned short)_mm_movemask_epi8(th);		// bit 15 - i
				dtmp[64 + i*8 + j] = (unsigned short)_mm_movemask_epi8(tl);	// bit 7 - i
				th = _mm_slli_epi16(th, 1);
				tl = _mm_slli_epi16(tl, 1);
			}
		}
		// we only really need to copy temp -> dest
		memcpy(data, dtmp, 256);
		data += 256;
		bsize -= 256;
	}
}
|
||
|
||
// Read one 16-bit value from a byte pointer.
// (Only 2 bytes are touched, so reading the last short of a buffer is safe.)
static int altmap256_load16(const unsigned char *p)
{
	unsigned short v;

	memcpy(&v, p, sizeof v);
	return v;
}

// Restore the original data order in place (inverse of galois_altmap256_change).
//
// Every 256-byte block holding 16 bit planes of 16 bytes (plane (15 - b) =
// bit b of all 128 words) is rewritten as 128 little-endian 16-bit words.
//
// data  : buffer to convert (no aligned vector access is done here)
// bsize : size in bytes; only whole 256-byte blocks are converted,
//         a shorter tail is left untouched
void galois_altmap256_return(unsigned char *data, unsigned int bsize)
{
	unsigned short dtmp[128];	// restored block
	int i, j;
	__m128i ta, tb, lmask, th, tl;

	th = _mm_setzero_si128(); // shut up compiler warning
	tl = _mm_setzero_si128();
	lmask = _mm_set1_epi16(0xff);

	// ">= 256" instead of "!= 0": a size that is not a multiple of 256 must not
	// wrap the unsigned counter and run past the end of the buffer.
	while (bsize >= 256){
		for (j = 0; j < 8; j++){
			// short j of every plane describes words 16*j .. 16*j+15
			const unsigned char *src = data + j*2;

			// load in pattern: [0011223344556677] [8899AABBCCDDEEFF]
			// (tl lane n = plane of bit n, th lane n = plane of bit 8+n).
			// Exactly 16 bits are read per plane: for j == 7 a wider read
			// at offset 240 would reach beyond the 256-byte block.
			tl = _mm_insert_epi16(tl, altmap256_load16(src + 240), 0);
			th = _mm_insert_epi16(th, altmap256_load16(src + 112), 0);
			tl = _mm_insert_epi16(tl, altmap256_load16(src + 224), 1);
			th = _mm_insert_epi16(th, altmap256_load16(src +  96), 1);
			tl = _mm_insert_epi16(tl, altmap256_load16(src + 208), 2);
			th = _mm_insert_epi16(th, altmap256_load16(src +  80), 2);
			tl = _mm_insert_epi16(tl, altmap256_load16(src + 192), 3);
			th = _mm_insert_epi16(th, altmap256_load16(src +  64), 3);
			tl = _mm_insert_epi16(tl, altmap256_load16(src + 176), 4);
			th = _mm_insert_epi16(th, altmap256_load16(src +  48), 4);
			tl = _mm_insert_epi16(tl, altmap256_load16(src + 160), 5);
			th = _mm_insert_epi16(th, altmap256_load16(src +  32), 5);
			tl = _mm_insert_epi16(tl, altmap256_load16(src + 144), 6);
			th = _mm_insert_epi16(th, altmap256_load16(src +  16), 6);
			tl = _mm_insert_epi16(tl, altmap256_load16(src + 128), 7);
			th = _mm_insert_epi16(th, altmap256_load16(src), 7);

			// swizzle to [0123456789ABCDEF] [0123456789ABCDEF]
			// ta: byte n = bit n of words 0..7 of this group, tb: same for words 8..15
			ta = _mm_packus_epi16(_mm_srli_epi16(tl, 8), _mm_srli_epi16(th, 8));
			tb = _mm_packus_epi16(_mm_and_si128(tl, lmask), _mm_and_si128(th, lmask));

			// extract top bits: every movemask yields one complete word,
			// starting with the last word of each half and going backwards
			for (i = 0; i < 8; i++){
				dtmp[j*16 + 7-i] = (unsigned short)_mm_movemask_epi8(ta);
				dtmp[j*16 + 15-i] = (unsigned short)_mm_movemask_epi8(tb);
				ta = _mm_slli_epi16(ta, 1);
				tb = _mm_slli_epi16(tb, 1);
			}
		}

		// we only really need to copy temp -> dest
		memcpy(data, dtmp, 256);
		data += 256;
		bsize -= 256;
	}
}
|
||
|
||
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
|
||
|
||
// 複数のテーブルを使って掛け算する
|
||
void galois_region_multiply(
|
||
unsigned short *r1, // Region to multiply
|
||
unsigned short *r2, // Products go here
|
||
unsigned int count, // Count of number in short
|
||
int factor) // Number to multiply by
|
||
{
|
||
if (factor <= 1){
|
||
if (factor == 0)
|
||
return;
|
||
|
||
// アドレスが 4の倍数で無い場合は 4バイト単位で計算する効率が落ちる
|
||
if ((ULONG_PTR)r2 & 2){
|
||
// そこで最初の 1個(2バイト)だけ普通に計算する
|
||
*r2 ^= *r1;
|
||
r1++;
|
||
r2++;
|
||
count--;
|
||
}
|
||
|
||
while (count >= 2){ // 2個(4バイト)ずつ計算する
|
||
((unsigned int *)r2)[0] ^= ((unsigned int *)r1)[0];
|
||
r1 += 2;
|
||
r2 += 2;
|
||
count -= 2;
|
||
}
|
||
if (count == 1) // 最後に余ったやつだけ普通に計算する
|
||
*r2 ^= *r1;
|
||
return;
|
||
}
|
||
|
||
if (count >= 64){ // 64バイト以上なら掛け算用のテーブルを使った方が速い
|
||
#ifndef NO_SIMD
|
||
if (cpu_flag & 16){ // AVX2 対応なら
|
||
__declspec( align(32) ) unsigned char small_table[128];
|
||
int s, d;
|
||
|
||
create_eight_table_avx2(small_table, factor);
|
||
|
||
// アドレスが 32の倍数で無い場合は 32バイト単位で計算する効率が落ちる
|
||
while ((ULONG_PTR)r2 & 0x1E){
|
||
// そこで最初の 1~15個(2~30バイト)だけ普通に計算する
|
||
s = r1[0];
|
||
d = r2[0];
|
||
d ^= small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8);
|
||
r2[0] = (unsigned short)d;
|
||
r1++;
|
||
r2++;
|
||
count--;
|
||
}
|
||
|
||
// 16個ずつ計算するので 16の倍数にする
|
||
gf16_avx2_block32u((unsigned char *)r1, (unsigned char *)r2,
|
||
(count & 0xFFFFFFF0) << 1, small_table);
|
||
r1 += count & 0xFFFFFFF0;
|
||
r2 += count & 0xFFFFFFF0;
|
||
count &= 15;
|
||
|
||
// 残りは 1個ずつ計算する
|
||
while (count != 0){
|
||
s = r1[0];
|
||
d = r2[0];
|
||
d ^= small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8);
|
||
r2[0] = (unsigned short)d;
|
||
r1++;
|
||
r2++;
|
||
count--;
|
||
}
|
||
|
||
} else if (cpu_flag & 1){ // SSSE3 対応なら
|
||
__declspec( align(16) ) unsigned char small_table[128];
|
||
int s, d;
|
||
|
||
create_eight_table(small_table, factor);
|
||
|
||
// アドレスが 16の倍数で無い場合は 16バイト単位で計算する効率が落ちる
|
||
while ((ULONG_PTR)r2 & 0xE){
|
||
// そこで最初の 1~7個(2~14バイト)だけ普通に計算する
|
||
s = r1[0];
|
||
d = r2[0];
|
||
d ^= small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8);
|
||
r2[0] = (unsigned short)d;
|
||
r1++;
|
||
r2++;
|
||
count--;
|
||
}
|
||
|
||
if (sse_unit == 16){
|
||
// 8個ずつ計算するので 8の倍数にする
|
||
gf16_ssse3_block16u((unsigned char *)r1, (unsigned char *)r2,
|
||
(count & 0xFFFFFFF8) << 1, small_table);
|
||
r1 += count & 0xFFFFFFF8;
|
||
r2 += count & 0xFFFFFFF8;
|
||
count &= 7;
|
||
} else {
|
||
// 16個ずつ計算するので 16の倍数にする
|
||
gf16_ssse3_block32u((unsigned char *)r1, (unsigned char *)r2,
|
||
(count & 0xFFFFFFF0) << 1, small_table);
|
||
r1 += count & 0xFFFFFFF0;
|
||
r2 += count & 0xFFFFFFF0;
|
||
count &= 15;
|
||
}
|
||
|
||
// 残りは 1個ずつ計算する
|
||
while (count != 0){
|
||
s = r1[0];
|
||
d = r2[0];
|
||
d ^= small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8);
|
||
r2[0] = (unsigned short)d;
|
||
r1++;
|
||
r2++;
|
||
count--;
|
||
}
|
||
|
||
} else { // Combined Multi Table support (2 tables of 256-entries)
|
||
#endif
|
||
unsigned int mtab[256 * 2];
|
||
|
||
create_two_table(mtab, factor); // build combined multiplication tables
|
||
|
||
// アドレスが 8の倍数で無い場合は 8バイト単位で計算する効率が落ちる
|
||
while ((ULONG_PTR)r2 & 6){
|
||
// そこで最初の 1~3個(2~6バイト)だけ普通に計算する
|
||
r2[0] ^= mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]];
|
||
r1++;
|
||
r2++;
|
||
count--;
|
||
}
|
||
|
||
#ifndef _WIN64 // 32-bit 版なら MMX を使う
|
||
#ifndef NO_SIMD
|
||
// 4個(8バイト)ずつ計算するので 4の倍数にする
|
||
DoBlock8((unsigned char *)r1, (unsigned char *)r2, (count & 0xFFFFFFFC) << 1, mtab);
|
||
r1 += count & 0xFFFFFFFC;
|
||
r2 += count & 0xFFFFFFFC;
|
||
count &= 3;
|
||
|
||
#else // MMX を使わないなら
|
||
// バッファーを 32-bit整数として扱う
|
||
while (count >= 2){ // 2個(4バイト)ずつ計算する
|
||
// 先に計算しておいた 2個の参照テーブルを使う
|
||
((unsigned int *)r2)[0] ^= mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]] ^
|
||
((mtab[((unsigned char *)r1)[2]] ^ mtab[256 + ((unsigned char *)r1)[3]]) << 16);
|
||
r1 += 2;
|
||
r2 += 2;
|
||
count -= 2;
|
||
}
|
||
#endif
|
||
|
||
#else // 64-bit 版なら 64-bit 整数を使う
|
||
// バッファーを 64-bit整数として扱う
|
||
while (count >= 4){ // 4個(8バイト)ずつ計算する
|
||
// 先に計算しておいた 2個の参照テーブルを使う
|
||
((unsigned __int64 *)r2)[0] ^=
|
||
mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]] ^
|
||
((mtab[((unsigned char *)r1)[2]] ^ mtab[256 + ((unsigned char *)r1)[3]]) << 16) ^
|
||
((unsigned __int64)(mtab[((unsigned char *)r1)[4]] ^ mtab[256 + ((unsigned char *)r1)[5]]) << 32) ^
|
||
((unsigned __int64)(mtab[((unsigned char *)r1)[6]] ^ mtab[256 + ((unsigned char *)r1)[7]]) << 48);
|
||
r1 += 4;
|
||
r2 += 4;
|
||
count -= 4;
|
||
}
|
||
#endif
|
||
|
||
// 残りは 1個ずつ計算する
|
||
while (count != 0){
|
||
r2[0] ^= mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]];
|
||
r1++;
|
||
r2++;
|
||
count--;
|
||
}
|
||
#ifndef NO_SIMD
|
||
}
|
||
#endif
|
||
|
||
} else { // 小さいデータは普通に計算する
|
||
int log_y = galois_log_table[factor];
|
||
|
||
while (count != 0){
|
||
r2[0] ^= galois_multiply_fix(r1[0], log_y);
|
||
r1++;
|
||
r2++;
|
||
count--;
|
||
}
|
||
}
|
||
}
|
||
|
||
// 行列の割り算用、微妙に速いけど大差無し
|
||
void galois_region_divide(
|
||
unsigned short *r1, // Region to divide. products go here
|
||
unsigned int count, // Count of number in short
|
||
int factor) // Number to divide by
|
||
{
|
||
factor = galois_reciprocal(factor); // factor = 1 / factor
|
||
|
||
if (count >= 64){
|
||
// 行列サイズが小さいのでテーブル作成に時間がかかって、全く速くならない・・・
|
||
/*
|
||
#ifndef NO_SIMD
|
||
if (cpu_flag & 16){ // AVX2 対応なら
|
||
__declspec( align(32) ) unsigned char small_table[128];
|
||
int s, d;
|
||
|
||
create_eight_table_avx2(small_table, factor);
|
||
|
||
// アドレスが 32の倍数で無い場合は 32バイト単位で計算する効率が落ちる
|
||
while ((ULONG_PTR)r1 & 0x1E){
|
||
// そこで最初の 1~15個(2~30バイト)だけ普通に計算する
|
||
s = r1[0];
|
||
d = small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8);
|
||
r1[0] = (unsigned short)d;
|
||
r1++;
|
||
count--;
|
||
}
|
||
|
||
// 16個ずつ計算するので 16の倍数にする
|
||
gf16_avx2_block32s((unsigned char *)r1, (count & 0xFFFFFFF0) << 1, small_table);
|
||
r1 += count & 0xFFFFFFF0;
|
||
count &= 15;
|
||
|
||
// 残りは 1個ずつ計算する
|
||
while (count != 0){
|
||
s = r1[0];
|
||
d = small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8);
|
||
r1[0] = (unsigned short)d;
|
||
r1++;
|
||
count--;
|
||
}
|
||
|
||
} else if (cpu_flag & 1){ // SSSE3 対応なら
|
||
__declspec( align(16) ) unsigned char small_table[128];
|
||
int s, d;
|
||
|
||
create_eight_table(small_table, factor);
|
||
|
||
// アドレスが 16の倍数で無い場合は 16バイト単位で計算する効率が落ちる
|
||
while ((ULONG_PTR)r1 & 0xE){
|
||
// そこで最初の 1~7個(2~14バイト)だけ普通に計算する
|
||
s = r1[0];
|
||
d = small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8);
|
||
r1[0] = (unsigned short)d;
|
||
r1++;
|
||
count--;
|
||
}
|
||
|
||
// 8個ずつ計算するので 8の倍数にする
|
||
gf16_ssse3_block16s((unsigned char *)r1, (count & 0xFFFFFFF8) << 1, small_table);
|
||
r1 += count & 0xFFFFFFF8;
|
||
count &= 7;
|
||
|
||
// 残りは 1個ずつ計算する
|
||
while (count != 0){
|
||
s = r1[0];
|
||
d = small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8);
|
||
s = s >> 4;
|
||
d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8);
|
||
r1[0] = (unsigned short)d;
|
||
r1++;
|
||
count--;
|
||
}
|
||
|
||
} else { // Combined Multi Table support (2 tables of 256-entries)
|
||
#endif
|
||
*/
|
||
unsigned int mtab[256 * 2];
|
||
|
||
create_two_table(mtab, factor); // 掛け算用のテーブルをその場で構成する
|
||
|
||
// アドレスが 4の倍数で無い場合は 4バイト単位で計算する効率が落ちる
|
||
if (((ULONG_PTR)r1 & 2) != 0){
|
||
// そこで最初の 1個(2バイト)だけ普通に計算する
|
||
r1[0] = (unsigned short)(mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]]);
|
||
r1++;
|
||
count--;
|
||
}
|
||
|
||
// バッファーを 32-bit整数として扱う
|
||
while (count >= 2){ // 2個(4バイト)ずつ計算する
|
||
// 先に計算しておいた 2個の参照テーブルを使う
|
||
((unsigned int *)r1)[0] = mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]] ^
|
||
((mtab[((unsigned char *)r1)[2]] ^ mtab[256 + ((unsigned char *)r1)[3]]) << 16);
|
||
r1 += 2;
|
||
count -= 2;
|
||
}
|
||
// 奇数なら最後に 1個余る
|
||
if (count == 1)
|
||
r1[0] = (unsigned short)(mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]]);
|
||
/*
|
||
#ifndef NO_SIMD
|
||
}
|
||
#endif
|
||
*/
|
||
|
||
} else { // 小さいデータは普通に計算する
|
||
int log_y = galois_log_table[factor];
|
||
|
||
while (count != 0){
|
||
r1[0] = galois_multiply_fix(r1[0], log_y);
|
||
r1++;
|
||
count--;
|
||
}
|
||
}
|
||
}
|
||
|
||
// XOR one buffer into another: r2[i] ^= r1[i] for len bytes.
// Intended for 16-byte aligned buffers; the SSE2 path uses aligned loads and
// stores, so both pointers must be 16-byte aligned.
// len is normally a multiple of 16. Any remaining bytes are handled one at a
// time, so an unexpected length can no longer wrap the counter and run past
// the end of the buffers.
void galois_align_xor(
	unsigned char *r1,	// Source region
	unsigned char *r2,	// Destination region, updated in place
	unsigned int len)	// Byte length
{
#ifndef NO_SIMD
	__m128i xmm0, xmm1;

	while (len >= 16){	// XOR 16 bytes per step
		xmm0 = _mm_load_si128((__m128i *)r1);
		xmm1 = _mm_load_si128((__m128i *)r2);
		xmm1 = _mm_xor_si128(xmm1, xmm0);
		_mm_store_si128((__m128i *)r2, xmm1);
		r1 += 16;
		r2 += 16;
		len -= 16;
	}

#else	// without SSE2
	while (len >= 4){	// XOR 4 bytes per step
		((unsigned int *)r2)[0] ^= ((unsigned int *)r1)[0];
		r1 += 4;
		r2 += 4;
		len -= 4;
	}
#endif

	// Leftover bytes (none when len is a multiple of 16).
	while (len != 0){
		r2[0] ^= r1[0];
		r1++;
		r2++;
		len--;
	}
}
|
||
|
||
// 16バイト境界のバッファー専用の掛け算 (ALTMAP しない)
|
||
void galois_align16_multiply(
|
||
unsigned char *r1, // Region to multiply (must be aligned by 16)
|
||
unsigned char *r2, // Products go here
|
||
unsigned int len, // Byte length (must be multiple of 16)
|
||
int factor) // Number to multiply by
|
||
{
|
||
if (factor <= 1){
|
||
if (factor == 0)
|
||
return;
|
||
|
||
#ifndef NO_SIMD
|
||
{ __m128i xmm0, xmm1; // 16バイトごとに XOR する
|
||
while (len != 0){
|
||
xmm0 = _mm_load_si128((__m128i *)r1);
|
||
xmm1 = _mm_load_si128((__m128i *)r2);
|
||
xmm1 = _mm_xor_si128(xmm1, xmm0);
|
||
_mm_store_si128((__m128i *)r2, xmm1);
|
||
r1 += 16;
|
||
r2 += 16;
|
||
len -= 16;
|
||
}
|
||
}
|
||
#else // SSE2 を使わないなら
|
||
while (len != 0){ // 4バイトずつ計算する
|
||
((unsigned int *)r2)[0] ^= ((unsigned int *)r1)[0];
|
||
r1 += 4;
|
||
r2 += 4;
|
||
len -= 4;
|
||
}
|
||
#endif
|
||
|
||
// 掛け算用のテーブルを常に作成する (32バイトだと少し遅くなる)
|
||
#ifndef NO_SIMD
|
||
/*
|
||
// sse_unit が 32の倍数な時だけ
|
||
} else if (cpu_flag & 16){ // AVX2 対応なら
|
||
__declspec( align(32) ) unsigned char small_table[128];
|
||
|
||
create_eight_table_avx2(small_table, factor);
|
||
|
||
gf16_avx2_block32u(r1, r2, len, small_table);
|
||
*/
|
||
|
||
} else if (cpu_flag & 1){ // SSSE3 対応なら
|
||
__declspec( align(16) ) unsigned char small_table[128];
|
||
|
||
create_eight_table(small_table, factor);
|
||
|
||
gf16_ssse3_block16u(r1, r2, len, small_table);
|
||
// sse_unit が 32の倍数ならこちらでもいい
|
||
//gf16_ssse3_block32u(r1, r2, len, small_table);
|
||
|
||
#endif
|
||
} else { // Combined Multi Table support (2 tables of 256-entries)
|
||
unsigned int mtab[256 * 2];
|
||
|
||
create_two_table(mtab, factor); // build combined multiplication tables
|
||
|
||
#ifndef _WIN64 // 32-bit 版なら MMX を使う
|
||
#ifndef NO_SIMD
|
||
DoBlock8(r1, r2, len, mtab); // process large chunk 8-bytes a shot
|
||
|
||
#else // MMX を使わないなら
|
||
// バッファーを 32-bit整数として扱う
|
||
while (len != 0){ // 4バイトずつ計算する
|
||
((unsigned int *)r2)[0] ^= mtab[r1[0]] ^ mtab[256 + r1[1]] ^
|
||
((mtab[r1[2]] ^ mtab[256 + r1[3]]) << 16);
|
||
r1 += 4;
|
||
r2 += 4;
|
||
len -= 4;
|
||
}
|
||
#endif
|
||
|
||
#else // 64-bit 版なら 64-bit 整数を使う
|
||
// バッファーを 64-bit整数として扱う
|
||
while (len != 0){ // 8バイトずつ計算する
|
||
((unsigned __int64 *)r2)[0] ^=
|
||
mtab[r1[0]] ^ mtab[256 + r1[1]] ^
|
||
((mtab[r1[2]] ^ mtab[256 + r1[3]]) << 16) ^
|
||
((unsigned __int64)(mtab[r1[4]] ^ mtab[256 + r1[5]]) << 32) ^
|
||
((unsigned __int64)(mtab[r1[6]] ^ mtab[256 + r1[7]]) << 48);
|
||
r1 += 8;
|
||
r2 += 8;
|
||
len -= 8;
|
||
}
|
||
#endif
|
||
}
|
||
}
|
||
|
||
// 32バイトごとに並び替えられたバッファー専用の掛け算 (SSSE3 & ALTMAP)
|
||
void galois_align32_multiply(
|
||
unsigned char *r1, // Region to multiply (must be aligned by 16)
|
||
unsigned char *r2, // Products go here
|
||
unsigned int len, // Byte length (must be multiple of 32)
|
||
int factor) // Number to multiply by
|
||
{
|
||
if (factor <= 1){
|
||
if (factor != 0){
|
||
__m128i xmm0, xmm1; // 16バイトごとに XOR する
|
||
|
||
while (len != 0){
|
||
xmm0 = _mm_load_si128((__m128i *)r1);
|
||
xmm1 = _mm_load_si128((__m128i *)r2);
|
||
xmm1 = _mm_xor_si128(xmm1, xmm0);
|
||
_mm_store_si128((__m128i *)r2, xmm1);
|
||
r1 += 16;
|
||
r2 += 16;
|
||
len -= 16;
|
||
}
|
||
}
|
||
|
||
// 掛け算用のテーブルを常に作成する (32バイトだと少し遅くなる)
|
||
} else {
|
||
__declspec( align(16) ) unsigned char small_table[128];
|
||
|
||
create_eight_table(small_table, factor);
|
||
|
||
gf16_ssse3_block32_altmap(r1, r2, len, small_table);
|
||
}
|
||
}
|
||
|
||
// 掛け算を2回行って、一度に更新する (SSSE3 & ALTMAP)
|
||
void galois_align32_multiply2(
|
||
unsigned char *src1, // Region to multiply (must be aligned by 16)
|
||
unsigned char *src2,
|
||
unsigned char *dst, // Products go here
|
||
unsigned int len, // Byte length (must be multiple of 32)
|
||
int factor1, // Number to multiply by
|
||
int factor2)
|
||
{
|
||
if ((factor1 == 1) && (factor2 == 1)){ // 両方の factor が 1の場合
|
||
__m128i xmm0, xmm1, xmm2;
|
||
|
||
while (len != 0){
|
||
xmm0 = _mm_load_si128((__m128i *)dst);
|
||
xmm1 = _mm_load_si128((__m128i *)src1);
|
||
xmm2 = _mm_load_si128((__m128i *)src2);
|
||
xmm0 = _mm_xor_si128(xmm0, xmm1);
|
||
xmm0 = _mm_xor_si128(xmm0, xmm2);
|
||
_mm_store_si128((__m128i *)dst, xmm0);
|
||
src1 += 16;
|
||
src2 += 16;
|
||
dst += 16;
|
||
len -= 16;
|
||
}
|
||
|
||
// 掛け算用のテーブルを常に作成する (32バイトだと少し遅くなる)
|
||
} else {
|
||
__declspec( align(16) ) unsigned char small_table[256];
|
||
|
||
create_eight_table(small_table, factor1);
|
||
create_eight_table(small_table + 128, factor2);
|
||
|
||
gf16_ssse3_block32_altmap2(src1, src2, dst, len, small_table);
|
||
}
|
||
}
|
||
|
||
// Multiply-and-accumulate for buffers rearranged in 256-byte units, using
// the JIT (SSE2) routine: the product of r1 and factor is XORed into r2.
// NOTE(review): the length comment below says "multiple of 32" although the
// layout works in 256-byte units; confirm the real requirement with callers.
void galois_align256_multiply(
	unsigned char *r1,	// Region to multiply (must be aligned by 16)
	unsigned char *r2,	// Products go here
	unsigned int len,	// Byte length (must be multiple of 32)
	int factor)		// Number to multiply by
{
	if (factor <= 1){
		// Factor 0 adds nothing; otherwise the source is XORed in as is.
		if (factor != 0)
			galois_align_xor(r1, r2, len);
		return;
	}

	// Every other factor always goes through the JIT (SSE2) code.
	gf16_sse2_block256_jit(r1, r2, len, factor);
}
|
||
|
||
// 32バイトごとに並び替えられたバッファー専用の掛け算 (AVX2 & ALTMAP)
|
||
void galois_align32avx_multiply(
|
||
unsigned char *r1, // Region to multiply (must be aligned by 32)
|
||
unsigned char *r2, // Products go here
|
||
unsigned int len, // Byte length (must be multiple of 32)
|
||
int factor) // Number to multiply by
|
||
{
|
||
if (factor <= 1){
|
||
if (factor != 0){
|
||
__m256i ymm0, ymm1; // 32バイトごとに XOR する
|
||
|
||
while (len != 0){
|
||
ymm0 = _mm256_load_si256((__m256i *)r1);
|
||
ymm1 = _mm256_load_si256((__m256i *)r2);
|
||
ymm1 = _mm256_xor_si256(ymm1, ymm0);
|
||
_mm256_store_si256((__m256i *)r2, ymm1);
|
||
r1 += 32;
|
||
r2 += 32;
|
||
len -= 32;
|
||
}
|
||
|
||
_mm256_zeroupper(); // AVX-SSE 切り替えの回避
|
||
}
|
||
|
||
// 掛け算用のテーブルを常に作成する (32バイトだと少し遅くなる)
|
||
} else {
|
||
__declspec( align(32) ) unsigned char small_table[128];
|
||
|
||
create_eight_table_avx2(small_table, factor);
|
||
|
||
gf16_avx2_block32(r1, r2, len, small_table);
|
||
}
|
||
}
|
||
|
||
// 掛け算を2回行って、一度に更新する (AVX2 & ALTMAP)
|
||
void galois_align32avx_multiply2(
|
||
unsigned char *src1, // Region to multiply (must be aligned by 32)
|
||
unsigned char *src2,
|
||
unsigned char *dst, // Products go here
|
||
unsigned int len, // Byte length (must be multiple of 32)
|
||
int factor1, // Number to multiply by
|
||
int factor2)
|
||
{
|
||
if ((factor1 == 1) && (factor2 == 1)){ // 両方の factor が 1の場合
|
||
__m256i ymm0, ymm1, ymm2;
|
||
while (len != 0){
|
||
ymm0 = _mm256_load_si256((__m256i *)dst);
|
||
ymm1 = _mm256_load_si256((__m256i *)src1);
|
||
ymm2 = _mm256_load_si256((__m256i *)src2);
|
||
ymm0 = _mm256_xor_si256(ymm0, ymm1);
|
||
ymm0 = _mm256_xor_si256(ymm0, ymm2);
|
||
_mm256_store_si256((__m256i *)dst, ymm0);
|
||
src1 += 32;
|
||
src2 += 32;
|
||
dst += 32;
|
||
len -= 32;
|
||
}
|
||
_mm256_zeroupper(); // AVX-SSE 切り替えの回避
|
||
|
||
// 掛け算用のテーブルを常に作成する (32バイトだと少し遅くなる)
|
||
} else {
|
||
__declspec( align(32) ) unsigned char small_table[256];
|
||
|
||
create_eight_table_avx2(small_table, factor1);
|
||
create_eight_table_avx2(small_table + 128, factor2);
|
||
|
||
gf16_avx2_block32_2(src1, src2, dst, len, small_table);
|
||
}
|
||
}
|
||
|
||
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
|
||
// チェックサムを計算する
|
||
|
||
// buffer alignment must be 16, length must be multiple of 16
|
||
void checksum16(unsigned char *data, unsigned char *hash, int byte_size)
|
||
{
|
||
int i, count;
|
||
|
||
#ifndef NO_SIMD // SSE2 を使うなら
|
||
__m128i temp16, prev16, data16, mask16, zero16, poly16;
|
||
|
||
count = byte_size / 16;
|
||
prev16 = _mm_setzero_si128();
|
||
zero16 = _mm_setzero_si128();
|
||
poly16 = _mm_set1_epi32(0x100B100B); // PRIM_POLY = 0x1100B
|
||
|
||
while (count > 0){ // HASH_RANGE バイトごとに
|
||
// 16バイトごとに XOR する
|
||
temp16 = _mm_setzero_si128();
|
||
|
||
if (count < HASH_RANGE / 16){
|
||
i = count;
|
||
count = 0;
|
||
} else {
|
||
i = HASH_RANGE / 16;
|
||
count -= HASH_RANGE / 16;
|
||
}
|
||
while (i > 0){
|
||
data16 = _mm_load_si128((__m128i *)data); // load 16-bytes
|
||
temp16 = _mm_xor_si128(temp16, data16);
|
||
data += 16;
|
||
i--;
|
||
}
|
||
|
||
// 前回の値を 2倍して、今回の値を追加する
|
||
//temp16 = _mm_xor_si128(temp16, prev16); // 3倍する場合は、元の値も XOR すればいい
|
||
mask16 = _mm_cmpgt_epi16(zero16, prev16); // (0 > prev) ? 0xFFFF : 0x0000
|
||
prev16 = _mm_slli_epi16(prev16, 1); // prev *= 2
|
||
mask16 = _mm_and_si128(mask16, poly16); // 0x100B or 0x0000
|
||
prev16 = _mm_xor_si128(prev16, mask16);
|
||
|
||
prev16 = _mm_xor_si128(prev16, temp16);
|
||
}
|
||
_mm_store_si128((__m128i *)hash, prev16);
|
||
|
||
#else // 4バイト整数で計算するなら
|
||
unsigned int *data4, temp[4], prev[4], x, mask;
|
||
|
||
data4 = (unsigned int *)data;
|
||
count = byte_size / 4;
|
||
for (i = 0; i < 4; i++)
|
||
prev[i] = 0;
|
||
|
||
while (count > 0){ // HASH_RANGE バイトごとに
|
||
// 4バイトごとに 16バイトに XOR する
|
||
for (i = 0; i < 4; i++)
|
||
temp[i] = 0;
|
||
for (i = 0; i < HASH_RANGE / 4; i++){
|
||
temp[i & 3] ^= data4[i];
|
||
count--;
|
||
if (count == 0)
|
||
break;
|
||
}
|
||
data4 += HASH_RANGE / 4;
|
||
|
||
// 前回の値を 2倍して、今回の値を追加する
|
||
for (i = 0; i < 4; i++){
|
||
x = prev[i];
|
||
mask = (x & 0x80008000) >> 15; // 0x00010001 or 0x00000000
|
||
x = (x & 0x7FFF7FFF) << 1; // 3倍する場合は「^=」にすればいい
|
||
x ^= mask * 0x100B; // 0x100B100B or 0x00000000
|
||
|
||
prev[i] = x ^ temp[i];
|
||
}
|
||
}
|
||
for (i = 0; i < 4; i++)
|
||
((unsigned int *)hash)[i] = prev[i];
|
||
#endif
|
||
}
|
||
|
||
// チェックサムを計算すると同時にデータを並び替える
|
||
// buffer alignment must be 16, length must be (multiple of 32) - 16
|
||
void checksum16_altmap32(unsigned char *data, unsigned char *hash, int byte_size)
|
||
{
|
||
int i, count;
|
||
__m128i temp16, prev16, mask16, zero16, poly16;
|
||
__m128i dataA, dataB, dataC, dataD, maskB;
|
||
|
||
count = byte_size / 16;
|
||
prev16 = _mm_setzero_si128();
|
||
zero16 = _mm_setzero_si128();
|
||
poly16 = _mm_set1_epi32(0x100B100B); // PRIM_POLY = 0x1100B
|
||
maskB = _mm_set1_epi16(0x00FF); // 0x00FF *8
|
||
|
||
while (count > 0){ // HASH_RANGE バイトごとに
|
||
// 16バイトごとに XOR する
|
||
temp16 = _mm_setzero_si128();
|
||
|
||
if (count < HASH_RANGE / 16){
|
||
i = count;
|
||
count = 0;
|
||
} else {
|
||
i = HASH_RANGE / 16;
|
||
count -= HASH_RANGE / 16;
|
||
}
|
||
while (i >= 2){
|
||
dataA = _mm_load_si128((__m128i *)data); // read 32-bytes
|
||
dataB = _mm_load_si128((__m128i *)data + 1);
|
||
temp16 = _mm_xor_si128(temp16, dataA);
|
||
temp16 = _mm_xor_si128(temp16, dataB);
|
||
|
||
dataC = _mm_and_si128(dataA, maskB); // erase higher byte
|
||
dataD = _mm_and_si128(dataB, maskB);
|
||
dataA = _mm_srli_epi16(dataA, 8); // move higher byte to lower
|
||
dataB = _mm_srli_epi16(dataB, 8);
|
||
dataC = _mm_packus_epi16(dataC, dataD); // select lower byte of each word
|
||
dataA = _mm_packus_epi16(dataA, dataB); // select higher byte of each word
|
||
|
||
_mm_store_si128((__m128i *)data, dataC); // write 32-bytes
|
||
_mm_store_si128((__m128i *)data + 1, dataA);
|
||
|
||
data += 32;
|
||
i -= 2;
|
||
}
|
||
if (i > 0){
|
||
dataA = _mm_load_si128((__m128i *)data); // load 16-bytes
|
||
temp16 = _mm_xor_si128(temp16, dataA);
|
||
}
|
||
|
||
// 前回の値を 2倍して、今回の値を追加する
|
||
//temp16 = _mm_xor_si128(temp16, prev16); // 3倍する場合は、元の値も XOR すればいい
|
||
mask16 = _mm_cmpgt_epi16(zero16, prev16); // (0 > prev) ? 0xFFFF : 0x0000
|
||
prev16 = _mm_slli_epi16(prev16, 1); // prev *= 2
|
||
mask16 = _mm_and_si128(mask16, poly16); // 0x100B or 0x0000
|
||
prev16 = _mm_xor_si128(prev16, mask16);
|
||
|
||
prev16 = _mm_xor_si128(prev16, temp16);
|
||
}
|
||
if (hash != data + 16) // ハッシュ値の保存先が別なら
|
||
_mm_store_si128((__m128i *)hash, prev16);
|
||
|
||
// 最後にハッシュ値も並び替える
|
||
dataC = _mm_and_si128(dataA, maskB); // erase higher byte
|
||
dataD = _mm_and_si128(prev16, maskB);
|
||
dataA = _mm_srli_epi16(dataA, 8); // move higher byte to lower
|
||
dataB = _mm_srli_epi16(prev16, 8);
|
||
dataC = _mm_packus_epi16(dataC, dataD); // select lower byte of each word
|
||
dataA = _mm_packus_epi16(dataA, dataB); // select higher byte of each word
|
||
|
||
_mm_store_si128((__m128i *)data, dataC); // write 32-bytes
|
||
_mm_store_si128((__m128i *)data + 1, dataA);
|
||
}
|
||
|
||
// データの並びを元に戻すと同時にチェックサムを計算する
|
||
// buffer alignment must be 32, length must be (multiple of 32) - 16
|
||
void checksum16_return32(unsigned char *data, unsigned char *hash, int byte_size)
|
||
{
|
||
int i, count;
|
||
__m128i temp16, prev16, mask16, zero16, poly16;
|
||
__m128i dataA, dataB, dataC;
|
||
|
||
count = byte_size / 16;
|
||
prev16 = _mm_setzero_si128();
|
||
zero16 = _mm_setzero_si128();
|
||
poly16 = _mm_set1_epi32(0x100B100B); // PRIM_POLY = 0x1100B
|
||
|
||
while (count > 0){ // HASH_RANGE バイトごとに
|
||
// 16バイトごとに XOR する
|
||
temp16 = _mm_setzero_si128();
|
||
|
||
if (count < HASH_RANGE / 16){
|
||
i = count;
|
||
count = 0;
|
||
} else {
|
||
i = HASH_RANGE / 16;
|
||
count -= HASH_RANGE / 16;
|
||
}
|
||
while (i >= 2){
|
||
dataA = _mm_load_si128((__m128i *)data); // read 32-bytes
|
||
dataB = _mm_load_si128((__m128i *)data + 1);
|
||
|
||
dataC = _mm_unpacklo_epi8(dataA, dataB); // interleave lower and higher bytes
|
||
dataA = _mm_unpackhi_epi8(dataA, dataB);
|
||
|
||
_mm_store_si128((__m128i *)data, dataC); // write 32-bytes
|
||
_mm_store_si128((__m128i *)data + 1, dataA);
|
||
temp16 = _mm_xor_si128(temp16, dataC);
|
||
temp16 = _mm_xor_si128(temp16, dataA);
|
||
|
||
data += 32;
|
||
i -= 2;
|
||
}
|
||
if (i > 0){
|
||
dataA = _mm_load_si128((__m128i *)data); // read 32-bytes
|
||
dataB = _mm_load_si128((__m128i *)data + 1);
|
||
|
||
dataC = _mm_unpacklo_epi8(dataA, dataB); // interleave lower and higher bytes
|
||
dataA = _mm_unpackhi_epi8(dataA, dataB);
|
||
|
||
_mm_store_si128((__m128i *)data, dataC); // write 32-bytes
|
||
_mm_store_si128((__m128i *)data + 1, dataA);
|
||
temp16 = _mm_xor_si128(temp16, dataC);
|
||
}
|
||
|
||
// 前回の値を 2倍して、今回の値を追加する
|
||
//temp16 = _mm_xor_si128(temp16, prev16); // 3倍する場合は、元の値も XOR すればいい
|
||
mask16 = _mm_cmpgt_epi16(zero16, prev16); // (0 > prev) ? 0xFFFF : 0x0000
|
||
prev16 = _mm_slli_epi16(prev16, 1); // prev *= 2
|
||
mask16 = _mm_and_si128(mask16, poly16); // 0x100B or 0x0000
|
||
prev16 = _mm_xor_si128(prev16, mask16);
|
||
|
||
prev16 = _mm_xor_si128(prev16, temp16);
|
||
}
|
||
|
||
if (hash != data + 16) // ハッシュ値の保存先が別なら
|
||
_mm_store_si128((__m128i *)hash, prev16);
|
||
}
|
||
|
||
// チェックサムを計算すると同時にデータを並び替える
|
||
// buffer alignment must be 256, length must be (multiple of 256) - 16
|
||
void checksum16_altmap256(unsigned char *data, unsigned char *hash, int byte_size)
|
||
{
|
||
// 順番に処理する場合
|
||
// checksum16(data, hash, byte_size);
|
||
// galois_altmap256_change(data, byte_size + HASH_SIZE);
|
||
|
||
unsigned short dtmp[128];
|
||
int i, j;
|
||
__m128i ta, tb, lmask, th, tl, temp16, prev16, poly16;
|
||
|
||
lmask = _mm_set1_epi16(0xff);
|
||
prev16 = _mm_setzero_si128();
|
||
poly16 = _mm_set1_epi32(0x100B100B); // PRIM_POLY = 0x1100B
|
||
|
||
byte_size += HASH_SIZE;
|
||
while (byte_size != 0){
|
||
for (j = 0; j < 8; j++){
|
||
ta = _mm_load_si128((__m128i *)data); // read 32-bytes
|
||
tb = _mm_load_si128((__m128i *)data + 1);
|
||
|
||
// 128バイトを XOR して 16バイトにする
|
||
if ((j & 3) == 0)
|
||
temp16 = _mm_setzero_si128();
|
||
temp16 = _mm_xor_si128(temp16, ta);
|
||
if ((j != 7) || (byte_size != 256)) // 最後の 16バイトだけ含めない
|
||
temp16 = _mm_xor_si128(temp16, tb);
|
||
|
||
if ((j & 3) == 3){ // j = 3 or 7
|
||
// 前回の値を 2倍して、今回の値を追加する
|
||
//temp16 = _mm_xor_si128(temp16, prev16); // 3倍する場合は、元の値も XOR すればいい
|
||
tl = _mm_setzero_si128();
|
||
th = _mm_cmpgt_epi16(tl, prev16); // (0 > prev) ? 0xFFFF : 0x0000
|
||
prev16 = _mm_slli_epi16(prev16, 1); // prev *= 2
|
||
th = _mm_and_si128(th, poly16); // 0x100B or 0x0000
|
||
prev16 = _mm_xor_si128(prev16, th);
|
||
|
||
prev16 = _mm_xor_si128(prev16, temp16);
|
||
if ((j == 7) && (byte_size == 256)){
|
||
if (hash != data + 16) // ハッシュ値の保存先が別なら
|
||
_mm_store_si128((__m128i *)hash, prev16);
|
||
_mm_store_si128(&tb, prev16); // 最後の 16バイトにハッシュ値を置く
|
||
}
|
||
}
|
||
|
||
// split to high/low parts
|
||
th = _mm_packus_epi16(_mm_srli_epi16(tb, 8), _mm_srli_epi16(ta, 8));
|
||
tl = _mm_packus_epi16(_mm_and_si128(tb, lmask), _mm_and_si128(ta, lmask));
|
||
|
||
// save to dest by extracting 16-bit masks
|
||
dtmp[0 + j] = _mm_movemask_epi8(th);
|
||
for (i = 1; i < 8; i++){
|
||
th = _mm_slli_epi16(th, 1); // byte shift would be nicer, but ultimately doesn't matter here
|
||
dtmp[i*8 + j] = _mm_movemask_epi8(th);
|
||
}
|
||
dtmp[64 + j] = _mm_movemask_epi8(tl);
|
||
for (i = 1; i < 8; i++){
|
||
tl = _mm_slli_epi16(tl, 1);
|
||
dtmp[64 + i*8 + j] = _mm_movemask_epi8(tl);
|
||
}
|
||
data += 32;
|
||
}
|
||
// we only really need to copy temp -> dest
|
||
memcpy(data - 256, dtmp, 256);
|
||
byte_size -= 256;
|
||
}
|
||
}
|
||
|
||
// データの並びを元に戻すと同時にチェックサムを計算する
|
||
// buffer alignment must be 256, length must be (multiple of 256) - 16
|
||
void checksum16_return256(unsigned char *data, unsigned char *hash, int byte_size)
|
||
{
|
||
// 順番に処理する場合
|
||
// galois_altmap256_return(data, byte_size + HASH_SIZE);
|
||
// checksum16(data, hash, byte_size);
|
||
|
||
__declspec( align(16) ) unsigned short dtmp[128];
|
||
int i, j;
|
||
__m128i ta, tb, lmask, th, tl, temp16, prev16, poly16;
|
||
|
||
th = _mm_setzero_si128(); // shut up compiler warning
|
||
tl = _mm_setzero_si128();
|
||
lmask = _mm_set1_epi16(0xff);
|
||
prev16 = _mm_setzero_si128();
|
||
poly16 = _mm_set1_epi32(0x100B100B); // PRIM_POLY = 0x1100B
|
||
|
||
byte_size += HASH_SIZE;
|
||
while (byte_size != 0){
|
||
for (j = 0; j < 8; j++){
|
||
// load in pattern: [0011223344556677] [8899AABBCCDDEEFF]
|
||
tl = _mm_insert_epi16(tl, *(int *)(data + 240), 0);
|
||
th = _mm_insert_epi16(th, *(int *)(data + 112), 0);
|
||
tl = _mm_insert_epi16(tl, *(int *)(data + 224), 1);
|
||
th = _mm_insert_epi16(th, *(int *)(data + 96), 1);
|
||
tl = _mm_insert_epi16(tl, *(int *)(data + 208), 2);
|
||
th = _mm_insert_epi16(th, *(int *)(data + 80), 2);
|
||
tl = _mm_insert_epi16(tl, *(int *)(data + 192), 3);
|
||
th = _mm_insert_epi16(th, *(int *)(data + 64), 3);
|
||
tl = _mm_insert_epi16(tl, *(int *)(data + 176), 4);
|
||
th = _mm_insert_epi16(th, *(int *)(data + 48), 4);
|
||
tl = _mm_insert_epi16(tl, *(int *)(data + 160), 5);
|
||
th = _mm_insert_epi16(th, *(int *)(data + 32), 5);
|
||
tl = _mm_insert_epi16(tl, *(int *)(data + 144), 6);
|
||
th = _mm_insert_epi16(th, *(int *)(data + 16), 6);
|
||
tl = _mm_insert_epi16(tl, *(int *)(data + 128), 7);
|
||
th = _mm_insert_epi16(th, *(int *)data, 7);
|
||
|
||
// swizzle to [0123456789ABCDEF] [0123456789ABCDEF]
|
||
ta = _mm_packus_epi16(_mm_srli_epi16(tl, 8),_mm_srli_epi16(th, 8));
|
||
tb = _mm_packus_epi16(_mm_and_si128(tl, lmask),_mm_and_si128(th, lmask));
|
||
|
||
// extract top bits
|
||
dtmp[j*16 + 7] = _mm_movemask_epi8(ta);
|
||
dtmp[j*16 + 15] = _mm_movemask_epi8(tb);
|
||
for (i = 1; i < 8; i++){
|
||
ta = _mm_slli_epi16(ta, 1);
|
||
tb = _mm_slli_epi16(tb, 1);
|
||
dtmp[j*16 + 7-i] = _mm_movemask_epi8(ta);
|
||
dtmp[j*16 + 15-i] = _mm_movemask_epi8(tb);
|
||
}
|
||
data += 2;
|
||
}
|
||
|
||
// 128バイトを XOR して 16バイトにする
|
||
temp16 = _mm_setzero_si128();
|
||
for (j = 0; j < 8; j++){
|
||
ta = _mm_load_si128((__m128i *)dtmp + j);
|
||
temp16 = _mm_xor_si128(temp16, ta);
|
||
}
|
||
// 前回の値を 2倍して、今回の値を追加する
|
||
//temp16 = _mm_xor_si128(temp16, prev16); // 3倍する場合は、元の値も XOR すればいい
|
||
tl = _mm_setzero_si128();
|
||
th = _mm_cmpgt_epi16(tl, prev16); // (0 > prev) ? 0xFFFF : 0x0000
|
||
prev16 = _mm_slli_epi16(prev16, 1); // prev *= 2
|
||
th = _mm_and_si128(th, poly16); // 0x100B or 0x0000
|
||
prev16 = _mm_xor_si128(prev16, th);
|
||
prev16 = _mm_xor_si128(prev16, temp16);
|
||
|
||
if (byte_size != 256){
|
||
// 次の 128バイトを XOR して 16バイトにする
|
||
temp16 = _mm_setzero_si128();
|
||
for (j = 8; j < 16; j++){
|
||
ta = _mm_load_si128((__m128i *)dtmp + j);
|
||
temp16 = _mm_xor_si128(temp16, ta);
|
||
}
|
||
// 前回の値を 2倍して、今回の値を追加する
|
||
//temp16 = _mm_xor_si128(temp16, prev16); // 3倍する場合は、元の値も XOR すればいい
|
||
tl = _mm_setzero_si128();
|
||
th = _mm_cmpgt_epi16(tl, prev16); // (0 > prev) ? 0xFFFF : 0x0000
|
||
prev16 = _mm_slli_epi16(prev16, 1); // prev *= 2
|
||
th = _mm_and_si128(th, poly16); // 0x100B or 0x0000
|
||
prev16 = _mm_xor_si128(prev16, th);
|
||
prev16 = _mm_xor_si128(prev16, temp16);
|
||
|
||
} else { // 最後の 128バイトは特別
|
||
// 次の 112バイトを XOR して 16バイトにする
|
||
temp16 = _mm_setzero_si128();
|
||
for (j = 8; j < 15; j++){
|
||
ta = _mm_load_si128((__m128i *)dtmp + j);
|
||
temp16 = _mm_xor_si128(temp16, ta);
|
||
}
|
||
// 前回の値を 2倍して、今回の値を追加する
|
||
//temp16 = _mm_xor_si128(temp16, prev16); // 3倍する場合は、元の値も XOR すればいい
|
||
tl = _mm_setzero_si128();
|
||
th = _mm_cmpgt_epi16(tl, prev16); // (0 > prev) ? 0xFFFF : 0x0000
|
||
prev16 = _mm_slli_epi16(prev16, 1); // prev *= 2
|
||
th = _mm_and_si128(th, poly16); // 0x100B or 0x0000
|
||
prev16 = _mm_xor_si128(prev16, th);
|
||
prev16 = _mm_xor_si128(prev16, temp16);
|
||
}
|
||
|
||
// we only really need to copy temp -> dest
|
||
memcpy(data - 16, dtmp, 256);
|
||
data += 240;
|
||
byte_size -= 256;
|
||
}
|
||
|
||
if (hash != data - 16) // ハッシュ値の保存先が別なら
|
||
_mm_store_si128((__m128i *)hash, prev16);
|
||
}
|
||
|