// The basic of 16-bit Galois Field arithmetic is based on Galois.c by James S. Plank. // Modified by Yutaka Sawada to support MMX, SSE2, and SSSE3. /* Galois.c * James S. Plank Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure Coding Techniques Revision 1.2A May 24, 2011 James S. Plank Department of Electrical Engineering and Computer Science University of Tennessee Knoxville, TN 37996 plank@cs.utk.edu Copyright (c) 2011, James S. Plank All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name of the University of Tennessee nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #define _WIN32_WINNT 0x0601 // Windows 7 or later #include #include #include #include // 組み込み関数(intrinsic)を使用する場合インクルード #include "gf16.h" #include "gf_jit.h" // ParPar の JIT コード用 extern unsigned int cpu_flag; // declared in common2.h #ifndef _WIN64 // 32-bit 版なら #pragma warning(disable:4731) // inhibit VC's "ebp modified" warning #pragma warning(disable:4799) // inhibit VC's "missing emms" warning #endif /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ // CPU によって使う関数を変更する際の仮宣言 //#define NO_SIMD // SIMD を使わない場合 int sse_unit; void galois_align16_multiply(unsigned char *r1, unsigned char *r2, unsigned int len, int factor); void galois_align32_multiply(unsigned char *r1, unsigned char *r2, unsigned int len, int factor); void galois_align32avx_multiply(unsigned char *r1, unsigned char *r2, unsigned int len, int factor); void galois_align256_multiply(unsigned char *r1, unsigned char *r2, unsigned int len, int factor); void galois_align32_multiply2(unsigned char *src1, unsigned char *src2, unsigned char *dst, unsigned int len, int factor1, int factor2); void galois_align32avx_multiply2(unsigned char *src1, unsigned char *src2, unsigned char *dst, unsigned int len, int factor1, int factor2); void galois_altmap_none(unsigned char *data, unsigned int bsize); // AVX2 と SSSE3 の ALTMAP は 32バイト単位で行う void galois_altmap32_change(unsigned char *data, unsigned int bsize); void galois_altmap32_return(unsigned char *data, unsigned int bsize); void checksum16_altmap32(unsigned char *data, unsigned char *hash, int byte_size); void checksum16_return32(unsigned char *data, unsigned char *hash, int byte_size); // JIT(SSE2) は 256バイト単位で計算する void galois_altmap256_change(unsigned char *data, unsigned int bsize); void galois_altmap256_return(unsigned char *data, unsigned int bsize); void checksum16_altmap256(unsigned char *data, unsigned char *hash, int byte_size); void checksum16_return256(unsigned char *data, unsigned char *hash, int byte_size); /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #define NW 65536 #define NWM1 65535 #define PRIM_POLY 0x1100B // なぜかテーブルは 2バイト整数を使った方が速い static unsigned short *galois_log_table = NULL; static unsigned short *galois_exp_table; int galois_create_table(void) { unsigned int j, b; if (galois_log_table != NULL) return 0; galois_log_table = _aligned_malloc(sizeof(unsigned short) * NW * 2, 64); if (galois_log_table == NULL) return -1; galois_exp_table = galois_log_table + NW; // 要素数は 65536個 b = 1; for (j = 0; j < NWM1; j++){ galois_log_table[b] = (unsigned short)j; galois_exp_table[j] = (unsigned short)b; b = b << 1; if (b & NW) b ^= PRIM_POLY; } galois_exp_table[NWM1] = galois_exp_table[0]; // copy for reduction (? mod NWM1) // CPU によって使う関数を変更する sse_unit = 16; // 16, 32, 64, 128 のどれでもいい (32のSSSE3は少し速い、GPUが識別するのに注意) galois_align_multiply = galois_align16_multiply; galois_align_multiply2 = NULL; galois_altmap_change = galois_altmap_none; galois_altmap_return = galois_altmap_none; checksum16_altmap = checksum16; checksum16_return = checksum16; #ifndef NO_SIMD if (cpu_flag & 256){ // AVX2, SSSE3, JIT(SSE2) の並び替えを使わない場合 // 将来的には AVX-512 などの命令に対応してもいい //printf("\nWithout ALTMAP\n"); //sse_unit = 32; } else if (cpu_flag & 16){ // AVX2 対応なら //printf("\nUse AVX2 & ALTMAP\n"); sse_unit = 32; // 32, 64, 128 のどれでもいい galois_align_multiply = galois_align32avx_multiply; galois_align_multiply2 = galois_align32avx_multiply2; galois_altmap_change = galois_altmap32_change; galois_altmap_return = galois_altmap32_return; checksum16_altmap = checksum16_altmap32; checksum16_return = checksum16_return32; } else if (cpu_flag & 1){ // SSSE3 対応なら //printf("\nUse SSSE3 & ALTMAP\n"); sse_unit = 32; // 32, 64, 128 のどれでもいい galois_align_multiply = galois_align32_multiply; galois_align_multiply2 = galois_align32_multiply2; galois_altmap_change = galois_altmap32_change; galois_altmap_return = galois_altmap32_return; checksum16_altmap = checksum16_altmap32; checksum16_return = checksum16_return32; } else { // SSSE3 が利用できない場合 if ((cpu_flag & 128) && (jit_alloc() == 0)){ // JIT(SSE2) を使う //printf("\nUse JIT(SSE2) & ALTMAP\n"); sse_unit = 256; galois_align_multiply = galois_align256_multiply; galois_align_multiply2 = NULL; galois_altmap_change = galois_altmap256_change; galois_altmap_return = galois_altmap256_return; checksum16_altmap = checksum16_altmap256; checksum16_return = checksum16_return256; } } #endif return 0; } unsigned short galois_multiply(int x, int y) { int sum; if ((x == 0) || (y == 0)) return 0; sum = galois_log_table[x] + galois_log_table[y]; // result is from 2 to NWM1 * 2 //if (sum >= NWM1) sum -= NWM1; sum = (sum >> 16) + (sum & NWM1); // result is from 0 to NWM1 return galois_exp_table[sum]; } // multiply when "y" is a fixed value unsigned short galois_multiply_fix(int x, int log_y) { int sum; if (x == 0) return 0; sum = galois_log_table[x] + log_y; // result is from 2 to NWM1 * 2 sum = (sum >> 16) + (sum & NWM1); // result is from 0 to NWM1 return galois_exp_table[sum]; } unsigned short galois_divide(int x, int y) { int sum; if (y == 0) return NWM1; // 除算エラー if (x == 0) return 0; sum = galois_log_table[x] - galois_log_table[y]; if (sum < 0) sum += NWM1; return galois_exp_table[sum]; } // ガロア体上での乗数計算、x の y 乗 unsigned short galois_power(int x, int y) { unsigned int sum; if (x == 0) return 0; // 0**y = 0 if (y == 0) return 1; // x**0 = 1 if (y == 1) return (unsigned short)x; // x**1 = x sum = (unsigned int)(galois_log_table[x]) * (unsigned int)y; // result is from 1 to NWM1 * NWM1 //sum = sum % NWM1; sum = (sum >> 16) + (sum & NWM1); // result is from 1 to NWM1 * 2 sum = (sum >> 16) + (sum & NWM1); // result is from 0 to NWM1 return galois_exp_table[sum]; } // ガロア体上での逆数、1 / x unsigned short galois_reciprocal(int x) { if (x == 0) return NWM1; // 除算エラー return galois_exp_table[NWM1 - (int)(galois_log_table[x])]; } void galois_free_table(void) // テーブルを解放するために追加 { if (galois_log_table != NULL){ _aligned_free(galois_log_table); galois_log_table = NULL; #ifndef _WIN64 // 32-bit 版ならインライン・アセンブラを使う if (((cpu_flag & 1) == 0) && ((cpu_flag & 128) == 0)) // SSSE3 を使わない場合、MMX の終了処理 _mm_empty(); #endif // SSSE3 を使わない場合で、JIT(SSE2) を使った場合の終了処理 if (((cpu_flag & 1) == 0) && ((cpu_flag & 128) != 0)) jit_free(); } } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ // MMX functions are based on code by Paul Houle (paulhoule.com) March 22, 2008 #ifndef _WIN64 // 32-bit 版ならインライン・アセンブラを使う // Processes block of data a multiple of 8 bytes long using SIMD (mmx) opcodes. // The amount of data to process (bsize) must be a non-zero multiple of 8. // Paul's original code was modified to calculate each 8-bytes and removed last shift by Yutaka Sawada static void DoBlock8(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned int *pMtab) { __asm { push ebp mov ebx,bsize ;bytes to process (multiple of 8) mov esi,input ;source mov edi,output ;destination mov ebp,pMtab ;combined multiplication table mov eax,[esi] ;load 1st 8 source bytes movd mm4,[esi+4] sub ebx,8 ;reduce last 8-bytes from loop add esi,ebx ;point to end of input/output add edi,ebx neg ebx ;convert byte size to count-up lp8: movzx edx,al movzx ecx,ah shr eax,16 movd mm0,[ebp+edx*4] ;order is [_][_][_][0] movd mm1,[ebp+400h+ecx*4] movzx edx,al movzx ecx,ah movd eax,mm4 movq mm4,[esi+ebx+8] ;read-ahead next 8 source bytes movd mm2,[ebp+edx*4] ;order is [_][_][_][1] movzx edx,al movq mm5,[edi+ebx] movd mm3,[ebp+400h+ecx*4] movzx ecx,ah shr eax,16 punpcklwd mm0,[ebp+edx*4] ;order is [_][_][2][0] movzx edx,al punpcklwd mm1,[ebp+400h+ecx*4] movzx ecx,ah punpcklwd mm2,[ebp+edx*4] ;order is [_][_][3][1] pxor mm1,mm0 punpcklwd mm3,[ebp+400h+ecx*4] pxor mm3,mm2 movd eax,mm4 ;prepare src bytes 3-0 for next loop punpcklwd mm1,mm3 ;order is [3][2][1][0] psrlq mm4,32 ;align src bytes 7-4 for next loop pxor mm1,mm5 movq [edi+ebx],mm1 add ebx,8 jnz lp8 ;no need to pre-read in last 8-bytes movzx edx,al movzx ecx,ah shr eax,16 movd mm0,[ebp+edx*4] ;order is [_][_][_][0] movd mm1,[ebp+400h+ecx*4] movzx edx,al movzx ecx,ah movd eax,mm4 movd mm2,[ebp+edx*4] ;order is [_][_][_][1] movzx edx,al movq mm5,[edi+ebx] movd mm3,[ebp+400h+ecx*4] movzx ecx,ah shr eax,16 punpcklwd mm0,[ebp+edx*4] ;order is [_][_][2][0] movzx edx,al punpcklwd mm1,[ebp+400h+ecx*4] movzx ecx,ah punpcklwd mm2,[ebp+edx*4] ;order is [_][_][3][1] pxor mm1,mm0 punpcklwd mm3,[ebp+400h+ecx*4] pxor mm3,mm2 punpcklwd mm1,mm3 ;order is [3][2][1][0] pxor mm1,mm5 movq [edi+ebx],mm1 pop ebp } } #endif // calculate multiplication tables instantly static void create_two_table(unsigned int *mtab, int factor){ int shift_table[8], i, j, sum; // factor * 2の乗数を計算する shift_table[0] = factor; // factor * 1 for (i = 1; i < 8; i++){ // if (factor & 0x8000){ // factor <<= 1; // factor ^= 0x1100B; // } else { // factor <<= 1; // } factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B); shift_table[i] = factor; // factor * (2**i) } for (j = 0; j < 2; j++){ for (i = 0; i < 256; i += 32){ /* sum = 0; if (i & 32) sum = shift_table[5]; if (i & 64) sum ^= shift_table[6]; if (i & 128) sum ^= shift_table[7]; */ sum = shift_table[5] & ((i << 26) >> 31); sum ^= shift_table[6] & ((i << 25) >> 31); sum ^= shift_table[7] & ((i << 24) >> 31); mtab[i ] = sum; mtab[i + 1] = sum ^ shift_table[0]; mtab[i + 2] = sum ^ shift_table[1]; mtab[i + 3] = sum ^ shift_table[1] ^ shift_table[0]; mtab[i + 4] = sum ^ shift_table[2]; mtab[i + 5] = sum ^ shift_table[2] ^ shift_table[0]; mtab[i + 6] = sum ^ shift_table[2] ^ shift_table[1]; mtab[i + 7] = sum ^ shift_table[2] ^ shift_table[1] ^ shift_table[0]; mtab[i + 8] = sum ^ shift_table[3]; mtab[i + 9] = sum ^ shift_table[3] ^ shift_table[0]; mtab[i + 10] = sum ^ shift_table[3] ^ shift_table[1]; mtab[i + 11] = sum ^ shift_table[3] ^ shift_table[1] ^ shift_table[0]; mtab[i + 12] = sum ^ shift_table[3] ^ shift_table[2]; mtab[i + 13] = sum ^ shift_table[3] ^ shift_table[2] ^ shift_table[0]; mtab[i + 14] = sum ^ shift_table[3] ^ shift_table[2] ^ shift_table[1]; mtab[i + 15] = sum ^ shift_table[3] ^ shift_table[2] ^ shift_table[1] ^ shift_table[0]; sum ^= shift_table[4]; mtab[i + 16] = sum; mtab[i + 17] = sum ^ shift_table[0]; mtab[i + 18] = sum ^ shift_table[1]; mtab[i + 19] = sum ^ shift_table[1] ^ shift_table[0]; mtab[i + 20] = sum ^ shift_table[2]; mtab[i + 21] = sum ^ shift_table[2] ^ shift_table[0]; mtab[i + 22] = sum ^ shift_table[2] ^ shift_table[1]; mtab[i + 23] = sum ^ shift_table[2] ^ shift_table[1] ^ shift_table[0]; mtab[i + 24] = sum ^ shift_table[3]; mtab[i + 25] = sum ^ shift_table[3] ^ shift_table[0]; mtab[i + 26] = sum ^ shift_table[3] ^ shift_table[1]; mtab[i + 27] = sum ^ shift_table[3] ^ shift_table[1] ^ shift_table[0]; mtab[i + 28] = sum ^ shift_table[3] ^ shift_table[2]; mtab[i + 29] = sum ^ shift_table[3] ^ shift_table[2] ^ shift_table[0]; mtab[i + 30] = sum ^ shift_table[3] ^ shift_table[2] ^ shift_table[1]; mtab[i + 31] = sum ^ shift_table[3] ^ shift_table[2] ^ shift_table[1] ^ shift_table[0]; } for (i = 0; i < 8; i++){ factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B); shift_table[i] = factor; // factor * (2**i) } mtab += 256; } } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ // The method of using SSSE3 is based on Plank's papar; // "Screaming Fast Galois Field Arithmetic Using Intel SIMD Instructions". /* static void create_eight_table_c(unsigned char *mtab, int factor) { int shift_table[16], i, sum; // factor * 2の乗数を計算する shift_table[0] = factor; // factor * 1 for (i = 1; i < 16; i++){ // if (factor & 0x8000){ // factor <<= 1; // factor ^= 0x1100B; // } else { // factor <<= 1; // } factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B); shift_table[i] = factor; // factor * (2**i) } for (i = 0; i < 16; i += 4){ // 4-bit ごとに計算する mtab[i * 8 ] = 0; mtab[i * 8 + 16] = 0; sum = shift_table[i]; mtab[i * 8 + 1] = (unsigned char)sum; // lower 8-bit mtab[i * 8 + 17] = (unsigned char)(sum >> 8); // higher 8-bit sum = shift_table[i + 1]; mtab[i * 8 + 2] = (unsigned char)sum; mtab[i * 8 + 18] = (unsigned char)(sum >> 8); sum = sum ^ shift_table[i]; mtab[i * 8 + 3] = (unsigned char)sum; mtab[i * 8 + 19] = (unsigned char)(sum >> 8); sum = shift_table[i + 2]; mtab[i * 8 + 4] = (unsigned char)sum; mtab[i * 8 + 20] = (unsigned char)(sum >> 8); sum = sum ^ shift_table[i]; mtab[i * 8 + 5] = (unsigned char)sum; mtab[i * 8 + 21] = (unsigned char)(sum >> 8); sum = shift_table[i + 2] ^ shift_table[i + 1]; mtab[i * 8 + 6] = (unsigned char)sum; mtab[i * 8 + 22] = (unsigned char)(sum >> 8); sum = sum ^ shift_table[i]; mtab[i * 8 + 7] = (unsigned char)sum; mtab[i * 8 + 23] = (unsigned char)(sum >> 8); sum = shift_table[i + 3]; mtab[i * 8 + 8] = (unsigned char)sum; mtab[i * 8 + 24] = (unsigned char)(sum >> 8); sum = sum ^ shift_table[i]; mtab[i * 8 + 9] = (unsigned char)sum; mtab[i * 8 + 25] = (unsigned char)(sum >> 8); sum = shift_table[i + 3] ^ shift_table[i + 1]; mtab[i * 8 + 10] = (unsigned char)sum; mtab[i * 8 + 26] = (unsigned char)(sum >> 8); sum = sum ^ shift_table[i]; mtab[i * 8 + 11] = (unsigned char)sum; mtab[i * 8 + 27] = (unsigned char)(sum >> 8); sum = shift_table[i + 3] ^ shift_table[i + 2]; mtab[i * 8 + 12] = (unsigned char)sum; mtab[i * 8 + 28] = (unsigned char)(sum >> 8); sum = sum ^ shift_table[i]; mtab[i * 8 + 13] = (unsigned char)sum; mtab[i * 8 + 29] = (unsigned char)(sum >> 8); sum = shift_table[i + 3] ^ shift_table[i + 2] ^ shift_table[i + 1]; mtab[i * 8 + 14] = (unsigned char)sum; mtab[i * 8 + 30] = (unsigned char)(sum >> 8); sum = sum ^ shift_table[i]; mtab[i * 8 + 15] = (unsigned char)sum; mtab[i * 8 + 31] = (unsigned char)(sum >> 8); } } */ #ifndef _WIN64 // 32-bit 版ならインライン・アセンブラを使う // tables for split four combined multiplication static void create_eight_table(unsigned char *mtab, int factor) { __asm { // This implementation requires SSE2 mov eax, factor mov edx, mtab mov ecx, -128 ; create mask for 8-bit pcmpeqw xmm7, xmm7 ; 0xFFFF *8 psrlw xmm7, 8 ; 0x00FF *8 lp32: ; factor * 1, *2, *4, *8 movd xmm0, eax ; [_][_][_][_][_][_][_][1] pxor xmm1, xmm1 movsx ebx, ax sar ebx, 31 shl eax, 1 and ebx, 0x1100B xor eax, ebx pinsrw xmm1, eax, 1 ; [_][_][_][_][_][_][2][_] pxor xmm2, xmm2 movsx ebx, ax sar ebx, 31 shl eax, 1 and ebx, 0x1100B xor eax, ebx pinsrw xmm2, eax, 4 ; [_][_][_][4][_][_][_][_] punpcklwd xmm1, xmm1 ; [_][_][_][_][2][2][_][_] movsx ebx, ax sar ebx, 31 shl eax, 1 and ebx, 0x1100B xor eax, ebx movd xmm3, eax ; [_][_][_][_][_][_][_][8] pshuflw xmm0, xmm0, 17 ; [_][_][_][_][1][_][1][_] punpcklwd xmm3, xmm3 ; [_][_][_][_][_][_][8][8] pxor xmm0, xmm1 ; [_][_][_][_][3][2][1][_] pshufhw xmm2, xmm2, 0 ; [4][4][4][4][_][_][_][_] punpcklqdq xmm0, xmm0 ; [3][2][1][_][3][2][1][_] pshufd xmm3, xmm3, 0 ; [8][8][8][8][8][8][8][8] pxor xmm2, xmm0 ; [7][6][5][4][3][2][1][_] pxor xmm3, xmm2 ; [15][14][13][12][11][10][9][8] movdqa xmm0, xmm2 movdqa xmm1, xmm3 pand xmm0, xmm7 pand xmm1, xmm7 packuswb xmm0, xmm1 ; lower 8-bit * 16 psrlw xmm2, 8 psrlw xmm3, 8 packuswb xmm2, xmm3 ; higher 8-bit * 16 movdqa [edx+128+ecx], xmm0 movdqa [edx+144+ecx], xmm2 ; for next loop movsx ebx, ax ; move with sign of word sar ebx, 31 shl eax, 1 and ebx, 0x1100B xor eax, ebx add ecx, 32 jnz lp32 } } // VC2008 は SSSE3 をインライン・アセンブラで使える // Address (input) does not need be 16-byte aligned static void gf16_ssse3_block16u(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table) { __asm { mov ecx, input ; source mov edx, output ; destination mov eax, bsize ; bytes to process (multiple of 16) mov ebx, table ; multiplication table ; create mask for 8 entries pcmpeqw xmm7, xmm7 ; 0xFFFF *8 psrlw xmm7, 12 ; 0x000F *8 add ecx, eax ; point to end of input/output add edx, eax neg eax ; convert byte size to count-up lp16: movdqu xmm0, [ecx+eax] ; read source 16-bytes movdqa xmm2, [edx+eax] movdqa xmm3, [ebx] ; low table movdqa xmm4, [ebx+16] ; high table movdqa xmm1, xmm0 ; copy source psrlw xmm0, 4 ; prepare next 4-bit pand xmm1, xmm7 ; src & 0x000F pshufb xmm3, xmm1 ; table look-up psllw xmm1, 8 ; shift 8-bit for higher table pshufb xmm4, xmm1 movdqa xmm5, [ebx+32] ; low table movdqa xmm6, [ebx+48] ; high table pxor xmm3, xmm4 ; combine high and low pxor xmm2, xmm3 movdqa xmm1, xmm0 ; copy source psrlw xmm0, 4 ; prepare next 4-bit pand xmm1, xmm7 ; (src >> 4) & 0x000F pshufb xmm5, xmm1 ; table look-up psllw xmm1, 8 ; shift 8-bit for higher table pshufb xmm6, xmm1 movdqa xmm3, [ebx+64] ; low table movdqa xmm4, [ebx+80] ; high table pxor xmm5, xmm6 ; combine high and low pxor xmm2, xmm5 movdqa xmm1, xmm0 ; copy source psrlw xmm0, 4 ; prepare next 4-bit pand xmm1, xmm7 ; (src >> 8) & 0x000F pshufb xmm3, xmm1 ; table look-up psllw xmm1, 8 ; shift 8-bit for higher table pshufb xmm4, xmm1 movdqa xmm5, [ebx+96] ; low table movdqa xmm6, [ebx+112] ; high table pxor xmm3, xmm4 ; combine high and low pxor xmm2, xmm3 pshufb xmm5, xmm0 ; table look-up psllw xmm0, 8 ; shift 8-bit for higher table pshufb xmm6, xmm0 pxor xmm5, xmm6 ; combine high and low pxor xmm2, xmm5 movdqa [edx+eax], xmm2 add eax, 16 jnz lp16 } } // その場で 32バイトごとに並び替えて、計算後に戻す方法(50% faster than 16-byte version) // Address (input) does not need be 16-byte aligned static void gf16_ssse3_block32u(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table) { __asm { mov ecx, input ; source mov edx, output ; destination mov eax, bsize ; bytes to process (multiple of 32) mov ebx, table ; multiplication table ; create mask for 16 entries pcmpeqw xmm7, xmm7 ; 0xFFFF *8 pcmpeqw xmm6, xmm6 ; 0xFFFF *8 psrlw xmm7, 12 ; 0x000F *8 psrlw xmm6, 8 ; 0x00FF *8 packuswb xmm7, xmm7 ; 0x0F *16 add ecx, eax ; point to end of input/output add edx, eax neg eax ; convert byte size to count-up lp32: movdqu xmm0, [ecx+eax ] ; read source 32-bytes movdqu xmm2, [ecx+eax+16] movdqa xmm1, xmm0 ; copy source movdqa xmm3, xmm2 pand xmm0, xmm6 ; erase higher byte pand xmm2, xmm6 psrlw xmm1, 8 ; move higher byte to lower psrlw xmm3, 8 packuswb xmm0, xmm2 ; select lower byte of each word packuswb xmm1, xmm3 ; select higher byte of each word movdqa xmm4, [ebx] ; low table movdqa xmm5, [ebx+16] ; high table movdqa xmm3, xmm0 ; copy source psrlw xmm0, 4 ; prepare next 4-bit pand xmm3, xmm7 ; src & 0x0F pand xmm0, xmm7 ; (src >> 4) & 0x0F pshufb xmm4, xmm3 ; table look-up pshufb xmm5, xmm3 movdqa xmm2, [ebx+32] ; low table movdqa xmm3, [ebx+48] ; high table pshufb xmm2, xmm0 ; table look-up pshufb xmm3, xmm0 pxor xmm4, xmm2 ; combine result pxor xmm5, xmm3 movdqa xmm2, [ebx+64] ; low table movdqa xmm3, [ebx+80] ; high table movdqa xmm0, xmm1 ; copy source psrlw xmm1, 4 ; prepare next 4-bit pand xmm0, xmm7 ; src & 0x0F pand xmm1, xmm7 ; (src >> 4) & 0x0F pshufb xmm2, xmm0 ; table look-up pshufb xmm3, xmm0 pxor xmm4, xmm2 ; combine result pxor xmm5, xmm3 movdqa xmm2, [ebx+96] ; low table movdqa xmm3, [ebx+112] ; high table pshufb xmm2, xmm1 ; table look-up pshufb xmm3, xmm1 pxor xmm4, xmm2 ; combine result pxor xmm5, xmm3 movdqa xmm0, [edx+eax] ; read dest 32-bytes movdqa xmm1, [edx+eax+16] movdqa xmm3, xmm4 ; copy result punpcklbw xmm3, xmm5 ; interleave lower and higher bytes punpckhbw xmm4, xmm5 pxor xmm0, xmm3 pxor xmm1, xmm4 movdqa [edx+eax], xmm0 ; write dest 32-bytes movdqa [edx+eax+16], xmm1 add eax, 32 jnz lp32 } } // 先に 32バイトごとに並び替えてあるデータを扱う方法 static void gf16_ssse3_block32_altmap(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table) { __asm { mov ecx, input ; source mov edx, output ; destination mov eax, bsize ; bytes to process (multiple of 32) mov ebx, table ; multiplication table ; create mask for 16 entries pcmpeqw xmm7, xmm7 ; 0xFFFF *8 psrlw xmm7, 12 ; 0x000F *8 packuswb xmm7, xmm7 ; 0x0F *16 add ecx, eax ; point to end of input/output add edx, eax neg eax ; convert byte size to count-up lp32: movdqa xmm0, [ecx+eax] ; read source 32-bytes movdqa xmm1, [ecx+eax+16] movdqa xmm4, [ebx] ; low table movdqa xmm5, [ebx+16] ; high table movdqa xmm3, xmm0 ; copy source psrlw xmm0, 4 ; prepare next 4-bit pand xmm3, xmm7 ; src & 0x0F pand xmm0, xmm7 ; (src >> 4) & 0x0F pshufb xmm4, xmm3 ; table look-up pshufb xmm5, xmm3 movdqa xmm2, [ebx+32] ; low table movdqa xmm3, [ebx+48] ; high table pshufb xmm2, xmm0 ; table look-up pshufb xmm3, xmm0 pxor xmm2, xmm4 ; combine result pxor xmm3, xmm5 movdqa xmm4, [ebx+64] ; low table movdqa xmm5, [ebx+80] ; high table movdqa xmm0, xmm1 ; copy source psrlw xmm0, 4 ; prepare next 4-bit pand xmm1, xmm7 ; src & 0x0F pand xmm0, xmm7 ; (src >> 4) & 0x0F pshufb xmm4, xmm1 ; table look-up pshufb xmm5, xmm1 pxor xmm4, xmm2 ; combine result pxor xmm5, xmm3 movdqa xmm2, [ebx+96] ; low table movdqa xmm3, [ebx+112] ; high table pshufb xmm2, xmm0 ; table look-up pshufb xmm3, xmm0 movdqa xmm0, [edx+eax] ; read dest 32-bytes movdqa xmm1, [edx+eax+16] pxor xmm4, xmm2 ; combine result pxor xmm5, xmm3 pxor xmm4, xmm0 pxor xmm5, xmm1 movdqa [edx+eax], xmm4 ; write dest 32-bytes movdqa [edx+eax+16], xmm5 add eax, 32 jnz lp32 } } #else // 64-bit 版ではインライン・アセンブラを使えない // (__m128i *) で逐次ポインターをキャスト変換するよりも、 // 先に __m128i* で定義しておいた方が、連続した領域へのアクセス最適化がうまくいく? // ほとんど変わらない気がする(むしろ遅い?)・・・コンパイラ次第なのかも // tables for split four combined multiplication static void create_eight_table(unsigned char *mtab, int factor) { int count = 4; __m128i *tbl; __m128i xmm0, xmm1, xmm2, xmm3, mask; tbl = (__m128i *)mtab; // create mask for 8-bit mask = _mm_setzero_si128(); mask = _mm_cmpeq_epi16(mask, mask); // 0xFFFF *8 mask = _mm_srli_epi16(mask, 8); // 0x00FF *8 while (1){ xmm0 = _mm_cvtsi32_si128(factor); // [_][_][_][_][_][_][_][1] xmm1 = _mm_setzero_si128(); factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B); xmm1 = _mm_insert_epi16(xmm1, factor, 1); // [_][_][_][_][_][_][2][_] xmm2 = _mm_setzero_si128(); factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B); xmm2 = _mm_insert_epi16(xmm2, factor, 4); // [_][_][_][4][_][_][_][_] xmm1 = _mm_unpacklo_epi16(xmm1, xmm1); // [_][_][_][_][2][2][_][_] factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B); xmm3 = _mm_cvtsi32_si128(factor); // [_][_][_][_][_][_][_][8] xmm0 = _mm_shufflelo_epi16(xmm0, _MM_SHUFFLE(0, 1, 0, 1)); // [_][_][_][_][1][_][1][_] xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); // [_][_][_][_][_][_][8][8] xmm0 = _mm_xor_si128(xmm0, xmm1); // [_][_][_][_][3][2][1][_] xmm2 = _mm_shufflehi_epi16(xmm2, _MM_SHUFFLE(0, 0, 0, 0)); // [4][4][4][4][_][_][_][_] xmm0 = _mm_unpacklo_epi64(xmm0, xmm0); // [3][2][1][_][3][2][1][_] xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0, 0, 0, 0)); // [8][8][8][8][8][8][8][8] xmm2 = _mm_xor_si128(xmm2, xmm0); // [7][6][5][4][3][2][1][_] xmm3 = _mm_xor_si128(xmm3, xmm2); // [15][14][13][12][11][10][9][8] xmm0 = _mm_load_si128(&xmm2); xmm1 = _mm_load_si128(&xmm3); xmm0 = _mm_and_si128(xmm0, mask); xmm1 = _mm_and_si128(xmm1, mask); xmm0 = _mm_packus_epi16(xmm0, xmm1); // lower 8-bit * 16 xmm2 = _mm_srli_epi16(xmm2, 8); xmm3 = _mm_srli_epi16(xmm3, 8); xmm2 = _mm_packus_epi16(xmm2, xmm3); // higher 8-bit * 16 _mm_store_si128(tbl , xmm0); _mm_store_si128(tbl + 1, xmm2); count--; if (count == 0) break; factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B); tbl += 2; } } // 16バイトごとに計算する方法、_mm_shuffle_epi8 の利用効率が悪い。 // Address (input) does not need be 16-byte aligned static void gf16_ssse3_block16u(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table) { __m128i *src, *dst, *tbl; __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; src = (__m128i *)input; dst = (__m128i *)output; tbl = (__m128i *)table; // create mask for 8 entries xmm7 = _mm_setzero_si128(); xmm7 = _mm_cmpeq_epi16(xmm7, xmm7); // 0xFFFF *8 xmm7 = _mm_srli_epi16(xmm7, 12); // 0x000F *8 while (bsize != 0){ xmm0 = _mm_loadu_si128(src); // read source 16-bytes xmm2 = _mm_load_si128(dst); xmm3 = _mm_load_si128(tbl); // low table xmm4 = _mm_load_si128(tbl + 1); // high table xmm1 = _mm_load_si128(&xmm0); // copy source xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit xmm1 = _mm_and_si128(xmm1, xmm7); // src & 0x000F xmm3 = _mm_shuffle_epi8(xmm3, xmm1); // table look-up xmm1 = _mm_slli_epi16(xmm1, 8); // shift 8-bit for higher table xmm4 = _mm_shuffle_epi8(xmm4, xmm1); xmm5 = _mm_load_si128(tbl + 2); // low table xmm6 = _mm_load_si128(tbl + 3); // high table xmm3 = _mm_xor_si128(xmm3, xmm4); // combine high and low xmm2 = _mm_xor_si128(xmm2, xmm3); xmm1 = _mm_load_si128(&xmm0); // copy source xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit xmm1 = _mm_and_si128(xmm1, xmm7); // src & 0x000F xmm5 = _mm_shuffle_epi8(xmm5, xmm1); // table look-up xmm1 = _mm_slli_epi16(xmm1, 8); // shift 8-bit for higher table xmm6 = _mm_shuffle_epi8(xmm6, xmm1); xmm3 = _mm_load_si128(tbl + 4); // low table xmm4 = _mm_load_si128(tbl + 5); // high table xmm5 = _mm_xor_si128(xmm5, xmm6); // combine high and low xmm2 = _mm_xor_si128(xmm2, xmm5); xmm1 = _mm_load_si128(&xmm0); // copy source xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit xmm1 = _mm_and_si128(xmm1, xmm7); // src & 0x000F xmm3 = _mm_shuffle_epi8(xmm3, xmm1); // table look-up xmm1 = _mm_slli_epi16(xmm1, 8); // shift 8-bit for higher table xmm4 = _mm_shuffle_epi8(xmm4, xmm1); xmm5 = _mm_load_si128(tbl + 6); // low table xmm6 = _mm_load_si128(tbl + 7); // high table xmm3 = _mm_xor_si128(xmm3, xmm4); // combine high and low xmm2 = _mm_xor_si128(xmm2, xmm3); xmm5 = _mm_shuffle_epi8(xmm5, xmm0); // table look-up xmm0 = _mm_slli_epi16(xmm0, 8); // shift 8-bit for higher table xmm6 = _mm_shuffle_epi8(xmm6, xmm0); xmm5 = _mm_xor_si128(xmm5, xmm6); // combine high and low xmm2 = _mm_xor_si128(xmm2, xmm5); _mm_store_si128(dst, xmm2); src += 1; dst += 1; bsize -= 16; } } // その場で 32バイトごとに並び替えて、計算後に戻す方法(50% faster than 16-byte version) // なぜか asm を使わない方が速い!? 32-bit と 64-bit の両方で使える // xmm レジスタを 8個までしか使わない方が 32-bit 版で速いし安定する // Address (input) does not need be 16-byte aligned static void gf16_ssse3_block32u(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; __m128i tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7; // copy tables to local tbl0 = _mm_load_si128((__m128i *)table); tbl1 = _mm_load_si128((__m128i *)table + 1); tbl2 = _mm_load_si128((__m128i *)table + 2); tbl3 = _mm_load_si128((__m128i *)table + 3); tbl4 = _mm_load_si128((__m128i *)table + 4); tbl5 = _mm_load_si128((__m128i *)table + 5); tbl6 = _mm_load_si128((__m128i *)table + 6); tbl7 = _mm_load_si128((__m128i *)table + 7); // create mask for 16 entries xmm7 = _mm_setzero_si128(); xmm7 = _mm_cmpeq_epi16(xmm7, xmm7); // 0xFFFF *8 xmm6 = _mm_srli_epi16(xmm7, 8); // 0x00FF *8 xmm7 = _mm_srli_epi16(xmm7, 12); // 0x000F *8 xmm7 = _mm_packus_epi16(xmm7, xmm7); // 0x0F *16 while (bsize != 0){ xmm1 = _mm_loadu_si128((__m128i *)input); // read source 32-bytes xmm3 = _mm_loadu_si128((__m128i *)input + 1); xmm0 = _mm_and_si128(xmm1, xmm6); // erase higher byte xmm2 = _mm_and_si128(xmm3, xmm6); xmm1 = _mm_srli_epi16(xmm1, 8); // move higher byte to lower xmm3 = _mm_srli_epi16(xmm3, 8); xmm0 = _mm_packus_epi16(xmm0, xmm2); // select lower byte of each word xmm1 = _mm_packus_epi16(xmm1, xmm3); // select higher byte of each word xmm4 = _mm_load_si128(&tbl0); // load tables xmm5 = _mm_load_si128(&tbl1); xmm3 = _mm_and_si128(xmm0, xmm7); // src & 0x0F xmm0 = _mm_and_si128(_mm_srli_epi16(xmm0, 4), xmm7); // (src >> 4) & 0x0F xmm4 = _mm_shuffle_epi8(xmm4, xmm3); // table look-up xmm5 = _mm_shuffle_epi8(xmm5, xmm3); xmm2 = _mm_load_si128(&tbl2); // load tables xmm3 = _mm_load_si128(&tbl3); xmm2 = _mm_shuffle_epi8(xmm2, xmm0); // table look-up xmm3 = _mm_shuffle_epi8(xmm3, xmm0); xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result xmm5 = _mm_xor_si128(xmm5, xmm3); xmm2 = _mm_load_si128(&tbl4); // load tables xmm3 = _mm_load_si128(&tbl5); xmm0 = _mm_and_si128(xmm1, xmm7); // src & 0x0F xmm1 = _mm_and_si128(_mm_srli_epi16(xmm1, 4), xmm7); // (src >> 4) & 0x0F xmm2 = _mm_shuffle_epi8(xmm2, xmm0); // table look-up xmm3 = _mm_shuffle_epi8(xmm3, xmm0); xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result xmm5 = _mm_xor_si128(xmm5, xmm3); xmm2 = _mm_load_si128(&tbl6); // load tables xmm3 = _mm_load_si128(&tbl7); xmm2 = _mm_shuffle_epi8(xmm2, xmm1); // table look-up xmm3 = _mm_shuffle_epi8(xmm3, xmm1); xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result xmm5 = _mm_xor_si128(xmm5, xmm3); xmm0 = _mm_load_si128((__m128i *)output); // read dest 32-bytes xmm1 = _mm_load_si128((__m128i *)output + 1); xmm3 = _mm_unpacklo_epi8(xmm4, xmm5); // interleave lower and higher bytes xmm4 = _mm_unpackhi_epi8(xmm4, xmm5); xmm0 = _mm_xor_si128(xmm0, xmm3); xmm1 = _mm_xor_si128(xmm1, xmm4); _mm_store_si128((__m128i *)output, xmm0); // write dest 32-bytes _mm_store_si128((__m128i *)output + 1, xmm1); input += 32; output += 32; bsize -= 32; } } // xmm レジスタにテーブルを読み込む方が 64-bit 版で微妙に速い static void gf16_ssse3_block32_altmap(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7; __m128i tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7; // copy tables to local tbl0 = _mm_load_si128((__m128i *)table); tbl1 = _mm_load_si128((__m128i *)table + 1); tbl2 = _mm_load_si128((__m128i *)table + 2); tbl3 = _mm_load_si128((__m128i *)table + 3); tbl4 = _mm_load_si128((__m128i *)table + 4); tbl5 = _mm_load_si128((__m128i *)table + 5); tbl6 = _mm_load_si128((__m128i *)table + 6); tbl7 = _mm_load_si128((__m128i *)table + 7); // create mask for 16 entries xmm7 = _mm_setzero_si128(); xmm7 = _mm_cmpeq_epi16(xmm7, xmm7); // 0xFFFF *8 xmm7 = _mm_srli_epi16(xmm7, 12); // 0x000F *8 xmm7 = _mm_packus_epi16(xmm7, xmm7); // 0x0F *16 while (bsize != 0){ xmm0 = _mm_load_si128((__m128i *)input); // read source 32-bytes xmm1 = _mm_load_si128((__m128i *)input + 1); xmm3 = _mm_load_si128(&xmm0); // copy source xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit xmm3 = _mm_and_si128(xmm3, xmm7); // src & 0x0F xmm0 = _mm_and_si128(xmm0, xmm7); // (src >> 4) & 0x0F xmm4 = _mm_load_si128(&tbl0); // load tables xmm5 = _mm_load_si128(&tbl1); xmm4 = _mm_shuffle_epi8(xmm4, xmm3); // table look-up xmm5 = _mm_shuffle_epi8(xmm5, xmm3); xmm2 = _mm_load_si128(&tbl2); // load tables xmm3 = _mm_load_si128(&tbl3); xmm2 = _mm_shuffle_epi8(xmm2, xmm0); // table look-up xmm3 = _mm_shuffle_epi8(xmm3, xmm0); xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result xmm5 = _mm_xor_si128(xmm5, xmm3); xmm0 = _mm_load_si128(&xmm1); // copy source xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit xmm1 = _mm_and_si128(xmm1, xmm7); // src & 0x0F xmm0 = _mm_and_si128(xmm0, xmm7); // (src >> 4) & 0x0F xmm2 = _mm_load_si128(&tbl4); // load tables xmm3 = _mm_load_si128(&tbl5); xmm2 = _mm_shuffle_epi8(xmm2, xmm1); // table look-up xmm3 = _mm_shuffle_epi8(xmm3, xmm1); xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result xmm5 = _mm_xor_si128(xmm5, xmm3); xmm2 = _mm_load_si128(&tbl6); // load tables xmm3 = _mm_load_si128(&tbl7); xmm2 = _mm_shuffle_epi8(xmm2, xmm0); // table look-up xmm3 = _mm_shuffle_epi8(xmm3, xmm0); xmm0 = _mm_load_si128((__m128i *)output); // read dest 32-bytes xmm1 = _mm_load_si128((__m128i *)output + 1); xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result xmm5 = _mm_xor_si128(xmm5, xmm3); xmm4 = _mm_xor_si128(xmm4, xmm0); xmm5 = _mm_xor_si128(xmm5, xmm1); _mm_store_si128((__m128i *)output, xmm4); // write dest 32-bytes _mm_store_si128((__m128i *)output + 1, xmm5); input += 32; output += 32; bsize -= 32; } } /* static void gf16_ssse3_block32_altmap(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table) { __m128i *src, *dst, *tbl; __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7; src = (__m128i *)input; dst = (__m128i *)output; tbl = (__m128i *)table; // create mask for 16 entries xmm7 = _mm_setzero_si128(); xmm7 = _mm_cmpeq_epi16(xmm7, xmm7); // 0xFFFF *8 xmm7 = _mm_srli_epi16(xmm7, 12); // 0x000F *8 xmm7 = _mm_packus_epi16(xmm7, xmm7); // 0x0F *16 while (bsize != 0){ xmm0 = _mm_load_si128(src); // read source 32-bytes xmm1 = _mm_load_si128(src + 1); xmm3 = _mm_load_si128(&xmm0); // copy source xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit xmm3 = _mm_and_si128(xmm3, xmm7); // src & 0x0F xmm0 = _mm_and_si128(xmm0, xmm7); // (src >> 4) & 0x0F xmm4 = _mm_load_si128(tbl); // load tables xmm5 = _mm_load_si128(tbl + 1); xmm4 = _mm_shuffle_epi8(xmm4, xmm3); // table look-up xmm5 = _mm_shuffle_epi8(xmm5, xmm3); xmm2 = _mm_load_si128(tbl + 2); // load tables xmm3 = _mm_load_si128(tbl + 3); xmm2 = _mm_shuffle_epi8(xmm2, xmm0); // table look-up xmm3 = _mm_shuffle_epi8(xmm3, xmm0); xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result xmm5 = _mm_xor_si128(xmm5, xmm3); xmm0 = _mm_load_si128(&xmm1); // copy source xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit xmm1 = _mm_and_si128(xmm1, xmm7); // src & 0x0F xmm0 = _mm_and_si128(xmm0, xmm7); // (src >> 4) & 0x0F xmm2 = _mm_load_si128(tbl + 4); // load tables xmm3 = _mm_load_si128(tbl + 5); xmm2 = _mm_shuffle_epi8(xmm2, xmm1); // table look-up xmm3 = _mm_shuffle_epi8(xmm3, xmm1); xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result xmm5 = _mm_xor_si128(xmm5, xmm3); xmm2 = _mm_load_si128(tbl + 6); // load tables xmm3 = _mm_load_si128(tbl + 7); xmm2 = _mm_shuffle_epi8(xmm2, xmm0); // table look-up xmm3 = _mm_shuffle_epi8(xmm3, xmm0); xmm0 = _mm_load_si128(dst); // read dest 32-bytes xmm1 = _mm_load_si128(dst + 1); xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result xmm5 = _mm_xor_si128(xmm5, xmm3); xmm4 = _mm_xor_si128(xmm4, xmm0); xmm5 = _mm_xor_si128(xmm5, xmm1); _mm_store_si128(dst, xmm4); // write dest 32-bytes _mm_store_si128(dst + 1, xmm5); src += 2; dst += 2; bsize -= 32; } } */ #endif // 逆行列計算用に掛け算だけする(XORで追加しない) static void gf16_ssse3_block16s(unsigned char *data, unsigned int bsize, unsigned char *table) { __m128i dest, mask, xmm0, xmm1, xmm3, xmm4, xmm5, xmm6; __m128i tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7; // copy tables to local tbl0 = _mm_load_si128((__m128i *)table); tbl1 = _mm_load_si128((__m128i *)table + 1); tbl2 = _mm_load_si128((__m128i *)table + 2); tbl3 = _mm_load_si128((__m128i *)table + 3); tbl4 = _mm_load_si128((__m128i *)table + 4); tbl5 = _mm_load_si128((__m128i *)table + 5); tbl6 = _mm_load_si128((__m128i *)table + 6); tbl7 = _mm_load_si128((__m128i *)table + 7); // create mask for 8 entries mask = _mm_setzero_si128(); mask = _mm_cmpeq_epi16(mask, mask); // 0xFFFF *8 mask = _mm_srli_epi16(mask, 12); // 0x000F *8 while (bsize != 0){ xmm0 = _mm_load_si128((__m128i *)data); // read source 16-bytes xmm3 = _mm_load_si128(&tbl0); // low table xmm4 = _mm_load_si128(&tbl1); // high table xmm1 = _mm_load_si128(&xmm0); // copy source xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit xmm1 = _mm_and_si128(xmm1, mask); // src & 0x000F xmm3 = _mm_shuffle_epi8(xmm3, xmm1); // table look-up xmm1 = _mm_slli_epi16(xmm1, 8); // shift 8-bit for higher table xmm4 = _mm_shuffle_epi8(xmm4, xmm1); xmm5 = _mm_load_si128(&tbl2); // low table xmm6 = _mm_load_si128(&tbl3); // high table dest = _mm_xor_si128(xmm3, xmm4); // combine high and low xmm1 = _mm_load_si128(&xmm0); // copy source xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit xmm1 = _mm_and_si128(xmm1, mask); // src & 0x000F xmm5 = _mm_shuffle_epi8(xmm5, xmm1); // table look-up xmm1 = _mm_slli_epi16(xmm1, 8); // shift 8-bit for higher table xmm6 = _mm_shuffle_epi8(xmm6, xmm1); xmm3 = _mm_load_si128(&tbl4); // low table xmm4 = _mm_load_si128(&tbl5); // high table xmm5 = _mm_xor_si128(xmm5, xmm6); // combine high and low dest = _mm_xor_si128(dest, xmm5); xmm1 = _mm_load_si128(&xmm0); // copy source xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit xmm1 = _mm_and_si128(xmm1, mask); // src & 0x000F xmm3 = _mm_shuffle_epi8(xmm3, xmm1); // table look-up xmm1 = _mm_slli_epi16(xmm1, 8); // shift 8-bit for higher table xmm4 = _mm_shuffle_epi8(xmm4, xmm1); xmm5 = _mm_load_si128(&tbl6); // low table xmm6 = _mm_load_si128(&tbl7); // high table xmm3 = _mm_xor_si128(xmm3, xmm4); // combine high and low dest = _mm_xor_si128(dest, xmm3); xmm5 = _mm_shuffle_epi8(xmm5, xmm0); // table look-up xmm0 = _mm_slli_epi16(xmm0, 8); // shift 8-bit for higher table xmm6 = _mm_shuffle_epi8(xmm6, xmm0); xmm5 = _mm_xor_si128(xmm5, xmm6); // combine high and low dest = _mm_xor_si128(dest, xmm5); _mm_store_si128((__m128i *)data, dest); data += 16; bsize -= 16; } } // 2ブロック同時に計算することで、メモリーへのアクセス回数を減らす // 128バイトのテーブルを2個用意しておくこと // xmm レジスタの数が足りないので、テーブルを毎回ロードする static void gf16_ssse3_block32_altmap2(unsigned char *input1, unsigned char *input2, unsigned char *output, unsigned int bsize, unsigned char *table) { __m128i *tbl; __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, mask; tbl = (__m128i *)table; // create mask for 16 entries mask = _mm_setzero_si128(); mask = _mm_cmpeq_epi16(mask, mask); // 0xFFFF *8 mask = _mm_srli_epi16(mask, 12); // 0x000F *8 mask = _mm_packus_epi16(mask, mask); // 0x0F *16 while (bsize != 0){ xmm0 = _mm_load_si128((__m128i *)input1); // read source 32-bytes xmm1 = _mm_load_si128((__m128i *)input1 + 1); xmm6 = _mm_load_si128(&xmm0); // copy source xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit xmm6 = _mm_and_si128(xmm6, mask); // src & 0x0F xmm0 = _mm_and_si128(xmm0, mask); // (src >> 4) & 0x0F xmm4 = _mm_load_si128(tbl); // load tables xmm5 = _mm_load_si128(tbl + 1); xmm4 = _mm_shuffle_epi8(xmm4, xmm6); // table look-up xmm5 = _mm_shuffle_epi8(xmm5, xmm6); xmm2 = _mm_load_si128(tbl + 2); // load tables xmm3 = _mm_load_si128(tbl + 3); xmm2 = _mm_shuffle_epi8(xmm2, xmm0); // table look-up xmm3 = _mm_shuffle_epi8(xmm3, xmm0); xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result xmm5 = _mm_xor_si128(xmm5, xmm3); xmm0 = _mm_load_si128(&xmm1); // copy source xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit xmm1 = _mm_and_si128(xmm1, mask); // src & 0x0F xmm0 = _mm_and_si128(xmm0, mask); // (src >> 4) & 0x0F xmm2 = _mm_load_si128(tbl + 4); // load tables xmm3 = _mm_load_si128(tbl + 5); xmm2 = _mm_shuffle_epi8(xmm2, xmm1); // table look-up xmm3 = _mm_shuffle_epi8(xmm3, xmm1); xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result xmm5 = _mm_xor_si128(xmm5, xmm3); xmm2 = _mm_load_si128(tbl + 6); // load tables xmm3 = _mm_load_si128(tbl + 7); xmm2 = _mm_shuffle_epi8(xmm2, xmm0); // table look-up xmm3 = _mm_shuffle_epi8(xmm3, xmm0); xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result xmm5 = _mm_xor_si128(xmm5, xmm3); xmm0 = _mm_load_si128((__m128i *)input2); // read source 32-bytes xmm1 = _mm_load_si128((__m128i *)input2 + 1); xmm6 = _mm_load_si128(&xmm0); // copy source xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit xmm6 = _mm_and_si128(xmm6, mask); // src & 0x0F xmm0 = _mm_and_si128(xmm0, mask); // (src >> 4) & 0x0F xmm2 = _mm_load_si128(tbl + 8); // load tables xmm3 = _mm_load_si128(tbl + 9); xmm2 = _mm_shuffle_epi8(xmm2, xmm6); // table look-up xmm3 = _mm_shuffle_epi8(xmm3, xmm6); xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result xmm5 = _mm_xor_si128(xmm5, xmm3); xmm2 = _mm_load_si128(tbl + 10); // load tables xmm3 = _mm_load_si128(tbl + 11); xmm2 = _mm_shuffle_epi8(xmm2, xmm0); // table look-up xmm3 = _mm_shuffle_epi8(xmm3, xmm0); xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result xmm5 = _mm_xor_si128(xmm5, xmm3); xmm0 = _mm_load_si128(&xmm1); // copy source xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit xmm1 = _mm_and_si128(xmm1, mask); // src & 0x0F xmm0 = _mm_and_si128(xmm0, mask); // (src >> 4) & 0x0F xmm2 = _mm_load_si128(tbl + 12); // load tables xmm3 = _mm_load_si128(tbl + 13); xmm2 = _mm_shuffle_epi8(xmm2, xmm1); // table look-up xmm3 = _mm_shuffle_epi8(xmm3, xmm1); xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result xmm5 = _mm_xor_si128(xmm5, xmm3); xmm2 = _mm_load_si128(tbl + 14); // load tables xmm3 = _mm_load_si128(tbl + 15); xmm2 = _mm_shuffle_epi8(xmm2, xmm0); // table look-up xmm3 = _mm_shuffle_epi8(xmm3, xmm0); xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result xmm5 = _mm_xor_si128(xmm5, xmm3); xmm0 = _mm_load_si128((__m128i *)output); // read dest 32-bytes xmm1 = _mm_load_si128((__m128i *)output + 1); xmm0 = _mm_xor_si128(xmm0, xmm4); xmm1 = _mm_xor_si128(xmm1, xmm5); _mm_store_si128((__m128i *)output, xmm0); // write dest 32-bytes _mm_store_si128((__m128i *)output + 1, xmm1); input1 += 32; input2 += 32; output += 32; bsize -= 32; } } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ // AVX2 命令を使うには Windows 7 以降じゃないといけない // _mm256_permute2x128_si256 の control の意味は以下を参照 // http://www.felixcloutier.com/x86/VPERM2I128.html // AVX2 を使って全体を2倍していくと、13% ぐらい速くなる // でも、テーブル作成が少し速くなっても、全体的な速度はほとんど変わらない・・・ static void create_eight_table_avx2(unsigned char *mtab, int factor) { int count; __m128i xmm0, xmm1, xmm2, xmm3, mask8; __m256i ymm0, ymm1, ymm2, ymm3, base, poly, mask16; // create mask for 8-bit mask8 = _mm_setzero_si128(); mask8 = _mm_cmpeq_epi16(mask8, mask8); // 0xFFFF *8 mask8 = _mm_srli_epi16(mask8, 8); // 0x00FF *8 xmm0 = _mm_cvtsi32_si128(factor); // [_][_][_][_][_][_][_][1] xmm1 = _mm_setzero_si128(); factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B); xmm1 = _mm_insert_epi16(xmm1, factor, 1); // [_][_][_][_][_][_][2][_] xmm2 = _mm_setzero_si128(); factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B); xmm2 = _mm_insert_epi16(xmm2, factor, 4); // [_][_][_][4][_][_][_][_] xmm1 = _mm_unpacklo_epi16(xmm1, xmm1); // [_][_][_][_][2][2][_][_] factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B); xmm3 = _mm_cvtsi32_si128(factor); // [_][_][_][_][_][_][_][8] xmm0 = _mm_shufflelo_epi16(xmm0, _MM_SHUFFLE(0, 1, 0, 1)); // [_][_][_][_][1][_][1][_] xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); // [_][_][_][_][_][_][8][8] xmm0 = _mm_xor_si128(xmm0, xmm1); // [_][_][_][_][3][2][1][_] xmm2 = _mm_shufflehi_epi16(xmm2, _MM_SHUFFLE(0, 0, 0, 0)); // [4][4][4][4][_][_][_][_] xmm0 = _mm_unpacklo_epi64(xmm0, xmm0); // [3][2][1][_][3][2][1][_] xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0, 0, 0, 0)); // [8][8][8][8][8][8][8][8] xmm2 = _mm_xor_si128(xmm2, xmm0); // [7][6][5][4][3][2][1][_] xmm3 = _mm_xor_si128(xmm3, xmm2); // [15][14][13][12][11][10][9][8] // 途中で AVX2 命令を使っても遅くならないっぽい poly = _mm256_set1_epi32(0x100B100B); // PRIM_POLY = 0x1100B * 16 mask16 = _mm256_cmpeq_epi16(poly, poly); mask16 = _mm256_srli_epi16(mask16, 8); // 0x00FF *16 base = _mm256_setzero_si256(); base = _mm256_inserti128_si256(base, xmm2, 0); base = _mm256_inserti128_si256(base, xmm3, 1); // ymm レジスタに読み込んでる間にメモリーに書き込んだ方が速い xmm0 = _mm_and_si128(xmm2, mask8); xmm1 = _mm_and_si128(xmm3, mask8); xmm0 = _mm_packus_epi16(xmm0, xmm1); // lower 8-bit * 16 xmm2 = _mm_srli_epi16(xmm2, 8); xmm3 = _mm_srli_epi16(xmm3, 8); xmm2 = _mm_packus_epi16(xmm2, xmm3); // higher 8-bit * 16 _mm_store_si128((__m128i *)mtab , xmm0); _mm_store_si128((__m128i *)mtab + 1, xmm2); for (count = 1; count < 4; count++){ // 全体を2倍する ymm0 = _mm256_slli_epi16(base, 1); ymm1 = _mm256_srai_epi16(base, 15); ymm1 = _mm256_and_si256(ymm1, poly); base = _mm256_xor_si256(ymm1, ymm0); // 全体を2倍する ymm0 = _mm256_slli_epi16(base, 1); ymm1 = _mm256_srai_epi16(base, 15); ymm1 = _mm256_and_si256(ymm1, poly); base = _mm256_xor_si256(ymm1, ymm0); // 全体を2倍する ymm0 = _mm256_slli_epi16(base, 1); ymm1 = _mm256_srai_epi16(base, 15); ymm1 = _mm256_and_si256(ymm1, poly); base = _mm256_xor_si256(ymm1, ymm0); // 全体を2倍する ymm0 = _mm256_slli_epi16(base, 1); ymm1 = _mm256_srai_epi16(base, 15); ymm1 = _mm256_and_si256(ymm1, poly); base = _mm256_xor_si256(ymm1, ymm0); // 並び替えて保存する ymm0 = _mm256_and_si256(base, mask16); // lower 8-bit * 16 ymm1 = _mm256_srli_epi16(base, 8); // higher 8-bit * 16 ymm2 = _mm256_permute2x128_si256(ymm0, ymm1, 0x20); ymm3 = _mm256_permute2x128_si256(ymm0, ymm1, 0x31); ymm0 = _mm256_packus_epi16(ymm2, ymm3); _mm256_store_si256((__m256i *)mtab + count, ymm0); } // AVX-SSE 切り替えの回避 _mm256_zeroupper(); } // 逆行列計算用に掛け算だけする(XORで追加しない) static void gf16_avx2_block32s(unsigned char *data, unsigned int bsize, unsigned char *table) { __m256i tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7; __m256i mask, dest, src0, src1, tmp0, tmp1, tmp2, tmp3; // copy tables to local tmp0 = _mm256_load_si256((__m256i *)table); // tbl0[low0][high0] <- 0x0f[lo][lo] tmp1 = _mm256_load_si256((__m256i *)table + 1); // tbl1[low1][high1] <- 0xf0[lo][lo] tmp2 = _mm256_load_si256((__m256i *)table + 2); // tbl2[low2][high2] <- 0x0f[hi][hi] tmp3 = _mm256_load_si256((__m256i *)table + 3); // tbl3[low3][high3] <- 0xf0[hi][hi] // split to 8 tables tbl0 = _mm256_permute2x128_si256(tmp0, tmp0, 0x00); // tbl0[low0][low0] tbl1 = _mm256_permute2x128_si256(tmp1, tmp1, 0x00); // tbl1[low1][low1] tbl2 = _mm256_permute2x128_si256(tmp2, tmp2, 0x00); // tbl2[low2][low2] tbl3 = _mm256_permute2x128_si256(tmp3, tmp3, 0x00); // tbl3[low3][low3] tbl4 = _mm256_permute2x128_si256(tmp0, tmp0, 0x11); // tbl0[high0][high0] tbl5 = _mm256_permute2x128_si256(tmp1, tmp1, 0x11); // tbl1[high1][high1] tbl6 = _mm256_permute2x128_si256(tmp2, tmp2, 0x11); // tbl2[high2][high2] tbl7 = _mm256_permute2x128_si256(tmp3, tmp3, 0x11); // tbl3[high3][high3] // create mask for 16 entries mask = _mm256_cmpeq_epi16(tmp0, tmp0); // 0xFFFF *16 mask = _mm256_srli_epi16(mask, 12); // 0x000F *16 while (bsize != 0){ src0 = _mm256_load_si256((__m256i *)data); // read source 32-bytes src1 = _mm256_and_si256(src0, mask); // src & 0x0F tmp0 = _mm256_shuffle_epi8(tbl0, src1); // table look-up src1 = _mm256_slli_epi16(src1, 8); // shift 8-bit for higher table tmp1 = _mm256_shuffle_epi8(tbl4, src1); src0 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit dest = _mm256_xor_si256(tmp0, tmp1); // combine high and low src1 = _mm256_and_si256(src0, mask); // src & 0x0F tmp0 = _mm256_shuffle_epi8(tbl1, src1); // table look-up src1 = _mm256_slli_epi16(src1, 8); // shift 8-bit for higher table tmp1 = _mm256_shuffle_epi8(tbl5, src1); dest = _mm256_xor_si256(dest, tmp0); // combine high and low src0 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit dest = _mm256_xor_si256(dest, tmp1); src1 = _mm256_and_si256(src0, mask); // src & 0x0F tmp0 = _mm256_shuffle_epi8(tbl2, src1); // table look-up src1 = _mm256_slli_epi16(src1, 8); // shift 8-bit for higher table tmp1 = _mm256_shuffle_epi8(tbl6, src1); dest = _mm256_xor_si256(dest, tmp0); // combine high and low src0 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit dest = _mm256_xor_si256(dest, tmp1); src1 = _mm256_and_si256(src0, mask); // src & 0x0F tmp0 = _mm256_shuffle_epi8(tbl3, src1); // table look-up src1 = _mm256_slli_epi16(src1, 8); // shift 8-bit for higher table tmp1 = _mm256_shuffle_epi8(tbl7, src1); dest = _mm256_xor_si256(dest, tmp0); // combine high and low dest = _mm256_xor_si256(dest, tmp1); _mm256_store_si256((__m256i *)data, dest); // write dest 32-bytes data += 32; bsize -= 32; } // AVX-SSE 切り替えの回避 _mm256_zeroupper(); } // 逆行列計算用に ALTMAP されてないソースにも対応しておく // Address (input) does not need be 32-byte aligned static void gf16_avx2_block32u(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table) { __m256i tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7; __m256i mask, dest, src0, src1, tmp0, tmp1, tmp2, tmp3; // copy tables to local tmp0 = _mm256_load_si256((__m256i *)table); // tbl0[low0][high0] <- 0x0f[lo][lo] tmp1 = _mm256_load_si256((__m256i *)table + 1); // tbl1[low1][high1] <- 0xf0[lo][lo] tmp2 = _mm256_load_si256((__m256i *)table + 2); // tbl2[low2][high2] <- 0x0f[hi][hi] tmp3 = _mm256_load_si256((__m256i *)table + 3); // tbl3[low3][high3] <- 0xf0[hi][hi] // split to 8 tables tbl0 = _mm256_permute2x128_si256(tmp0, tmp0, 0x00); // tbl0[low0][low0] tbl1 = _mm256_permute2x128_si256(tmp1, tmp1, 0x00); // tbl1[low1][low1] tbl2 = _mm256_permute2x128_si256(tmp2, tmp2, 0x00); // tbl2[low2][low2] tbl3 = _mm256_permute2x128_si256(tmp3, tmp3, 0x00); // tbl3[low3][low3] tbl4 = _mm256_permute2x128_si256(tmp0, tmp0, 0x11); // tbl0[high0][high0] tbl5 = _mm256_permute2x128_si256(tmp1, tmp1, 0x11); // tbl1[high1][high1] tbl6 = _mm256_permute2x128_si256(tmp2, tmp2, 0x11); // tbl2[high2][high2] tbl7 = _mm256_permute2x128_si256(tmp3, tmp3, 0x11); // tbl3[high3][high3] // create mask for 16 entries mask = _mm256_cmpeq_epi16(tmp0, tmp0); // 0xFFFF *16 mask = _mm256_srli_epi16(mask, 12); // 0x000F *16 while (bsize != 0){ src0 = _mm256_loadu_si256((__m256i *)input); // read source 32-bytes dest = _mm256_load_si256((__m256i *)output); // read dest 32-bytes src1 = _mm256_and_si256(src0, mask); // src & 0x0F tmp0 = _mm256_shuffle_epi8(tbl0, src1); // table look-up src1 = _mm256_slli_epi16(src1, 8); // shift 8-bit for higher table tmp1 = _mm256_shuffle_epi8(tbl4, src1); dest = _mm256_xor_si256(dest, tmp0); // combine high and low src0 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit dest = _mm256_xor_si256(dest, tmp1); src1 = _mm256_and_si256(src0, mask); // src & 0x0F tmp0 = _mm256_shuffle_epi8(tbl1, src1); // table look-up src1 = _mm256_slli_epi16(src1, 8); // shift 8-bit for higher table tmp1 = _mm256_shuffle_epi8(tbl5, src1); dest = _mm256_xor_si256(dest, tmp0); // combine high and low src0 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit dest = _mm256_xor_si256(dest, tmp1); src1 = _mm256_and_si256(src0, mask); // src & 0x0F tmp0 = _mm256_shuffle_epi8(tbl2, src1); // table look-up src1 = _mm256_slli_epi16(src1, 8); // shift 8-bit for higher table tmp1 = _mm256_shuffle_epi8(tbl6, src1); dest = _mm256_xor_si256(dest, tmp0); // combine high and low src0 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit dest = _mm256_xor_si256(dest, tmp1); src1 = _mm256_and_si256(src0, mask); // src & 0x0F tmp0 = _mm256_shuffle_epi8(tbl3, src1); // table look-up src1 = _mm256_slli_epi16(src1, 8); // shift 8-bit for higher table tmp1 = _mm256_shuffle_epi8(tbl7, src1); dest = _mm256_xor_si256(dest, tmp0); // combine high and low dest = _mm256_xor_si256(dest, tmp1); _mm256_store_si256((__m256i *)output, dest); // write dest 32-bytes input += 32; output += 32; bsize -= 32; } // AVX-SSE 切り替えの回避 _mm256_zeroupper(); } // テーブルを並び替えて使えば、ループ内の並び替え回数を一回に減らせる static void gf16_avx2_block32(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table) { __m256i tbl0, tbl1, tbl2, tbl3, mask, dest, src0, src1, tmp0, tmp1, tmp2, tmp3; // copy tables to local tmp0 = _mm256_load_si256((__m256i *)table); // tbl0[low0][high0] <- 0x0f[lo][lo] tmp1 = _mm256_load_si256((__m256i *)table + 1); // tbl1[low1][high1] <- 0xf0[lo][lo] tmp2 = _mm256_load_si256((__m256i *)table + 2); // tbl2[low2][high2] <- 0x0f[hi][hi] tmp3 = _mm256_load_si256((__m256i *)table + 3); // tbl3[low3][high3] <- 0xf0[hi][hi] // re-arrange table order (permute より blend の方が速いらしい) tbl0 = _mm256_blend_epi32(tmp0, tmp2, 0xF0); // tbl0[low0][high2] <- 0x0f[lo][hi] tbl1 = _mm256_blend_epi32(tmp1, tmp3, 0xF0); // tbl1[low1][high3] <- 0xf0[lo][hi] tbl2 = _mm256_permute2x128_si256(tmp2, tmp0, 0x03); // tbl2[high0][low2] <- 0x0f[lo][hi] tbl3 = _mm256_permute2x128_si256(tmp3, tmp1, 0x03); // tbl3[high1][low3] <- 0xf0[lo][hi] // create mask for 32 entries mask = _mm256_cmpeq_epi16(tmp0, tmp0); // 0xFFFF *16 mask = _mm256_srli_epi16(mask, 12); // 0x000F *16 mask = _mm256_packus_epi16(mask, mask); // 0x0F *32 while (bsize != 0){ src0 = _mm256_load_si256((__m256i *)input); // read source 32-bytes src1 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit src0 = _mm256_and_si256(src0, mask); // src & 0x0F src1 = _mm256_and_si256(src1, mask); // (src >> 4) & 0x0F tmp0 = _mm256_shuffle_epi8(tbl0, src0); // table look-up tmp1 = _mm256_shuffle_epi8(tbl1, src1); tmp2 = _mm256_shuffle_epi8(tbl2, src0); tmp3 = _mm256_shuffle_epi8(tbl3, src1); tmp0 = _mm256_xor_si256(tmp0, tmp1); // combine result tmp2 = _mm256_xor_si256(tmp2, tmp3); tmp2 = _mm256_permute2x128_si256(tmp2, tmp2, 0x01); // exchange low & high 128-bit dest = _mm256_load_si256((__m256i *)output); // read dest 32-bytes tmp0 = _mm256_xor_si256(tmp0, tmp2); dest = _mm256_xor_si256(dest, tmp0); _mm256_store_si256((__m256i *)output, dest); // write dest 32-bytes input += 32; output += 32; bsize -= 32; } // AVX-SSE 切り替えの回避 _mm256_zeroupper(); } /* // テーブルを並び替えて使えば、ループ内の並び替え回数を減らせる static void gf16_avx2_block32(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table) { __m256i tbl0, tbl1, tbl2, tbl3, mask, dest, src0, src1, src2, src3; // copy tables to local src0 = _mm256_load_si256((__m256i *)table); // tbl0[low0][high0] <- 0x0f[lo][lo] src1 = _mm256_load_si256((__m256i *)table + 1); // tbl1[low1][high1] <- 0xf0[lo][lo] src2 = _mm256_load_si256((__m256i *)table + 2); // tbl2[low2][high2] <- 0x0f[hi][hi] src3 = _mm256_load_si256((__m256i *)table + 3); // tbl3[low3][high3] <- 0xf0[hi][hi] // re-arrange table order tbl0 = _mm256_permute2x128_si256(src0, src2, 0x30); // tblA[low0][high2] <- 0x0f[lo][hi] tbl1 = _mm256_permute2x128_si256(src1, src3, 0x30); // tblB[low1][high3] <- 0xf0[lo][hi] tbl2 = _mm256_permute2x128_si256(src2, src0, 0x30); // tblC[low2][high0] <- 0x0f[hi][lo] tbl3 = _mm256_permute2x128_si256(src3, src1, 0x30); // tblD[low3][high1] <- 0xf0[hi][lo] // create mask for 32 entries mask = _mm256_set1_epi8(0x0F); // 0x0F *32 while (bsize != 0){ src0 = _mm256_load_si256((__m256i *)input); // read source 32-bytes dest = _mm256_load_si256((__m256i *)output); // read dest 32-bytes src1 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit src0 = _mm256_and_si256(src0, mask); // src & 0x0F src1 = _mm256_and_si256(src1, mask); // (src >> 4) & 0x0F src2 = _mm256_permute2x128_si256(src0, src0, 0x01); // exchange low & high 128-bit of "src & 0x0F" src3 = _mm256_permute2x128_si256(src1, src1, 0x01); // exchange low & high 128-bit of "(src >> 4) & 0x0F" src0 = _mm256_shuffle_epi8(tbl0, src0); // table look-up src1 = _mm256_shuffle_epi8(tbl1, src1); src2 = _mm256_shuffle_epi8(tbl2, src2); src3 = _mm256_shuffle_epi8(tbl3, src3); src0 = _mm256_xor_si256(src0, src1); // combine result src2 = _mm256_xor_si256(src2, src3); dest = _mm256_xor_si256(dest, src0); dest = _mm256_xor_si256(dest, src2); _mm256_store_si256((__m256i *)output, dest); // write dest 32-bytes input += 32; output += 32; bsize -= 32; } // AVX-SSE 切り替えの回避 _mm256_zeroupper(); } */ /* // レジスタを大量に使って依存関係をなくせば並列処理できるかも? static void gf16_avx2_block32(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table) { __m256i tbl0, tbl1, tbl2, tbl3, mask, dest, src0, src1, tmp0, tmp1, tmp2, tmp3; // copy tables to local tbl0 = _mm256_load_si256((__m256i *)table); tbl1 = _mm256_load_si256((__m256i *)table + 1); tbl2 = _mm256_load_si256((__m256i *)table + 2); tbl3 = _mm256_load_si256((__m256i *)table + 3); // create mask for 32 entries mask = _mm256_set1_epi8(0x0F); // 0x0F *32 while (bsize != 0){ src0 = _mm256_load_si256((__m256i *)input); // read source 32-bytes dest = _mm256_load_si256((__m256i *)output); // read dest 32-bytes src1 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit src0 = _mm256_and_si256(src0, mask); // src & 0x0F src1 = _mm256_and_si256(src1, mask); // (src >> 4) & 0x0F tmp0 = _mm256_permute2x128_si256(src0, src0, 0x00); // copy low 128-bit to high from "src & 0x0F" tmp1 = _mm256_permute2x128_si256(src1, src1, 0x00); // copy low 128-bit to high from "(src >> 4) & 0x0F" tmp2 = _mm256_permute2x128_si256(src0, src0, 0x11); // copy high 128-bit to low from "src & 0x0F" tmp3 = _mm256_permute2x128_si256(src1, src1, 0x11); // copy high 128-bit to low from "(src >> 4) & 0x0F" tmp0 = _mm256_shuffle_epi8(tbl0, tmp0); // table look-up tmp1 = _mm256_shuffle_epi8(tbl1, tmp1); tmp2 = _mm256_shuffle_epi8(tbl2, tmp2); tmp3 = _mm256_shuffle_epi8(tbl3, tmp3); tmp0 = _mm256_xor_si256(tmp0, tmp1); // combine result tmp2 = _mm256_xor_si256(tmp2, tmp3); dest = _mm256_xor_si256(dest, tmp0); dest = _mm256_xor_si256(dest, tmp2); _mm256_store_si256((__m256i *)output, dest); // write dest 32-bytes input += 32; output += 32; bsize -= 32; } // AVX-SSE 切り替えの回避 _mm256_zeroupper(); } */ // 2ブロック同時に計算することで、メモリーへのアクセス回数を減らす // 128バイトのテーブルを2個用意しておくこと static void gf16_avx2_block32_2(unsigned char *input1, unsigned char *input2, unsigned char *output, unsigned int bsize, unsigned char *table) { __m256i mask, src0, src1, tmp0, tmp1, tmp2, tmp3; __m256i tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7; // copy tables to local tmp0 = _mm256_load_si256((__m256i *)table); // tbl0[low0][high0] <- 0x0f[lo][lo] tmp1 = _mm256_load_si256((__m256i *)table + 1); // tbl1[low1][high1] <- 0xf0[lo][lo] tmp2 = _mm256_load_si256((__m256i *)table + 2); // tbl2[low2][high2] <- 0x0f[hi][hi] tmp3 = _mm256_load_si256((__m256i *)table + 3); // tbl3[low3][high3] <- 0xf0[hi][hi] // re-arrange table order (permute より blend の方が速いらしい) tbl0 = _mm256_blend_epi32(tmp0, tmp2, 0xF0); // tbl0[low0][high2] <- 0x0f[lo][hi] tbl1 = _mm256_blend_epi32(tmp1, tmp3, 0xF0); // tbl1[low1][high3] <- 0xf0[lo][hi] tbl2 = _mm256_permute2x128_si256(tmp2, tmp0, 0x03); // tbl2[high0][low2] <- 0x0f[lo][hi] tbl3 = _mm256_permute2x128_si256(tmp3, tmp1, 0x03); // tbl3[high1][low3] <- 0xf0[lo][hi] tmp0 = _mm256_load_si256((__m256i *)table + 4); tmp1 = _mm256_load_si256((__m256i *)table + 5); tmp2 = _mm256_load_si256((__m256i *)table + 6); tmp3 = _mm256_load_si256((__m256i *)table + 7); tbl4 = _mm256_blend_epi32(tmp0, tmp2, 0xF0); tbl5 = _mm256_blend_epi32(tmp1, tmp3, 0xF0); tbl6 = _mm256_permute2x128_si256(tmp2, tmp0, 0x03); tbl7 = _mm256_permute2x128_si256(tmp3, tmp1, 0x03); // create mask for 32 entries mask = _mm256_cmpeq_epi16(tmp0, tmp0); // 0xFFFF *16 mask = _mm256_srli_epi16(mask, 12); // 0x000F *16 mask = _mm256_packus_epi16(mask, mask); // 0x0F *32 while (bsize != 0){ src0 = _mm256_load_si256((__m256i *)input1); // read source 32-bytes src1 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit src0 = _mm256_and_si256(src0, mask); // src & 0x0F src1 = _mm256_and_si256(src1, mask); // (src >> 4) & 0x0F tmp0 = _mm256_shuffle_epi8(tbl0, src0); // table look-up tmp1 = _mm256_shuffle_epi8(tbl1, src1); tmp2 = _mm256_shuffle_epi8(tbl2, src0); tmp3 = _mm256_shuffle_epi8(tbl3, src1); tmp0 = _mm256_xor_si256(tmp0, tmp1); // combine result tmp2 = _mm256_xor_si256(tmp2, tmp3); src0 = _mm256_load_si256((__m256i *)input2); // read source 32-bytes src1 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit src0 = _mm256_and_si256(src0, mask); // src & 0x0F src1 = _mm256_and_si256(src1, mask); // (src >> 4) & 0x0F tmp1 = _mm256_shuffle_epi8(tbl4, src0); // table look-up tmp3 = _mm256_shuffle_epi8(tbl6, src0); tmp0 = _mm256_xor_si256(tmp0, tmp1); // combine result tmp2 = _mm256_xor_si256(tmp2, tmp3); tmp1 = _mm256_shuffle_epi8(tbl5, src1); // table look-up tmp3 = _mm256_shuffle_epi8(tbl7, src1); tmp0 = _mm256_xor_si256(tmp0, tmp1); // combine result tmp2 = _mm256_xor_si256(tmp2, tmp3); src0 = _mm256_load_si256((__m256i *)output); // read dest 32-bytes tmp2 = _mm256_permute2x128_si256(tmp2, tmp2, 0x01); // exchange low & high 128-bit src0 = _mm256_xor_si256(src0, tmp0); src0 = _mm256_xor_si256(src0, tmp2); _mm256_store_si256((__m256i *)output, src0); // write dest 32-bytes input1 += 32; input2 += 32; output += 32; bsize -= 32; } // AVX-SSE 切り替えの回避 _mm256_zeroupper(); } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ // データを並び替えることで、メモリーアクセスを高速化する void galois_altmap32_change(unsigned char *data, unsigned int bsize) { __m128i xmm0, xmm1, xmm2, xmm3, mask; mask = _mm_setzero_si128(); mask = _mm_cmpeq_epi16(mask, mask); // 0xFFFF *8 mask = _mm_srli_epi16(mask, 8); // 0x00FF *8 while (bsize != 0){ xmm1 = _mm_load_si128((__m128i *)data); // read 32-bytes xmm3 = _mm_load_si128((__m128i *)data + 1); xmm0 = _mm_and_si128(xmm1, mask); // erase higher byte xmm2 = _mm_and_si128(xmm3, mask); xmm1 = _mm_srli_epi16(xmm1, 8); // move higher byte to lower xmm3 = _mm_srli_epi16(xmm3, 8); xmm0 = _mm_packus_epi16(xmm0, xmm2); // select lower byte of each word xmm1 = _mm_packus_epi16(xmm1, xmm3); // select higher byte of each word _mm_store_si128((__m128i *)data, xmm0); // write 32-bytes _mm_store_si128((__m128i *)data + 1, xmm1); data += 32; bsize -= 32; } } // データの並びを元に戻す void galois_altmap32_return(unsigned char *data, unsigned int bsize) { __m128i xmm0, xmm1, xmm2; while (bsize != 0){ xmm1 = _mm_load_si128((__m128i *)data); // read 32-bytes xmm2 = _mm_load_si128((__m128i *)data + 1); xmm0 = _mm_unpacklo_epi8(xmm1, xmm2); // interleave lower and higher bytes xmm1 = _mm_unpackhi_epi8(xmm1, xmm2); _mm_store_si128((__m128i *)data, xmm0); // write 32-bytes _mm_store_si128((__m128i *)data + 1, xmm1); data += 32; bsize -= 32; } } // 並び替えない場合 void galois_altmap_none(unsigned char *data, unsigned int bsize) { } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ /* from ParPar; "gf_w16_additions.c" gf_w16_xor_lazy_sse_altmap_multiply_region gf_w16_xor_lazy_sse_jit_altmap_multiply_region */ // 256バイトごとにビット単位の XOR で計算する方法 // input と output の領域は重ならないようにすること static void gf16_sse2_block256(unsigned char *input, unsigned char *output, unsigned int bsize, int factor) { unsigned int i, bit; unsigned int counts[16]; uintptr_t deptable[16][16]; __m128i depmask1, depmask2, polymask1, polymask2, addvals1, addvals2; unsigned short tmp_depmask[16]; // calculate dependent bits addvals1 = _mm_set_epi16(1<< 7, 1<< 6, 1<< 5, 1<< 4, 1<< 3, 1<< 2, 1<<1, 1<<0); addvals2 = _mm_set_epi16(1<<15, 1<<14, 1<<13, 1<<12, 1<<11, 1<<10, 1<<9, 1<<8); // duplicate each bit in the polynomial 16 times polymask2 = _mm_set1_epi16(PRIM_POLY & 0xFFFF); // chop off top bit, although not really necessary polymask1 = _mm_and_si128(polymask2, _mm_set_epi16(1<< 8, 1<< 9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, 1<<15)); polymask2 = _mm_and_si128(polymask2, _mm_set_epi16(1<< 0, 1<< 1, 1<< 2, 1<< 3, 1<< 4, 1<< 5, 1<< 6, 1<< 7)); polymask1 = _mm_cmpeq_epi16(_mm_setzero_si128(), polymask1); polymask2 = _mm_cmpeq_epi16(_mm_setzero_si128(), polymask2); if (factor & (1<<15)){ // XOR depmask1 = addvals1; depmask2 = addvals2; } else { depmask1 = _mm_setzero_si128(); depmask2 = _mm_setzero_si128(); } for (i = (1<<14); i; i >>= 1){ // rotate __m128i last = _mm_shuffle_epi32(_mm_shufflelo_epi16(depmask1, 0), 0); depmask1 = _mm_insert_epi16( _mm_srli_si128(depmask1, 2), _mm_extract_epi16(depmask2, 0), 7 ); depmask2 = _mm_srli_si128(depmask2, 2); // XOR poly depmask1 = _mm_xor_si128(depmask1, _mm_andnot_si128(polymask1, last)); depmask2 = _mm_xor_si128(depmask2, _mm_andnot_si128(polymask2, last)); if (factor & i){ // XOR depmask1 = _mm_xor_si128(depmask1, addvals1); depmask2 = _mm_xor_si128(depmask2, addvals2); } } // generate needed tables _mm_storeu_si128((__m128i*)(tmp_depmask), depmask1); _mm_storeu_si128((__m128i*)(tmp_depmask + 8), depmask2); for (bit = 0; bit < 16; bit++){ unsigned int cnt = 0; for (i = 0; i < 16; i++){ if (tmp_depmask[bit] & (1<>= 1){ // rotate __m128i last = _mm_shuffle_epi32(_mm_shufflelo_epi16(depmask1, 0), 0); depmask1 = _mm_insert_epi16( _mm_srli_si128(depmask1, 2), _mm_extract_epi16(depmask2, 0), 7 ); depmask2 = _mm_srli_si128(depmask2, 2); // XOR poly depmask1 = _mm_xor_si128(depmask1, _mm_andnot_si128(polymask1, last)); depmask2 = _mm_xor_si128(depmask2, _mm_andnot_si128(polymask2, last)); if (factor & i){ // XOR depmask1 = _mm_xor_si128(depmask1, addvals1); depmask2 = _mm_xor_si128(depmask2, addvals2); } } // attempt to remove some redundant XOR ops with a simple heuristic // heuristic: we just find common XOR elements between bit pairs { __m128i tmp1, tmp2; // first, we need to re-arrange words so that we can perform bitwise AND on neighbouring pairs // unfortunately, PACKUSDW is SSE4.1 only, so emulate it with shuffles // 01234567 -> 02461357 tmp1 = _mm_shuffle_epi32( _mm_shufflelo_epi16( _mm_shufflehi_epi16(depmask1, 0xD8), /* 0xD8 == 0b11011000 */ 0xD8 ), 0xD8 ); tmp2 = _mm_shuffle_epi32( _mm_shufflelo_epi16( _mm_shufflehi_epi16(depmask2, 0xD8), 0xD8 ), 0xD8 ); common_mask = _mm_and_si128( // [02461357, 8ACE9BDF] -> [02468ACE, 13579BDF] _mm_unpacklo_epi64(tmp1, tmp2), _mm_unpackhi_epi64(tmp1, tmp2) ); // we have the common elements between pairs, but it doesn't make sense to process a separate queue if there's only one common element (0 XORs), so eliminate those common_mask = _mm_andnot_si128(_mm_cmpeq_epi16( _mm_setzero_si128(), // "(v & (v-1)) == 0" is true if only zero/one bit is set in each word _mm_and_si128(common_mask, _mm_sub_epi16(common_mask, _mm_set1_epi16(1))) ), common_mask); // we now have a common elements mask without 1-bit words, just simply merge stuff in depmask1 = _mm_xor_si128(depmask1, _mm_unpacklo_epi16(common_mask, common_mask)); depmask2 = _mm_xor_si128(depmask2, _mm_unpackhi_epi16(common_mask, common_mask)); _mm_storeu_si128((__m128i*)common_depmask, common_mask); } _mm_storeu_si128((__m128i*)(tmp_depmask), depmask1); _mm_storeu_si128((__m128i*)(tmp_depmask + 8), depmask2); // Multi-threading だとスレッドごとに実行領域を分離しないとアクセス違反エラーが発生する thread_id = GetCurrentThreadId(); // 自分のスレッド ID を取得する for (j = 0; j < MAX_CPU; j++){ // 対応するスレッド個数は MAX_CPU 個まで if (jit_id[j] == thread_id) break; } if (j == MAX_CPU){ // 初期状態では jit_code 内は全て 0 なので jit_id も 0 だけ for (j = 0; j < MAX_CPU; j++){ if (InterlockedCompareExchange(jit_id + j, thread_id, 0) == 0) // 0と置き換えたなら break; } } jit_exec = jit_code + 4096 * j; jit_ptr = jit_exec; #ifdef _WIN64 _jit_push(&jit_ptr, BP); _jit_mov_r(&jit_ptr, BP, SP); // align pointer (avoid SP because stuff is encoded differently with it) _jit_mov_r(&jit_ptr, AX, SP); _jit_and_i(&jit_ptr, AX, 0xF); _jit_sub_r(&jit_ptr, BP, AX); // make Windows happy and save XMM6-15 registers // ideally should be done by this function, not JIT code, but MSVC has a convenient policy of no inline ASM for (i = 6; i < 16; i++) _jit_movaps_store(&jit_ptr, BP, -((int32_t)i-5)*16, (uint8_t)i); #endif // adding 128 to the destination pointer allows the register offset to be coded in 1 byte // eg: 'movdqa xmm0, [rdx+0x90]' is 8 bytes, whilst 'movdqa xmm0, [rdx-0x60]' is 5 bytes _jit_mov_i(&jit_ptr, AX, (intptr_t)input + 128); _jit_mov_i(&jit_ptr, DX, (intptr_t)output + 128); _jit_mov_i(&jit_ptr, CX, (intptr_t)output + bsize + 128); _jit_align32(&jit_ptr); pos_startloop = jit_ptr; //_jit_movaps_load(reg, xreg, offs) // (we just save a conditional by hardcoding this) #define _LD_APS(xreg, mreg, offs) \ *(int32_t*)(jit_ptr) = 0x40280F + ((xreg) <<19) + ((mreg) <<16) + (((offs)&0xFF) <<24); \ jit_ptr += 4 #define _ST_APS(mreg, offs, xreg) \ *(int32_t*)(jit_ptr) = 0x40290F + ((xreg) <<19) + ((mreg) <<16) + (((offs)&0xFF) <<24); \ jit_ptr += 4 #define _LD_APS64(xreg, mreg, offs) \ *(int64_t*)(jit_ptr) = 0x40280F44 + ((xreg-8) <<27) + ((mreg) <<24) + ((int64_t)((offs)&0xFF) <<32); \ jit_ptr += 5 #define _ST_APS64(mreg, offs, xreg) \ *(int64_t*)(jit_ptr) = 0x40290F44 + ((xreg-8) <<27) + ((mreg) <<24) + ((int64_t)((offs)&0xFF) <<32); \ jit_ptr += 5 #ifdef _WIN64 #define _LD_DQA(xreg, mreg, offs) \ *(int64_t*)(jit_ptr) = 0x406F0F66 + ((xreg) <<27) + ((mreg) <<24) + ((int64_t)((offs)&0xFF) <<32); \ jit_ptr += 5 #define _ST_DQA(mreg, offs, xreg) \ *(int64_t*)(jit_ptr) = 0x407F0F66 + ((xreg) <<27) + ((mreg) <<24) + ((int64_t)((offs)&0xFF) <<32); \ jit_ptr += 5 #else #define _LD_DQA(xreg, mreg, offs) \ *(int32_t*)(jit_ptr) = 0x406F0F66 + ((xreg) <<27) + ((mreg) <<24); \ *(jit_ptr +4) = (uint8_t)((offs)&0xFF); \ jit_ptr += 5 #define _ST_DQA(mreg, offs, xreg) \ *(int32_t*)(jit_ptr) = 0x407F0F66 + ((xreg) <<27) + ((mreg) <<24); \ *(jit_ptr +4) = (uint8_t)((offs)&0xFF); \ jit_ptr += 5 #endif #define _LD_DQA64(xreg, mreg, offs) \ *(int64_t*)(jit_ptr) = 0x406F0F4466 + ((int64_t)(xreg-8) <<35) + ((int64_t)(mreg) <<32) + ((int64_t)((offs)&0xFF) <<40); \ jit_ptr += 6 #define _ST_DQA64(mreg, offs, xreg) \ *(int64_t*)(jit_ptr) = 0x407F0F4466 + ((int64_t)(xreg-8) <<35) + ((int64_t)(mreg) <<32) + ((int64_t)((offs)&0xFF) <<40); \ jit_ptr += 6 //_jit_xorps_m(reg, AX, offs<<4); #define _XORPS_M_(reg, offs, tr) \ *(int32_t*)(jit_ptr) = (0x40570F + ((reg) << 19) + (((offs)&0xFF) <<28)) ^ (tr) #define _C_XORPS_M(reg, offs, c) \ _XORPS_M_(reg, offs, 0); \ jit_ptr += (c)<<2 #define _XORPS_M64_(reg, offs, tr) \ *(int64_t*)(jit_ptr) = (0x40570F44 + (((reg)-8) << 27) + ((int64_t)((offs)&0xFF) <<36)) ^ ((tr)<<8) #define _C_XORPS_M64(reg, offs, c) \ _XORPS_M64_(reg, offs, 0); \ jit_ptr += ((c)<<2)+(c) //_jit_pxor_m(1, AX, offs<<4); #ifdef _WIN64 #define _PXOR_M_(reg, offs, tr) \ *(int64_t*)(jit_ptr) = (0x40EF0F66 + ((reg) << 27) + ((int64_t)((offs)&0xFF) << 36)) ^ (tr) #else #define _PXOR_M_(reg, offs, tr) \ *(int32_t*)(jit_ptr) = (0x40EF0F66 + ((reg) << 27)) ^ (tr); \ *(jit_ptr +4) = (uint8_t)(((offs)&0xFF) << 4) #endif #define _PXOR_M(reg, offs) \ _PXOR_M_(reg, offs, 0); \ jit_ptr += 5 #define _C_PXOR_M(reg, offs, c) \ _PXOR_M_(reg, offs, 0); \ jit_ptr += ((c)<<2)+(c) #define _PXOR_M64_(reg, offs, tr) \ *(int64_t*)(jit_ptr) = (0x40EF0F4466 + ((int64_t)((reg)-8) << 35) + ((int64_t)((offs)&0xFF) << 44)) ^ ((tr)<<8) #define _C_PXOR_M64(reg, offs, c) \ _PXOR_M64_(reg, offs, 0); \ jit_ptr += ((c)<<2)+((c)<<1) //_jit_xorps_r(r2, r1) #define _XORPS_R_(r2, r1, tr) \ *(int32_t*)(jit_ptr) = (0xC0570F + ((r2) <<19) + ((r1) <<16)) ^ (tr) #define _XORPS_R(r2, r1) \ _XORPS_R_(r2, r1, 0); \ jit_ptr += 3 #define _C_XORPS_R(r2, r1, c) \ _XORPS_R_(r2, r1, 0); \ jit_ptr += ((c)<<1)+(c) // r2 is always < 8, r1 here is >= 8 #define _XORPS_R64_(r2, r1, tr) \ *(int32_t*)(jit_ptr) = (0xC0570F41 + ((r2) <<27) + ((r1) <<24)) ^ ((tr)<<8) #define _C_XORPS_R64(r2, r1, c) \ _XORPS_R64_(r2, r1, 0); \ jit_ptr += (c)<<2 //_jit_pxor_r(r2, r1) #define _PXOR_R_(r2, r1, tr) \ *(int32_t*)(jit_ptr) = (0xC0EF0F66 + ((r2) <<27) + ((r1) <<24)) ^ (tr) #define _PXOR_R(r2, r1) \ _PXOR_R_(r2, r1, 0); \ jit_ptr += 4 #define _C_PXOR_R(r2, r1, c) \ _PXOR_R_(r2, r1, 0); \ jit_ptr += (c)<<2 #define _PXOR_R64_(r2, r1, tr) \ *(int64_t*)(jit_ptr) = (0xC0EF0F4166 + ((int64_t)(r2) <<35) + ((int64_t)(r1) <<32)) ^ (((int64_t)tr)<<8) #define _C_PXOR_R64(r2, r1, c) \ _PXOR_R64_(r2, r1, 0); \ jit_ptr += ((c)<<2)+(c) // optimised mix of xor/mov operations #define _MOV_OR_XOR_FP_M(reg, offs, flag, c) \ _XORPS_M_(reg, offs, flag); \ flag &= (c)-1; \ jit_ptr += (c)<<2 #define _MOV_OR_XOR_FP_M64(reg, offs, flag, c) \ _XORPS_M64_(reg, offs, flag); \ flag &= (c)-1; \ jit_ptr += ((c)<<2)+(c) #define _MOV_OR_XOR_FP_INIT (0x570F ^ 0x280F) #define _MOV_OR_XOR_INT_M(reg, offs, flag, c) \ _PXOR_M_(reg, offs, flag); \ flag &= (c)-1; \ jit_ptr += ((c)<<2)+(c) #define _MOV_OR_XOR_INT_M64(reg, offs, flag, c) \ _PXOR_M64_(reg, offs, flag); \ flag &= (c)-1; \ jit_ptr += ((c)<<2)+((c)<<1) #define _MOV_OR_XOR_INT_INIT (0xEF0F00 ^ 0x6F0F00) #define _MOV_OR_XOR_R_FP(r2, r1, flag, c) \ _XORPS_R_(r2, r1, flag); \ flag &= (c)-1; \ jit_ptr += ((c)<<1)+(c) #define _MOV_OR_XOR_R64_FP(r2, r1, flag, c) \ _XORPS_R64_(r2, r1, flag); \ flag &= (c)-1; \ jit_ptr += (c)<<2 #define _MOV_OR_XOR_R_INT(r2, r1, flag, c) \ _PXOR_R_(r2, r1, flag); \ flag &= (c)-1; \ jit_ptr += (c)<<2 #define _MOV_OR_XOR_R64_INT(r2, r1, flag, c) \ _PXOR_R64_(r2, r1, flag); \ flag &= (c)-1; \ jit_ptr += ((c)<<2)+(c) // generate code #ifdef _WIN64 // preload upper 13 inputs into registers #define _XORS_FROM_MEMORY 3 for (inBit = 3; inBit < 8; inBit++){ _LD_APS(inBit, AX, (inBit-8)<<4); } for (; inBit<16; inBit++){ _LD_APS64(inBit, AX, (inBit-8)<<4); } #else // can only fit 5 in 32-bit mode :( #define _XORS_FROM_MEMORY 11 for (inBit = 3; inBit < 8; inBit++){ // despite appearances, we're actually loading the top 5, not mid 5 _LD_APS(inBit, AX, inBit<<4); } #endif for (bit = 0; bit < 16; bit += 2){ int destOffs = (int)((bit<<4)-128); FAST_U32 movC = _MOV_OR_XOR_INT_INIT; FAST_U16 mask1 = tmp_depmask[bit], mask2 = tmp_depmask[bit+1], maskC = common_depmask[bit>>1]; _LD_APS(0, DX, destOffs); _LD_DQA(1, DX, destOffs+16); for (inBit = -8; inBit < (_XORS_FROM_MEMORY-8); inBit++){ _MOV_OR_XOR_INT_M(2, inBit, movC, maskC & 1); _C_XORPS_M(0, inBit, mask1 & 1); _C_PXOR_M(1, inBit, mask2 & 1); mask1 >>= 1; mask2 >>= 1; maskC >>= 1; } // at least 5 can come from registers for (inBit = 3; inBit < 8; inBit++){ _MOV_OR_XOR_R_INT(2, inBit, (int32_t)movC, maskC & 1); _C_XORPS_R(0, inBit, mask1 & 1); _C_PXOR_R(1, inBit, mask2 & 1); mask1 >>= 1; mask2 >>= 1; maskC >>= 1; } #ifdef _WIN64 // more XORs can come from 64-bit registers for (inBit = 0; inBit < 8; inBit++){ _MOV_OR_XOR_R64_INT(2, inBit, movC, maskC & 1); _C_XORPS_R64(0, inBit, mask1 & 1); _C_PXOR_R64(1, inBit, mask2 & 1); mask1 >>= 1; mask2 >>= 1; maskC >>= 1; } #endif if (!movC){ _XORPS_R(0, 2); _PXOR_R(1, 2); // penalty? } _ST_APS(DX, destOffs, 0); _ST_DQA(DX, destOffs+16, 1); } #undef _XORS_FROM_MEMORY _jit_add_i(&jit_ptr, AX, 256); _jit_add_i(&jit_ptr, DX, 256); _jit_cmp_r(&jit_ptr, DX, CX); _jit_jcc(&jit_ptr, JL, pos_startloop); #ifdef _WIN64 for (i = 6; i < 16; i++) _jit_movaps_load(&jit_ptr, (uint8_t)i, BP, -((int32_t)i-5)*16); _jit_pop(&jit_ptr, BP); #endif _jit_ret(&jit_ptr); // exec (*(void(*)(void))jit_exec)(); } /* XOR method is based on ParPar. https://github.com/animetosho/ParPar/ */ // bit 単位で XOR するには、256バイトごとに並び替える void galois_altmap256_change(unsigned char *data, unsigned int bsize) { unsigned short dtmp[128]; int i, j; __m128i ta, tb, lmask, th, tl; lmask = _mm_set1_epi16(0xff); while (bsize != 0){ for (j = 0; j < 8; j++){ ta = _mm_load_si128((__m128i *)data); // read 32-bytes tb = _mm_load_si128((__m128i *)data + 1); // split to high/low parts th = _mm_packus_epi16(_mm_srli_epi16(tb, 8), _mm_srli_epi16(ta, 8)); tl = _mm_packus_epi16(_mm_and_si128(tb, lmask), _mm_and_si128(ta, lmask)); // save to dest by extracting 16-bit masks dtmp[0 + j] = _mm_movemask_epi8(th); for (i = 1; i < 8; i++){ th = _mm_slli_epi16(th, 1); // byte shift would be nicer, but ultimately doesn't matter here dtmp[i*8 + j] = _mm_movemask_epi8(th); } dtmp[64 + j] = _mm_movemask_epi8(tl); for (i = 1; i < 8; i++){ tl = _mm_slli_epi16(tl, 1); dtmp[64 + i*8 + j] = _mm_movemask_epi8(tl); } data += 32; } // we only really need to copy temp -> dest memcpy(data - 256, dtmp, 256); bsize -= 256; } } void galois_altmap256_return(unsigned char *data, unsigned int bsize) { unsigned short dtmp[128]; int i, j; __m128i ta, tb, lmask, th, tl; th = _mm_setzero_si128(); // shut up compiler warning tl = _mm_setzero_si128(); lmask = _mm_set1_epi16(0xff); while (bsize != 0){ for (j = 0; j < 8; j++){ // load in pattern: [0011223344556677] [8899AABBCCDDEEFF] tl = _mm_insert_epi16(tl, *(int *)(data + 240), 0); th = _mm_insert_epi16(th, *(int *)(data + 112), 0); tl = _mm_insert_epi16(tl, *(int *)(data + 224), 1); th = _mm_insert_epi16(th, *(int *)(data + 96), 1); tl = _mm_insert_epi16(tl, *(int *)(data + 208), 2); th = _mm_insert_epi16(th, *(int *)(data + 80), 2); tl = _mm_insert_epi16(tl, *(int *)(data + 192), 3); th = _mm_insert_epi16(th, *(int *)(data + 64), 3); tl = _mm_insert_epi16(tl, *(int *)(data + 176), 4); th = _mm_insert_epi16(th, *(int *)(data + 48), 4); tl = _mm_insert_epi16(tl, *(int *)(data + 160), 5); th = _mm_insert_epi16(th, *(int *)(data + 32), 5); tl = _mm_insert_epi16(tl, *(int *)(data + 144), 6); th = _mm_insert_epi16(th, *(int *)(data + 16), 6); tl = _mm_insert_epi16(tl, *(int *)(data + 128), 7); th = _mm_insert_epi16(th, *(int *)data, 7); // swizzle to [0123456789ABCDEF] [0123456789ABCDEF] ta = _mm_packus_epi16(_mm_srli_epi16(tl, 8),_mm_srli_epi16(th, 8)); tb = _mm_packus_epi16(_mm_and_si128(tl, lmask),_mm_and_si128(th, lmask)); // extract top bits dtmp[j*16 + 7] = _mm_movemask_epi8(ta); dtmp[j*16 + 15] = _mm_movemask_epi8(tb); for (i = 1; i < 8; i++){ ta = _mm_slli_epi16(ta, 1); tb = _mm_slli_epi16(tb, 1); dtmp[j*16 + 7-i] = _mm_movemask_epi8(ta); dtmp[j*16 + 15-i] = _mm_movemask_epi8(tb); } data += 2; } // we only really need to copy temp -> dest memcpy(data - 16, dtmp, 256); data += 240; bsize -= 256; } } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ // 複数のテーブルを使って掛け算する void galois_region_multiply( unsigned short *r1, // Region to multiply unsigned short *r2, // Products go here unsigned int count, // Count of number in short int factor) // Number to multiply by { if (factor <= 1){ if (factor == 0) return; // アドレスが 4の倍数で無い場合は 4バイト単位で計算する効率が落ちる if ((ULONG_PTR)r2 & 2){ // そこで最初の 1個(2バイト)だけ普通に計算する *r2 ^= *r1; r1++; r2++; count--; } while (count >= 2){ // 2個(4バイト)ずつ計算する ((unsigned int *)r2)[0] ^= ((unsigned int *)r1)[0]; r1 += 2; r2 += 2; count -= 2; } if (count == 1) // 最後に余ったやつだけ普通に計算する *r2 ^= *r1; return; } if (count >= 64){ // 64バイト以上なら掛け算用のテーブルを使った方が速い #ifndef NO_SIMD if (cpu_flag & 16){ // AVX2 対応なら __declspec( align(32) ) unsigned char small_table[128]; int s, d; create_eight_table_avx2(small_table, factor); // アドレスが 32の倍数で無い場合は 32バイト単位で計算する効率が落ちる while ((ULONG_PTR)r2 & 0x1E){ // そこで最初の 1~15個(2~30バイト)だけ普通に計算する s = r1[0]; d = r2[0]; d ^= small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8); r2[0] = (unsigned short)d; r1++; r2++; count--; } // 16個ずつ計算するので 16の倍数にする gf16_avx2_block32u((unsigned char *)r1, (unsigned char *)r2, (count & 0xFFFFFFF0) << 1, small_table); r1 += count & 0xFFFFFFF0; r2 += count & 0xFFFFFFF0; count &= 15; // 残りは 1個ずつ計算する while (count != 0){ s = r1[0]; d = r2[0]; d ^= small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8); r2[0] = (unsigned short)d; r1++; r2++; count--; } } else if (cpu_flag & 1){ // SSSE3 対応なら __declspec( align(16) ) unsigned char small_table[128]; int s, d; create_eight_table(small_table, factor); // アドレスが 16の倍数で無い場合は 16バイト単位で計算する効率が落ちる while ((ULONG_PTR)r2 & 0xE){ // そこで最初の 1~7個(2~14バイト)だけ普通に計算する s = r1[0]; d = r2[0]; d ^= small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8); r2[0] = (unsigned short)d; r1++; r2++; count--; } if (sse_unit == 16){ // 8個ずつ計算するので 8の倍数にする gf16_ssse3_block16u((unsigned char *)r1, (unsigned char *)r2, (count & 0xFFFFFFF8) << 1, small_table); r1 += count & 0xFFFFFFF8; r2 += count & 0xFFFFFFF8; count &= 7; } else { // 16個ずつ計算するので 16の倍数にする gf16_ssse3_block32u((unsigned char *)r1, (unsigned char *)r2, (count & 0xFFFFFFF0) << 1, small_table); r1 += count & 0xFFFFFFF0; r2 += count & 0xFFFFFFF0; count &= 15; } // 残りは 1個ずつ計算する while (count != 0){ s = r1[0]; d = r2[0]; d ^= small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8); r2[0] = (unsigned short)d; r1++; r2++; count--; } } else { // Combined Multi Table support (2 tables of 256-entries) #endif unsigned int mtab[256 * 2]; create_two_table(mtab, factor); // build combined multiplication tables // アドレスが 8の倍数で無い場合は 8バイト単位で計算する効率が落ちる while ((ULONG_PTR)r2 & 6){ // そこで最初の 1~3個(2~6バイト)だけ普通に計算する r2[0] ^= mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]]; r1++; r2++; count--; } #ifndef _WIN64 // 32-bit 版なら MMX を使う #ifndef NO_SIMD // 4個(8バイト)ずつ計算するので 4の倍数にする DoBlock8((unsigned char *)r1, (unsigned char *)r2, (count & 0xFFFFFFFC) << 1, mtab); r1 += count & 0xFFFFFFFC; r2 += count & 0xFFFFFFFC; count &= 3; #else // MMX を使わないなら // バッファーを 32-bit整数として扱う while (count >= 2){ // 2個(4バイト)ずつ計算する // 先に計算しておいた 2個の参照テーブルを使う ((unsigned int *)r2)[0] ^= mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]] ^ ((mtab[((unsigned char *)r1)[2]] ^ mtab[256 + ((unsigned char *)r1)[3]]) << 16); r1 += 2; r2 += 2; count -= 2; } #endif #else // 64-bit 版なら 64-bit 整数を使う // バッファーを 64-bit整数として扱う while (count >= 4){ // 4個(8バイト)ずつ計算する // 先に計算しておいた 2個の参照テーブルを使う ((unsigned __int64 *)r2)[0] ^= mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]] ^ ((mtab[((unsigned char *)r1)[2]] ^ mtab[256 + ((unsigned char *)r1)[3]]) << 16) ^ ((unsigned __int64)(mtab[((unsigned char *)r1)[4]] ^ mtab[256 + ((unsigned char *)r1)[5]]) << 32) ^ ((unsigned __int64)(mtab[((unsigned char *)r1)[6]] ^ mtab[256 + ((unsigned char *)r1)[7]]) << 48); r1 += 4; r2 += 4; count -= 4; } #endif // 残りは 1個ずつ計算する while (count != 0){ r2[0] ^= mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]]; r1++; r2++; count--; } #ifndef NO_SIMD } #endif } else { // 小さいデータは普通に計算する int log_y = galois_log_table[factor]; while (count != 0){ r2[0] ^= galois_multiply_fix(r1[0], log_y); r1++; r2++; count--; } } } // 行列の割り算用、微妙に速いけど大差無し void galois_region_divide( unsigned short *r1, // Region to divide. products go here unsigned int count, // Count of number in short int factor) // Number to divide by { factor = galois_reciprocal(factor); // factor = 1 / factor if (count >= 64){ // 行列サイズが小さいのでテーブル作成に時間がかかって、全く速くならない・・・ /* #ifndef NO_SIMD if (cpu_flag & 16){ // AVX2 対応なら __declspec( align(32) ) unsigned char small_table[128]; int s, d; create_eight_table_avx2(small_table, factor); // アドレスが 32の倍数で無い場合は 32バイト単位で計算する効率が落ちる while ((ULONG_PTR)r1 & 0x1E){ // そこで最初の 1~15個(2~30バイト)だけ普通に計算する s = r1[0]; d = small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8); r1[0] = (unsigned short)d; r1++; count--; } // 16個ずつ計算するので 16の倍数にする gf16_avx2_block32s((unsigned char *)r1, (count & 0xFFFFFFF0) << 1, small_table); r1 += count & 0xFFFFFFF0; count &= 15; // 残りは 1個ずつ計算する while (count != 0){ s = r1[0]; d = small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8); r1[0] = (unsigned short)d; r1++; count--; } } else if (cpu_flag & 1){ // SSSE3 対応なら __declspec( align(16) ) unsigned char small_table[128]; int s, d; create_eight_table(small_table, factor); // アドレスが 16の倍数で無い場合は 16バイト単位で計算する効率が落ちる while ((ULONG_PTR)r1 & 0xE){ // そこで最初の 1~7個(2~14バイト)だけ普通に計算する s = r1[0]; d = small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8); r1[0] = (unsigned short)d; r1++; count--; } // 8個ずつ計算するので 8の倍数にする gf16_ssse3_block16s((unsigned char *)r1, (count & 0xFFFFFFF8) << 1, small_table); r1 += count & 0xFFFFFFF8; count &= 7; // 残りは 1個ずつ計算する while (count != 0){ s = r1[0]; d = small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8); s = s >> 4; d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8); r1[0] = (unsigned short)d; r1++; count--; } } else { // Combined Multi Table support (2 tables of 256-entries) #endif */ unsigned int mtab[256 * 2]; create_two_table(mtab, factor); // 掛け算用のテーブルをその場で構成する // アドレスが 4の倍数で無い場合は 4バイト単位で計算する効率が落ちる if (((ULONG_PTR)r1 & 2) != 0){ // そこで最初の 1個(2バイト)だけ普通に計算する r1[0] = (unsigned short)(mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]]); r1++; count--; } // バッファーを 32-bit整数として扱う while (count >= 2){ // 2個(4バイト)ずつ計算する // 先に計算しておいた 2個の参照テーブルを使う ((unsigned int *)r1)[0] = mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]] ^ ((mtab[((unsigned char *)r1)[2]] ^ mtab[256 + ((unsigned char *)r1)[3]]) << 16); r1 += 2; count -= 2; } // 奇数なら最後に 1個余る if (count == 1) r1[0] = (unsigned short)(mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]]); /* #ifndef NO_SIMD } #endif */ } else { // 小さいデータは普通に計算する int log_y = galois_log_table[factor]; while (count != 0){ r1[0] = galois_multiply_fix(r1[0], log_y); r1++; count--; } } } // 16バイト境界のバッファー専用のXOR void galois_align_xor( unsigned char *r1, // Region to multiply unsigned char *r2, // Products go here unsigned int len) // Byte length { #ifndef NO_SIMD __m128i xmm0, xmm1; // 16バイトごとに XOR する while (len != 0){ xmm0 = _mm_load_si128((__m128i *)r1); xmm1 = _mm_load_si128((__m128i *)r2); xmm1 = _mm_xor_si128(xmm1, xmm0); _mm_store_si128((__m128i *)r2, xmm1); r1 += 16; r2 += 16; len -= 16; } #else // SSE2 を使わないなら while (len != 0){ // 4バイトずつ計算する ((unsigned int *)r2)[0] ^= ((unsigned int *)r1)[0]; r1 += 4; r2 += 4; len -= 4; } #endif } // 16バイト境界のバッファー専用の掛け算 (ALTMAP しない) void galois_align16_multiply( unsigned char *r1, // Region to multiply (must be aligned by 16) unsigned char *r2, // Products go here unsigned int len, // Byte length (must be multiple of 16) int factor) // Number to multiply by { if (factor <= 1){ if (factor == 0) return; #ifndef NO_SIMD { __m128i xmm0, xmm1; // 16バイトごとに XOR する while (len != 0){ xmm0 = _mm_load_si128((__m128i *)r1); xmm1 = _mm_load_si128((__m128i *)r2); xmm1 = _mm_xor_si128(xmm1, xmm0); _mm_store_si128((__m128i *)r2, xmm1); r1 += 16; r2 += 16; len -= 16; } } #else // SSE2 を使わないなら while (len != 0){ // 4バイトずつ計算する ((unsigned int *)r2)[0] ^= ((unsigned int *)r1)[0]; r1 += 4; r2 += 4; len -= 4; } #endif // 掛け算用のテーブルを常に作成する (32バイトだと少し遅くなる) #ifndef NO_SIMD /* // sse_unit が 32の倍数な時だけ } else if (cpu_flag & 16){ // AVX2 対応なら __declspec( align(32) ) unsigned char small_table[128]; create_eight_table_avx2(small_table, factor); gf16_avx2_block32u(r1, r2, len, small_table); */ } else if (cpu_flag & 1){ // SSSE3 対応なら __declspec( align(16) ) unsigned char small_table[128]; create_eight_table(small_table, factor); gf16_ssse3_block16u(r1, r2, len, small_table); // sse_unit が 32の倍数ならこちらでもいい //gf16_ssse3_block32u(r1, r2, len, small_table); #endif } else { // Combined Multi Table support (2 tables of 256-entries) unsigned int mtab[256 * 2]; create_two_table(mtab, factor); // build combined multiplication tables #ifndef _WIN64 // 32-bit 版なら MMX を使う #ifndef NO_SIMD DoBlock8(r1, r2, len, mtab); // process large chunk 8-bytes a shot #else // MMX を使わないなら // バッファーを 32-bit整数として扱う while (len != 0){ // 4バイトずつ計算する ((unsigned int *)r2)[0] ^= mtab[r1[0]] ^ mtab[256 + r1[1]] ^ ((mtab[r1[2]] ^ mtab[256 + r1[3]]) << 16); r1 += 4; r2 += 4; len -= 4; } #endif #else // 64-bit 版なら 64-bit 整数を使う // バッファーを 64-bit整数として扱う while (len != 0){ // 8バイトずつ計算する ((unsigned __int64 *)r2)[0] ^= mtab[r1[0]] ^ mtab[256 + r1[1]] ^ ((mtab[r1[2]] ^ mtab[256 + r1[3]]) << 16) ^ ((unsigned __int64)(mtab[r1[4]] ^ mtab[256 + r1[5]]) << 32) ^ ((unsigned __int64)(mtab[r1[6]] ^ mtab[256 + r1[7]]) << 48); r1 += 8; r2 += 8; len -= 8; } #endif } } // 32バイトごとに並び替えられたバッファー専用の掛け算 (SSSE3 & ALTMAP) void galois_align32_multiply( unsigned char *r1, // Region to multiply (must be aligned by 16) unsigned char *r2, // Products go here unsigned int len, // Byte length (must be multiple of 32) int factor) // Number to multiply by { if (factor <= 1){ if (factor != 0){ __m128i xmm0, xmm1; // 16バイトごとに XOR する while (len != 0){ xmm0 = _mm_load_si128((__m128i *)r1); xmm1 = _mm_load_si128((__m128i *)r2); xmm1 = _mm_xor_si128(xmm1, xmm0); _mm_store_si128((__m128i *)r2, xmm1); r1 += 16; r2 += 16; len -= 16; } } // 掛け算用のテーブルを常に作成する (32バイトだと少し遅くなる) } else { __declspec( align(16) ) unsigned char small_table[128]; create_eight_table(small_table, factor); gf16_ssse3_block32_altmap(r1, r2, len, small_table); } } // 掛け算を2回行って、一度に更新する (SSSE3 & ALTMAP) void galois_align32_multiply2( unsigned char *src1, // Region to multiply (must be aligned by 16) unsigned char *src2, unsigned char *dst, // Products go here unsigned int len, // Byte length (must be multiple of 32) int factor1, // Number to multiply by int factor2) { if ((factor1 == 1) && (factor2 == 1)){ // 両方の factor が 1の場合 __m128i xmm0, xmm1, xmm2; while (len != 0){ xmm0 = _mm_load_si128((__m128i *)dst); xmm1 = _mm_load_si128((__m128i *)src1); xmm2 = _mm_load_si128((__m128i *)src2); xmm0 = _mm_xor_si128(xmm0, xmm1); xmm0 = _mm_xor_si128(xmm0, xmm2); _mm_store_si128((__m128i *)dst, xmm0); src1 += 16; src2 += 16; dst += 16; len -= 16; } // 掛け算用のテーブルを常に作成する (32バイトだと少し遅くなる) } else { __declspec( align(16) ) unsigned char small_table[256]; create_eight_table(small_table, factor1); create_eight_table(small_table + 128, factor2); gf16_ssse3_block32_altmap2(src1, src2, dst, len, small_table); } } // 256バイトごとに並び替えられたバッファー専用の JIT(SSE2) を使った掛け算 void galois_align256_multiply( unsigned char *r1, // Region to multiply (must be aligned by 16) unsigned char *r2, // Products go here unsigned int len, // Byte length (must be multiple of 32) int factor) // Number to multiply by { if (factor <= 1){ if (factor != 0){ __m128i xmm0, xmm1; // 16バイトごとに XOR する while (len != 0){ xmm0 = _mm_load_si128((__m128i *)r1); xmm1 = _mm_load_si128((__m128i *)r2); xmm1 = _mm_xor_si128(xmm1, xmm0); _mm_store_si128((__m128i *)r2, xmm1); r1 += 16; r2 += 16; len -= 16; } } // 常に JIT(SSE2) を使う } else { gf16_sse2_block256_jit(r1, r2, len, factor); } } // 32バイトごとに並び替えられたバッファー専用の掛け算 (AVX2 & ALTMAP) void galois_align32avx_multiply( unsigned char *r1, // Region to multiply (must be aligned by 32) unsigned char *r2, // Products go here unsigned int len, // Byte length (must be multiple of 32) int factor) // Number to multiply by { if (factor <= 1){ if (factor != 0){ __m256i ymm0, ymm1; // 32バイトごとに XOR する while (len != 0){ ymm0 = _mm256_load_si256((__m256i *)r1); ymm1 = _mm256_load_si256((__m256i *)r2); ymm1 = _mm256_xor_si256(ymm1, ymm0); _mm256_store_si256((__m256i *)r2, ymm1); r1 += 32; r2 += 32; len -= 32; } _mm256_zeroupper(); // AVX-SSE 切り替えの回避 } // 掛け算用のテーブルを常に作成する (32バイトだと少し遅くなる) } else { __declspec( align(32) ) unsigned char small_table[128]; create_eight_table_avx2(small_table, factor); gf16_avx2_block32(r1, r2, len, small_table); } } // 掛け算を2回行って、一度に更新する (AVX2 & ALTMAP) void galois_align32avx_multiply2( unsigned char *src1, // Region to multiply (must be aligned by 32) unsigned char *src2, unsigned char *dst, // Products go here unsigned int len, // Byte length (must be multiple of 32) int factor1, // Number to multiply by int factor2) { if ((factor1 == 1) && (factor2 == 1)){ // 両方の factor が 1の場合 __m256i ymm0, ymm1, ymm2; while (len != 0){ ymm0 = _mm256_load_si256((__m256i *)dst); ymm1 = _mm256_load_si256((__m256i *)src1); ymm2 = _mm256_load_si256((__m256i *)src2); ymm0 = _mm256_xor_si256(ymm0, ymm1); ymm0 = _mm256_xor_si256(ymm0, ymm2); _mm256_store_si256((__m256i *)dst, ymm0); src1 += 32; src2 += 32; dst += 32; len -= 32; } _mm256_zeroupper(); // AVX-SSE 切り替えの回避 // 掛け算用のテーブルを常に作成する (32バイトだと少し遅くなる) } else { __declspec( align(32) ) unsigned char small_table[256]; create_eight_table_avx2(small_table, factor1); create_eight_table_avx2(small_table + 128, factor2); gf16_avx2_block32_2(src1, src2, dst, len, small_table); } } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ // チェックサムを計算する // buffer alignment must be 16, length must be multiple of 16 void checksum16(unsigned char *data, unsigned char *hash, int byte_size) { int i, count; #ifndef NO_SIMD // SSE2 を使うなら __m128i temp16, prev16, data16, mask16, zero16, poly16; count = byte_size / 16; prev16 = _mm_setzero_si128(); zero16 = _mm_setzero_si128(); poly16 = _mm_set1_epi32(0x100B100B); // PRIM_POLY = 0x1100B while (count > 0){ // HASH_RANGE バイトごとに // 16バイトごとに XOR する temp16 = _mm_setzero_si128(); if (count < HASH_RANGE / 16){ i = count; count = 0; } else { i = HASH_RANGE / 16; count -= HASH_RANGE / 16; } while (i > 0){ data16 = _mm_load_si128((__m128i *)data); // load 16-bytes temp16 = _mm_xor_si128(temp16, data16); data += 16; i--; } // 前回の値を 2倍して、今回の値を追加する //temp16 = _mm_xor_si128(temp16, prev16); // 3倍する場合は、元の値も XOR すればいい mask16 = _mm_cmpgt_epi16(zero16, prev16); // (0 > prev) ? 0xFFFF : 0x0000 prev16 = _mm_slli_epi16(prev16, 1); // prev *= 2 mask16 = _mm_and_si128(mask16, poly16); // 0x100B or 0x0000 prev16 = _mm_xor_si128(prev16, mask16); prev16 = _mm_xor_si128(prev16, temp16); } _mm_store_si128((__m128i *)hash, prev16); #else // 4バイト整数で計算するなら unsigned int *data4, temp[4], prev[4], x, mask; data4 = (unsigned int *)data; count = byte_size / 4; for (i = 0; i < 4; i++) prev[i] = 0; while (count > 0){ // HASH_RANGE バイトごとに // 4バイトごとに 16バイトに XOR する for (i = 0; i < 4; i++) temp[i] = 0; for (i = 0; i < HASH_RANGE / 4; i++){ temp[i & 3] ^= data4[i]; count--; if (count == 0) break; } data4 += HASH_RANGE / 4; // 前回の値を 2倍して、今回の値を追加する for (i = 0; i < 4; i++){ x = prev[i]; mask = (x & 0x80008000) >> 15; // 0x00010001 or 0x00000000 x = (x & 0x7FFF7FFF) << 1; // 3倍する場合は「^=」にすればいい x ^= mask * 0x100B; // 0x100B100B or 0x00000000 prev[i] = x ^ temp[i]; } } for (i = 0; i < 4; i++) ((unsigned int *)hash)[i] = prev[i]; #endif } // チェックサムを計算すると同時にデータを並び替える // buffer alignment must be 16, length must be (multiple of 32) - 16 void checksum16_altmap32(unsigned char *data, unsigned char *hash, int byte_size) { int i, count; __m128i temp16, prev16, mask16, zero16, poly16; __m128i dataA, dataB, dataC, dataD, maskB; count = byte_size / 16; prev16 = _mm_setzero_si128(); zero16 = _mm_setzero_si128(); poly16 = _mm_set1_epi32(0x100B100B); // PRIM_POLY = 0x1100B maskB = _mm_set1_epi16(0x00FF); // 0x00FF *8 while (count > 0){ // HASH_RANGE バイトごとに // 16バイトごとに XOR する temp16 = _mm_setzero_si128(); if (count < HASH_RANGE / 16){ i = count; count = 0; } else { i = HASH_RANGE / 16; count -= HASH_RANGE / 16; } while (i >= 2){ dataA = _mm_load_si128((__m128i *)data); // read 32-bytes dataB = _mm_load_si128((__m128i *)data + 1); temp16 = _mm_xor_si128(temp16, dataA); temp16 = _mm_xor_si128(temp16, dataB); dataC = _mm_and_si128(dataA, maskB); // erase higher byte dataD = _mm_and_si128(dataB, maskB); dataA = _mm_srli_epi16(dataA, 8); // move higher byte to lower dataB = _mm_srli_epi16(dataB, 8); dataC = _mm_packus_epi16(dataC, dataD); // select lower byte of each word dataA = _mm_packus_epi16(dataA, dataB); // select higher byte of each word _mm_store_si128((__m128i *)data, dataC); // write 32-bytes _mm_store_si128((__m128i *)data + 1, dataA); data += 32; i -= 2; } if (i > 0){ dataA = _mm_load_si128((__m128i *)data); // load 16-bytes temp16 = _mm_xor_si128(temp16, dataA); } // 前回の値を 2倍して、今回の値を追加する //temp16 = _mm_xor_si128(temp16, prev16); // 3倍する場合は、元の値も XOR すればいい mask16 = _mm_cmpgt_epi16(zero16, prev16); // (0 > prev) ? 0xFFFF : 0x0000 prev16 = _mm_slli_epi16(prev16, 1); // prev *= 2 mask16 = _mm_and_si128(mask16, poly16); // 0x100B or 0x0000 prev16 = _mm_xor_si128(prev16, mask16); prev16 = _mm_xor_si128(prev16, temp16); } if (hash != data + 16) // ハッシュ値の保存先が別なら _mm_store_si128((__m128i *)hash, prev16); // 最後にハッシュ値も並び替える dataC = _mm_and_si128(dataA, maskB); // erase higher byte dataD = _mm_and_si128(prev16, maskB); dataA = _mm_srli_epi16(dataA, 8); // move higher byte to lower dataB = _mm_srli_epi16(prev16, 8); dataC = _mm_packus_epi16(dataC, dataD); // select lower byte of each word dataA = _mm_packus_epi16(dataA, dataB); // select higher byte of each word _mm_store_si128((__m128i *)data, dataC); // write 32-bytes _mm_store_si128((__m128i *)data + 1, dataA); } // データの並びを元に戻すと同時にチェックサムを計算する // buffer alignment must be 32, length must be (multiple of 32) - 16 void checksum16_return32(unsigned char *data, unsigned char *hash, int byte_size) { int i, count; __m128i temp16, prev16, mask16, zero16, poly16; __m128i dataA, dataB, dataC; count = byte_size / 16; prev16 = _mm_setzero_si128(); zero16 = _mm_setzero_si128(); poly16 = _mm_set1_epi32(0x100B100B); // PRIM_POLY = 0x1100B while (count > 0){ // HASH_RANGE バイトごとに // 16バイトごとに XOR する temp16 = _mm_setzero_si128(); if (count < HASH_RANGE / 16){ i = count; count = 0; } else { i = HASH_RANGE / 16; count -= HASH_RANGE / 16; } while (i >= 2){ dataA = _mm_load_si128((__m128i *)data); // read 32-bytes dataB = _mm_load_si128((__m128i *)data + 1); dataC = _mm_unpacklo_epi8(dataA, dataB); // interleave lower and higher bytes dataA = _mm_unpackhi_epi8(dataA, dataB); _mm_store_si128((__m128i *)data, dataC); // write 32-bytes _mm_store_si128((__m128i *)data + 1, dataA); temp16 = _mm_xor_si128(temp16, dataC); temp16 = _mm_xor_si128(temp16, dataA); data += 32; i -= 2; } if (i > 0){ dataA = _mm_load_si128((__m128i *)data); // read 32-bytes dataB = _mm_load_si128((__m128i *)data + 1); dataC = _mm_unpacklo_epi8(dataA, dataB); // interleave lower and higher bytes dataA = _mm_unpackhi_epi8(dataA, dataB); _mm_store_si128((__m128i *)data, dataC); // write 32-bytes _mm_store_si128((__m128i *)data + 1, dataA); temp16 = _mm_xor_si128(temp16, dataC); } // 前回の値を 2倍して、今回の値を追加する //temp16 = _mm_xor_si128(temp16, prev16); // 3倍する場合は、元の値も XOR すればいい mask16 = _mm_cmpgt_epi16(zero16, prev16); // (0 > prev) ? 0xFFFF : 0x0000 prev16 = _mm_slli_epi16(prev16, 1); // prev *= 2 mask16 = _mm_and_si128(mask16, poly16); // 0x100B or 0x0000 prev16 = _mm_xor_si128(prev16, mask16); prev16 = _mm_xor_si128(prev16, temp16); } if (hash != data + 16) // ハッシュ値の保存先が別なら _mm_store_si128((__m128i *)hash, prev16); } // チェックサムを計算すると同時にデータを並び替える // buffer alignment must be 256, length must be (multiple of 256) - 16 void checksum16_altmap256(unsigned char *data, unsigned char *hash, int byte_size) { // 順番に処理する場合 // checksum16(data, hash, byte_size); // galois_altmap256_change(data, byte_size + HASH_SIZE); unsigned short dtmp[128]; int i, j; __m128i ta, tb, lmask, th, tl, temp16, prev16, poly16; lmask = _mm_set1_epi16(0xff); prev16 = _mm_setzero_si128(); poly16 = _mm_set1_epi32(0x100B100B); // PRIM_POLY = 0x1100B byte_size += HASH_SIZE; while (byte_size != 0){ for (j = 0; j < 8; j++){ ta = _mm_load_si128((__m128i *)data); // read 32-bytes tb = _mm_load_si128((__m128i *)data + 1); // 128バイトを XOR して 16バイトにする if ((j & 3) == 0) temp16 = _mm_setzero_si128(); temp16 = _mm_xor_si128(temp16, ta); if ((j != 7) || (byte_size != 256)) // 最後の 16バイトだけ含めない temp16 = _mm_xor_si128(temp16, tb); if ((j & 3) == 3){ // j = 3 or 7 // 前回の値を 2倍して、今回の値を追加する //temp16 = _mm_xor_si128(temp16, prev16); // 3倍する場合は、元の値も XOR すればいい tl = _mm_setzero_si128(); th = _mm_cmpgt_epi16(tl, prev16); // (0 > prev) ? 0xFFFF : 0x0000 prev16 = _mm_slli_epi16(prev16, 1); // prev *= 2 th = _mm_and_si128(th, poly16); // 0x100B or 0x0000 prev16 = _mm_xor_si128(prev16, th); prev16 = _mm_xor_si128(prev16, temp16); if ((j == 7) && (byte_size == 256)){ if (hash != data + 16) // ハッシュ値の保存先が別なら _mm_store_si128((__m128i *)hash, prev16); _mm_store_si128(&tb, prev16); // 最後の 16バイトにハッシュ値を置く } } // split to high/low parts th = _mm_packus_epi16(_mm_srli_epi16(tb, 8), _mm_srli_epi16(ta, 8)); tl = _mm_packus_epi16(_mm_and_si128(tb, lmask), _mm_and_si128(ta, lmask)); // save to dest by extracting 16-bit masks dtmp[0 + j] = _mm_movemask_epi8(th); for (i = 1; i < 8; i++){ th = _mm_slli_epi16(th, 1); // byte shift would be nicer, but ultimately doesn't matter here dtmp[i*8 + j] = _mm_movemask_epi8(th); } dtmp[64 + j] = _mm_movemask_epi8(tl); for (i = 1; i < 8; i++){ tl = _mm_slli_epi16(tl, 1); dtmp[64 + i*8 + j] = _mm_movemask_epi8(tl); } data += 32; } // we only really need to copy temp -> dest memcpy(data - 256, dtmp, 256); byte_size -= 256; } } // データの並びを元に戻すと同時にチェックサムを計算する // buffer alignment must be 256, length must be (multiple of 256) - 16 void checksum16_return256(unsigned char *data, unsigned char *hash, int byte_size) { // 順番に処理する場合 // galois_altmap256_return(data, byte_size + HASH_SIZE); // checksum16(data, hash, byte_size); __declspec( align(16) ) unsigned short dtmp[128]; int i, j; __m128i ta, tb, lmask, th, tl, temp16, prev16, poly16; th = _mm_setzero_si128(); // shut up compiler warning tl = _mm_setzero_si128(); lmask = _mm_set1_epi16(0xff); prev16 = _mm_setzero_si128(); poly16 = _mm_set1_epi32(0x100B100B); // PRIM_POLY = 0x1100B byte_size += HASH_SIZE; while (byte_size != 0){ for (j = 0; j < 8; j++){ // load in pattern: [0011223344556677] [8899AABBCCDDEEFF] tl = _mm_insert_epi16(tl, *(int *)(data + 240), 0); th = _mm_insert_epi16(th, *(int *)(data + 112), 0); tl = _mm_insert_epi16(tl, *(int *)(data + 224), 1); th = _mm_insert_epi16(th, *(int *)(data + 96), 1); tl = _mm_insert_epi16(tl, *(int *)(data + 208), 2); th = _mm_insert_epi16(th, *(int *)(data + 80), 2); tl = _mm_insert_epi16(tl, *(int *)(data + 192), 3); th = _mm_insert_epi16(th, *(int *)(data + 64), 3); tl = _mm_insert_epi16(tl, *(int *)(data + 176), 4); th = _mm_insert_epi16(th, *(int *)(data + 48), 4); tl = _mm_insert_epi16(tl, *(int *)(data + 160), 5); th = _mm_insert_epi16(th, *(int *)(data + 32), 5); tl = _mm_insert_epi16(tl, *(int *)(data + 144), 6); th = _mm_insert_epi16(th, *(int *)(data + 16), 6); tl = _mm_insert_epi16(tl, *(int *)(data + 128), 7); th = _mm_insert_epi16(th, *(int *)data, 7); // swizzle to [0123456789ABCDEF] [0123456789ABCDEF] ta = _mm_packus_epi16(_mm_srli_epi16(tl, 8),_mm_srli_epi16(th, 8)); tb = _mm_packus_epi16(_mm_and_si128(tl, lmask),_mm_and_si128(th, lmask)); // extract top bits dtmp[j*16 + 7] = _mm_movemask_epi8(ta); dtmp[j*16 + 15] = _mm_movemask_epi8(tb); for (i = 1; i < 8; i++){ ta = _mm_slli_epi16(ta, 1); tb = _mm_slli_epi16(tb, 1); dtmp[j*16 + 7-i] = _mm_movemask_epi8(ta); dtmp[j*16 + 15-i] = _mm_movemask_epi8(tb); } data += 2; } // 128バイトを XOR して 16バイトにする temp16 = _mm_setzero_si128(); for (j = 0; j < 8; j++){ ta = _mm_load_si128((__m128i *)dtmp + j); temp16 = _mm_xor_si128(temp16, ta); } // 前回の値を 2倍して、今回の値を追加する //temp16 = _mm_xor_si128(temp16, prev16); // 3倍する場合は、元の値も XOR すればいい tl = _mm_setzero_si128(); th = _mm_cmpgt_epi16(tl, prev16); // (0 > prev) ? 0xFFFF : 0x0000 prev16 = _mm_slli_epi16(prev16, 1); // prev *= 2 th = _mm_and_si128(th, poly16); // 0x100B or 0x0000 prev16 = _mm_xor_si128(prev16, th); prev16 = _mm_xor_si128(prev16, temp16); if (byte_size != 256){ // 次の 128バイトを XOR して 16バイトにする temp16 = _mm_setzero_si128(); for (j = 8; j < 16; j++){ ta = _mm_load_si128((__m128i *)dtmp + j); temp16 = _mm_xor_si128(temp16, ta); } // 前回の値を 2倍して、今回の値を追加する //temp16 = _mm_xor_si128(temp16, prev16); // 3倍する場合は、元の値も XOR すればいい tl = _mm_setzero_si128(); th = _mm_cmpgt_epi16(tl, prev16); // (0 > prev) ? 0xFFFF : 0x0000 prev16 = _mm_slli_epi16(prev16, 1); // prev *= 2 th = _mm_and_si128(th, poly16); // 0x100B or 0x0000 prev16 = _mm_xor_si128(prev16, th); prev16 = _mm_xor_si128(prev16, temp16); } else { // 最後の 128バイトは特別 // 次の 112バイトを XOR して 16バイトにする temp16 = _mm_setzero_si128(); for (j = 8; j < 15; j++){ ta = _mm_load_si128((__m128i *)dtmp + j); temp16 = _mm_xor_si128(temp16, ta); } // 前回の値を 2倍して、今回の値を追加する //temp16 = _mm_xor_si128(temp16, prev16); // 3倍する場合は、元の値も XOR すればいい tl = _mm_setzero_si128(); th = _mm_cmpgt_epi16(tl, prev16); // (0 > prev) ? 0xFFFF : 0x0000 prev16 = _mm_slli_epi16(prev16, 1); // prev *= 2 th = _mm_and_si128(th, poly16); // 0x100B or 0x0000 prev16 = _mm_xor_si128(prev16, th); prev16 = _mm_xor_si128(prev16, temp16); } // we only really need to copy temp -> dest memcpy(data - 16, dtmp, 256); data += 240; byte_size -= 256; } if (hash != data - 16) // ハッシュ値の保存先が別なら _mm_store_si128((__m128i *)hash, prev16); }