Files
MultiPar/source/par2j/crc.c
2024-11-30 13:06:17 +09:00

420 lines
14 KiB
C

// crc.c
// Copyright : 2024-11-30 Yutaka Sawada
// License : GPL
#ifndef _WIN32_WINNT
#define _WIN32_WINNT 0x0601 // Windows 7 or later
#endif
#include <stdio.h>
#include <windows.h>
#include <nmmintrin.h> // MMX ~ SSE4.2 命令セットを使用する場合インクルード
#include <wmmintrin.h> // AES, CLMUL 命令セットを使用する場合インクルード
#include "crc.h"
extern unsigned int cpu_flag; // declared in common2.h
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
// Lookup tables for CRC-32
#define CRC32_POLY 0xEDB88320 // CRC-32-IEEE 802.3 (little endian)
unsigned int crc_table[256];
unsigned int reverse_table[256]; // table used to run CRC-32 backwards

// Fill crc_table (forward CRC-32) and reverse_table (backward CRC-32).
// Must be called before any other function that reads either table.
void init_crc_table(void)
{
	unsigned int index;

	for (index = 0; index < 256; index++){
		unsigned int value = index;
		unsigned int bit = 8;

		// push the byte through the register one bit at a time
		while (bit-- > 0){
			if (value & 1)
				value = (value >> 1) ^ CRC32_POLY;
			else
				value >>= 1;
		}
		crc_table[index] = value;

		// The top byte of a forward table entry identifies which entry it is,
		// so index the reverse table by that byte. Each reverse entry holds the
		// forward entry shifted up by 8 bits with the entry number in the low byte.
		reverse_table[value >> 24] = (value << 8) | index;
	}
}
// Update a CRC-32 with len bytes from buf using only the lookup table.
// crc is used as given and the updated register value is returned;
// no 0xFFFFFFFF pre/post conditioning is applied here.
unsigned int crc_update_std(unsigned int crc, unsigned char *buf, unsigned int len)
{
	// head: single bytes until buf sits on a 4-byte boundary
	for (; (len != 0) && ((((ULONG_PTR)buf) & 3) != 0); len--, buf++)
		crc = (crc >> 8) ^ crc_table[(crc ^ *buf) & 0xFF];

	// body: fold in one aligned 32-bit word, then run four table steps
	for (; len >= 4; len -= 4, buf += 4){
		crc ^= *((unsigned int *)buf);
		crc = (crc >> 8) ^ crc_table[crc & 0xFF];
		crc = (crc >> 8) ^ crc_table[crc & 0xFF];
		crc = (crc >> 8) ^ crc_table[crc & 0xFF];
		crc = (crc >> 8) ^ crc_table[crc & 0xFF];
	}

	// tail: the remaining 0 to 3 bytes
	for (; len != 0; len--, buf++)
		crc = (crc >> 8) ^ crc_table[(crc ^ *buf) & 0xFF];

	return crc;
}
// 内容が全て 0 のデータの CRC-32 を更新する
/*
unsigned int crc_update_zero(unsigned int crc, unsigned int len)
{
while (len--)
crc = crc_table[crc & 0xFF] ^ (crc >> 8);
return crc;
}
*/
// Run a CRC-32 backwards over len bytes whose content is all zero.
// Given the register value after those zero bytes, returns the value before them.
unsigned int crc_reverse_zero(unsigned int crc, unsigned int len)
{
	unsigned int remaining;

	// One forward step over a zero byte is
	//   next = crc_table[prev & 0xFF] ^ (prev >> 8)
	// so the top byte of "next" selects the reverse_table entry that undoes it.
	// The 0xFFFFFFFF final XOR is neither removed nor re-applied here.
	for (remaining = len; remaining != 0; remaining--)
		crc = (crc << 8) ^ reverse_table[crc >> 24];

	return crc;
}
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
// CRC-32 with PCLMULQDQ Instruction is based on below source code.
/*
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
* instruction.
*
* A white paper describing this algorithm can be found at:
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Authors:
* Wajdi Feghali <wajdi.k.feghali@intel.com>
* Jim Guilford <james.guilford@intel.com>
* Vinodh Gopal <vinodh.gopal@intel.com>
* Erdinc Ozturk <erdinc.ozturk@intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
// Update a CRC-32 with len bytes from buf, using carry-less multiplication
// (PCLMULQDQ) to fold 16 bytes at a time.
// crc is used as given and the updated register value is returned;
// no 0xFFFFFFFF pre/post conditioning is applied in this function.
// Falls back to crc_update_std() when bit value 8 of cpu_flag is clear
// (presumably the CLMUL capability bit - confirm in common2.h) or when len < 19.
unsigned int crc_update(unsigned int crc, unsigned char *buf, unsigned int len)
{
__declspec( align(16) ) unsigned int buf128[4];
unsigned int i;
__m128i crc128, data128, temp128, two_k128;
// special case; required CPU feature flag not set, or shorter than 19 bytes.
// 19 = up to 3 bytes for the 4-byte alignment loop + up to 16 bytes for the
// first block; neither of those steps re-checks len.
if (((cpu_flag & 8) == 0) || (len < 19))
return crc_update_std(crc, buf, len);
// process single bytes until buf is on a 4-byte boundary
while (((ULONG_PTR)buf) & 3){
crc = crc_table[(crc & 0xFF) ^ (*buf++)] ^ (crc >> 8);
len--;
}
// buf is now 4-byte aligned; i = its offset (0, 4, 8 or 12) within a 16-byte line
i = ((ULONG_PTR)buf) & 12;
if (i != 0){ // read first 4, 8, or 12 bytes until memory alignment
i = 16 - i; // how many bytes to read
len -= i;
i /= 4; // number of 32-bit words to read
// the leading words stay zero; data is placed in the last i words
buf128[0] = 0;
buf128[1] = 0;
buf128[2] = 0;
buf128[3] = 0;
buf128[4 - i] ^= crc; // set initial value
while (i > 0){
buf128[4 - i] ^= *((unsigned int *)buf);
buf += 4;
i--;
}
} else { // read first 16 bytes
buf128[0] = ((unsigned int *)buf)[0];
buf128[1] = ((unsigned int *)buf)[1];
buf128[2] = ((unsigned int *)buf)[2];
buf128[3] = ((unsigned int *)buf)[3];
buf128[0] ^= crc; // set initial value
len -= 16;
buf += 16;
}
// from here on buf is 16-byte aligned, as the aligned loads below require
crc128 = _mm_load_si128((__m128i *)buf128);
// set two constants; K1 = 0xccaa009e, K2 = 0x1751997d0
two_k128 = _mm_set_epi32(0x00000001, 0x751997d0, 0x00000000, 0xccaa009e);
// per 16 bytes: fold the running 128-bit value into the next 16 bytes of data
while (len >= 16){
data128 = _mm_load_si128((__m128i *)buf);
temp128 = _mm_clmulepi64_si128(crc128, two_k128, 0x10);
crc128 = _mm_clmulepi64_si128(crc128, two_k128, 0x01);
data128 = _mm_xor_si128(data128, temp128);
crc128 = _mm_xor_si128(crc128, data128);
len -= 16;
buf += 16;
}
// set two constants; K5 = 0xccaa009e, K6 = 0x163cd6124
two_k128 = _mm_set_epi32(0x00000001, 0x63cd6124, 0x00000000, 0xccaa009e);
// reduce from 128-bit to 96-bit by multiplication with K5
data128 = _mm_clmulepi64_si128(crc128, two_k128, 0);
temp128 = _mm_srli_si128(crc128, 8);
data128 = _mm_xor_si128(data128, temp128);
// reduce from 96-bit to 64-bit by multiplication with K6
temp128 = _mm_slli_si128(data128, 4);
crc128 = _mm_clmulepi64_si128(temp128, two_k128, 0x10);
crc128 = _mm_xor_si128(crc128, data128);
// set two constants; K7 = 0x1f7011640, K8 = 0x1db710640
two_k128 = _mm_set_epi32(0x00000001, 0xdb710640, 0x00000001, 0xf7011640);
// Barrett Reduction from 64-bit to 32-bit
data128 = _mm_and_si128(crc128, _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000));
temp128 = _mm_clmulepi64_si128(data128, two_k128, 0);
temp128 = _mm_xor_si128(temp128, data128);
temp128 = _mm_and_si128(temp128, _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF));
crc128 = _mm_clmulepi64_si128(temp128, two_k128, 0x10);
crc128 = _mm_xor_si128(crc128, temp128);
crc128 = _mm_xor_si128(crc128, data128);
// the reduced 32-bit result is taken from 32-bit element 2 of the vector
crc = _mm_extract_epi32(crc128, 2);
// per 1 byte rest (fewer than 16 bytes remain)
while (len--)
crc = crc_table[(crc & 0xFF) ^ (*buf++)] ^ (crc >> 8);
return crc;
}
// Advance a CRC-32 over len bytes whose content is all zero.
// crc is used as given and the updated register value is returned.
// Uses PCLMULQDQ folding when bit value 8 of cpu_flag is set and len >= 16;
// otherwise steps through the lookup table one byte at a time.
unsigned int crc_update_zero(unsigned int crc, unsigned int len)
{
	__m128i fold, reduced, scratch, consts;

	// special case; CPU flag not set, or shorter than 16 bytes
	if (((cpu_flag & 8) == 0) || (len < 16)){
		for (; len != 0; len--)
			crc = (crc >> 8) ^ crc_table[crc & 0xFF];
		return crc;
	}

	// first 16 bytes: all-zero data XORed with the initial value is just
	// the initial value in the lowest 32 bits
	fold = _mm_cvtsi32_si128(crc);
	len -= 16;

	// set two constants; K1 = 0xccaa009e, K2 = 0x1751997d0
	consts = _mm_set_epi32(0x00000001, 0x751997d0, 0x00000000, 0xccaa009e);
	// per 16 bytes; the data is zero, so only the two folded halves are combined
	for (; len >= 16; len -= 16){
		scratch = _mm_clmulepi64_si128(fold, consts, 0x10);
		fold = _mm_clmulepi64_si128(fold, consts, 0x01);
		fold = _mm_xor_si128(fold, scratch);
	}

	// set two constants; K5 = 0xccaa009e, K6 = 0x163cd6124
	consts = _mm_set_epi32(0x00000001, 0x63cd6124, 0x00000000, 0xccaa009e);
	// reduce from 128-bit to 96-bit by multiplication with K5
	reduced = _mm_xor_si128(_mm_clmulepi64_si128(fold, consts, 0), _mm_srli_si128(fold, 8));
	// reduce from 96-bit to 64-bit by multiplication with K6
	fold = _mm_xor_si128(_mm_clmulepi64_si128(_mm_slli_si128(reduced, 4), consts, 0x10), reduced);

	// set two constants; K7 = 0x1f7011640, K8 = 0x1db710640
	consts = _mm_set_epi32(0x00000001, 0xdb710640, 0x00000001, 0xf7011640);
	// Barrett Reduction from 64-bit to 32-bit
	reduced = _mm_and_si128(fold, _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000));
	scratch = _mm_xor_si128(_mm_clmulepi64_si128(reduced, consts, 0), reduced);
	scratch = _mm_and_si128(scratch, _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF));
	fold = _mm_xor_si128(_mm_clmulepi64_si128(scratch, consts, 0x10), scratch);
	fold = _mm_xor_si128(fold, reduced);
	crc = _mm_extract_epi32(fold, 2);

	// per 1 byte rest (fewer than 16 bytes remain)
	for (; len != 0; len--)
		crc = (crc >> 8) ^ crc_table[crc & 0xFF];

	return crc;
}
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
// par2cmdline を参考にした関数
// window サイズの CRC を計算してある所に、1バイトずつ追加と削除をして、CRC を更新する
// This file is part of par2cmdline (a PAR 2.0 compatible file verification and
// repair tool). See http://parchive.sourceforge.net for details of PAR 2.0.
//
// Copyright (c) 2003 Peter Brian Clements
//
// par2cmdline is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
unsigned int window_table[256]; // table used to slide a CRC-32 along the data during detailed verification
unsigned int window_mask = 0;
// window_mask is computed in advance and XORed into the checksums of the source blocks,
// which cancels the effect of the 0xFFFFFFFF initial value and final processing.
// As a result, window_mask does not have to be applied while sliding.
// Slide the CRC along a buffer by one character (removing the old and adding the new).
// The new character is added using the main CCITT CRC32 table, and the old character
// is removed using the windowtable.
/*
unsigned int crc_slide_char(unsigned int crc, unsigned char chNew, unsigned char chOld){
return crc_table[(crc & 0xFF) ^ chNew] ^ (crc >> 8) ^ window_table[chOld];
}
*/
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
// Jonathan Camacho が改良した計算方法
// CRC( XOR(A,B) ) == XOR(CRC(A), CRC(B))
// bit twiddling to construct windowtable and windowmask
// window_table と window_mask の計算が 257倍? 速くなるらしい。
// Step a CRC-32 register 32 times, one zero bit per step, and store the
// register value after each step: result_array[k] is the value after k+1 steps.
static void compute_result_table(unsigned int result, unsigned int result_array[32])
{
	unsigned int *slot = result_array;
	unsigned int * const end = result_array + 32;

	while (slot != end){
		// the bit shifted out decides whether the polynomial is folded back in
		unsigned int feedback = (result & 1) ? CRC32_POLY : 0;

		result = (result >> 1) ^ feedback;
		*slot++ = result;
	}
}
// Build a 256-entry table from eight per-bit values.
// table[v] is the XOR of the result_array elements selected by the set bits of v:
//   bit 0x80 selects result_array[0], bit 0x40 selects result_array[1], ...
//   bit 0x01 selects result_array[7].
// Only the first eight elements of result_array are read; table[0] is always 0.
static void fast_compute_crc_table(unsigned int result_array[8], unsigned int table[256])
{
	unsigned int byte_value, bit_index, combined;

	for (byte_value = 0; byte_value < 256; byte_value++){
		combined = 0;
		for (bit_index = 0; bit_index < 8; bit_index++){
			if (byte_value & (0x80u >> bit_index))
				combined ^= result_array[bit_index];
		}
		table[byte_value] = combined;
	}
}
/*
unsigned int onepass_window_gen(unsigned int window_size, unsigned int window_table[256])
{
unsigned int result = 1;
unsigned int i;
unsigned int masked_result_array[32];
unsigned int window_mask;
if (window_size > 4){
result = crc_update_zero(result, window_size - 4);
compute_result_table(result, masked_result_array);
window_mask = 0;
for (i = 0; i < 32; i++){
window_mask = window_mask ^ masked_result_array[i];
}
window_mask = window_mask ^ ~0;
for (i = 0; i < 4; i++){
result = crc_table[(result & 0xFF)] ^ (result >> 8);
}
compute_result_table(result, masked_result_array);
fast_compute_crc_table(masked_result_array, window_table);
} else { // 普通? に計算する
window_table[0] = 0; // crc_table[0 & 0xff] ^ (0 >> 8) は常に 0
for (i = 1; i < 256; i++){
window_table[i] = crc_update_zero(crc_table[i], window_size); // 0が window サイズ個並んだデータの CRC
}
window_mask = crc_update_zero(0xFFFFFFFF, window_size);
window_mask ^= 0xFFFFFFFF;
}
// printf("window_mask = %08x\n", window_mask);
// for (i = 0; i < 256; i++)
// printf("window_table[%d] = %08x\n", i, window_table[i]);
return window_mask;
}
*/
// Prepare the globals window_mask and window_table for sliding a CRC-32
// over a window of window_size bytes.
// For window_size <= 4 only window_mask is set and window_table is left untouched.
void onepass_window_gen(unsigned int window_size)
{
	unsigned int bit_states[32];
	unsigned int state, mask, n;

	if (window_size <= 4){
		window_mask = 0x2144DF1C; // CRC-32 of four zero bytes
		return; // no sliding search is done when the block size is 4 or less
	}

	// register value after feeding (window_size - 4) zero bytes, starting from 1
	state = crc_update_zero(1, window_size - 4);

	// window_mask = 0xFFFFFFFF XOR all 32 single-bit states of the next 4 bytes
	compute_result_table(state, bit_states);
	mask = 0xFFFFFFFF;
	for (n = 0; n < 32; n++)
		mask ^= bit_states[n];
	window_mask = mask;

	// advance by the last 4 zero bytes, then derive the per-byte sliding table
	for (n = 4; n != 0; n--)
		state = (state >> 8) ^ crc_table[state & 0xFF];
	compute_result_table(state, bit_states);
	fast_compute_crc_table(bit_states, window_table);
}
// Build a sliding table for a CRC that does not use the 0xFFFFFFFF initial
// value and final processing; such a CRC needs no window_mask when sliding.
// Fills short_table for a window of short_size bytes.
void onepass_window_gen_short(unsigned int short_size, unsigned int short_table[256])
{
	unsigned int bit_states[32];

	// short_size must be larger than 4-byte
	compute_result_table(crc_update_zero(1, short_size), bit_states);
	fast_compute_crc_table(bit_states, short_table);
}