Add files via upload

This commit is contained in:
Yutaka Sawada
2023-12-26 18:53:12 +09:00
committed by GitHub
parent 79d0b184b8
commit 1552fb8ec8
13 changed files with 920 additions and 430 deletions

View File

@@ -1,4 +1,4 @@
[ par2j.exe - version 1.3.3.1 or later ] [ par2j.exe - version 1.3.3.2 or later ]
Type "par2j.exe" to see version, test integrity, and show usage below. Type "par2j.exe" to see version, test integrity, and show usage below.
@@ -369,14 +369,22 @@ The format is "/lc#" (# is from 1 to 32 as the number of using threads).
0: It uses the number of physical Cores. 0: It uses the number of physical Cores.
255: It tries to use more threads than number of physical Cores. 255: It tries to use more threads than number of physical Cores.
You may set additional combinations; You may set additional combinations for CPU feature;
+1024 to disable CLMUL (and use slower SSSE3 code), +1024 to disable CLMUL (and use slower SSSE3 code)
+2048 to disable JIT (for SSE2), +2048 to disable JIT (for SSE2)
+4096 to disable SSSE3, +4096 to disable SSSE3
+8192 to disable AVX2, +8192 to disable AVX2
+256 or +512 (slower device) to enable GPU acceleration.
for example, /lc1 to use single Core, /lc45 to use half Cores and GPU You may set additional combinations for GPU control;
+256 or +512 (slower device) to enable GPU acceleration
+65536 for classic method
+131072 for 16-byte memory access
+262144 for 4-byte memory access and calculate 2 blocks at once
+524288 for 16-byte memory access and calculate 2 blocks at once
+1048576 for CL_MEM_COPY_HOST_PTR or +2097152 for CL_MEM_USE_HOST_PTR
(When you set exclusive bits, larger value will be used.)
for example, /lc1 to use single Core, /lc508 to use half Cores and GPU
/m : /m :
Set this, if you want to set memory usage. Set this, if you want to set memory usage.

View File

@@ -1,5 +1,5 @@
// create.c // create.c
// Copyright : 2023-10-22 Yutaka Sawada // Copyright : 2023-12-12 Yutaka Sawada
// License : GPL // License : GPL
#ifndef _UNICODE #ifndef _UNICODE
@@ -26,6 +26,11 @@
//#define TIMER // 実験用 //#define TIMER // 実験用
#ifdef TIMER
#include <time.h>
static double time_sec, time_speed;
#endif
// ソート時に項目を比較する // ソート時に項目を比較する
static int sort_cmp(const void *elem1, const void *elem2) static int sort_cmp(const void *elem1, const void *elem2)
{ {
@@ -196,7 +201,7 @@ int set_common_packet(
__int64 prog_now = 0; __int64 prog_now = 0;
#ifdef TIMER #ifdef TIMER
unsigned int time_start = GetTickCount(); clock_t time_start = clock();
#endif #endif
print_progress_text(0, "Computing file hash"); print_progress_text(0, "Computing file hash");
@@ -305,14 +310,14 @@ unsigned int time_start = GetTickCount();
off += (64 + main_packet_size); off += (64 + main_packet_size);
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount() - time_start; time_start = clock() - time_start;
printf("hash %d.%03d sec", time_start / 1000, time_start % 1000); time_sec = (double)time_start / CLOCKS_PER_SEC;
if (time_start > 0){ if (time_sec > 0){
time_start = (int)((total_file_size * 125) / ((__int64)time_start * 131072)); time_speed = (double)total_file_size / (time_sec * 1048576);
printf(", %d MB/s\n", time_start);
} else { } else {
printf("\n"); time_speed = 0;
} }
printf("hash %.3f sec, %.0f MB/s\n", time_sec, time_speed);
#endif #endif
error_end: error_end:
@@ -341,7 +346,7 @@ int set_common_packet_multi(
FILE_HASH_TH th[MAX_MULTI_READ]; FILE_HASH_TH th[MAX_MULTI_READ];
#ifdef TIMER #ifdef TIMER
unsigned int time_start = GetTickCount(); clock_t time_start = clock();
#endif #endif
memset(hSub, 0, sizeof(HANDLE) * MAX_MULTI_READ); memset(hSub, 0, sizeof(HANDLE) * MAX_MULTI_READ);
@@ -545,14 +550,14 @@ unsigned int time_start = GetTickCount();
} }
print_progress_done(); // 改行して行の先頭に戻しておく print_progress_done(); // 改行して行の先頭に戻しておく
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount() - time_start; time_start = clock() - time_start;
printf("hash %d.%03d sec", time_start / 1000, time_start % 1000); time_sec = (double)time_start / CLOCKS_PER_SEC;
if (time_start > 0){ if (time_sec > 0){
time_start = (int)((total_file_size * 125) / ((__int64)time_start * 131072)); time_speed = (double)total_file_size / (time_sec * 1048576);
printf(", %d MB/s\n", time_start);
} else { } else {
printf("\n"); time_speed = 0;
} }
printf("hash %.3f sec, %.0f MB/s\n", time_sec, time_speed);
#endif #endif
error_end: error_end:
@@ -700,7 +705,7 @@ int set_common_packet_hash(
__int64 prog_now = 0; __int64 prog_now = 0;
#ifdef TIMER #ifdef TIMER
unsigned int time_start = GetTickCount(); clock_t time_start = clock();
#endif #endif
print_progress_text(0, "Computing file hash"); print_progress_text(0, "Computing file hash");
@@ -740,8 +745,8 @@ unsigned int time_start = GetTickCount();
print_progress_done(); // 改行して行の先頭に戻しておく print_progress_done(); // 改行して行の先頭に戻しておく
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount() - time_start; time_start = clock() - time_start;
printf("hash %d.%03d sec\n", time_start / 1000, time_start % 1000); printf("hash %.3f sec\n", (double)time_start / CLOCKS_PER_SEC);
#endif #endif
return 0; return 0;
} }
@@ -1065,7 +1070,7 @@ int create_recovery_file(
#endif #endif
#ifdef TIMER #ifdef TIMER
unsigned int time_start = GetTickCount(); clock_t time_start = clock();
#endif #endif
print_progress_text(0, "Constructing recovery file"); print_progress_text(0, "Constructing recovery file");
time_last = GetTickCount(); time_last = GetTickCount();
@@ -1258,8 +1263,8 @@ unsigned int time_start = GetTickCount();
print_progress_done(); // 改行して行の先頭に戻しておく print_progress_done(); // 改行して行の先頭に戻しておく
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount() - time_start; time_start = clock() - time_start;
printf("write %d.%03d sec\n", time_start / 1000, time_start % 1000); printf("write %.3f sec\n", (double)time_start / CLOCKS_PER_SEC);
#endif #endif
return 0; return 0;

View File

@@ -2795,7 +2795,7 @@ void galois_align_xor(
void galois_align16_multiply( void galois_align16_multiply(
unsigned char *r1, // Region to multiply (must be aligned by 16) unsigned char *r1, // Region to multiply (must be aligned by 16)
unsigned char *r2, // Products go here unsigned char *r2, // Products go here
unsigned int len, // Byte length (must be multiple of 32) unsigned int len, // Byte length (must be multiple of 16)
int factor) // Number to multiply by int factor) // Number to multiply by
{ {
if (factor <= 1){ if (factor <= 1){

View File

@@ -6,7 +6,7 @@ extern "C" {
#endif #endif
extern unsigned short *galois_log_table; //extern unsigned short *galois_log_table;
extern unsigned int cpu_flag; extern unsigned int cpu_flag;
int galois_create_table(void); // Returns 0 on success, -1 on failure int galois_create_table(void); // Returns 0 on success, -1 on failure

View File

@@ -1,5 +1,5 @@
// lib_opencl.c // lib_opencl.c
// Copyright : 2023-11-27 Yutaka Sawada // Copyright : 2023-12-26 Yutaka Sawada
// License : GPL // License : GPL
#ifndef _WIN32_WINNT #ifndef _WIN32_WINNT
@@ -84,7 +84,7 @@ cl_command_queue OpenCL_command = NULL;
cl_kernel OpenCL_kernel = NULL; cl_kernel OpenCL_kernel = NULL;
cl_mem OpenCL_src = NULL, OpenCL_dst = NULL, OpenCL_buf = NULL; cl_mem OpenCL_src = NULL, OpenCL_dst = NULL, OpenCL_buf = NULL;
size_t OpenCL_group_num; size_t OpenCL_group_num;
int OpenCL_method = 0; // 正=速い機器を選ぶ, 負=遅い機器を選ぶ int OpenCL_method = 0; // 標準では GPU を使わず、動作は自動選択される
API_clCreateBuffer gfn_clCreateBuffer; API_clCreateBuffer gfn_clCreateBuffer;
API_clReleaseMemObject gfn_clReleaseMemObject; API_clReleaseMemObject gfn_clReleaseMemObject;
@@ -100,7 +100,11 @@ API_clEnqueueNDRangeKernel gfn_clEnqueueNDRangeKernel;
/* /*
入力 入力
OpenCL_method : どのデバイスを選ぶか OpenCL_method : どのデバイスや関数を選ぶか
0x100 = 速い機器を選ぶ, 0x200 = 遅い機器を選ぶ
0x10000 = 1ブロックずつ計算する, 0x20000 = 2ブロックずつ計算しようとする
0x40000 = 4-byte memory access, 0x80000 = try 16-byte memory access
0x100000 = CL_MEM_COPY_HOST_PTR, 0x200000 = CL_MEM_USE_HOST_PTR
unit_size : ブロックの単位サイズ unit_size : ブロックの単位サイズ
src_max : ソース・ブロック個数 src_max : ソース・ブロック個数
@@ -111,11 +115,12 @@ OpenCL_method : 動作フラグいろいろ
*/ */
// 0=成功, 1エラー番号 // 0=成功, 1エラー番号
int init_OpenCL(int unit_size, int *src_max) int init_OpenCL(unsigned int unit_size, int *src_max)
{ {
char buf[2048], *p_source; char buf[2048], *p_source;
int err = 0, i, j; int err = 0, i, j;
int gpu_power, count, gpu_flag; int gpu_power, count;
int unified_memory; // non zero = Integrated GPU
size_t data_size, alloc_max; size_t data_size, alloc_max;
//FILE *fp; //FILE *fp;
HRSRC res; HRSRC res;
@@ -136,9 +141,10 @@ int init_OpenCL(int unit_size, int *src_max)
API_clReleaseProgram fn_clReleaseProgram; API_clReleaseProgram fn_clReleaseProgram;
API_clCreateKernel fn_clCreateKernel; API_clCreateKernel fn_clCreateKernel;
API_clGetKernelWorkGroupInfo fn_clGetKernelWorkGroupInfo; API_clGetKernelWorkGroupInfo fn_clGetKernelWorkGroupInfo;
API_clReleaseKernel fn_clReleaseKernel;
cl_int ret; cl_int ret;
cl_uint num_platforms = 0, num_devices = 0, num_groups, param_value; cl_uint num_platforms = 0, num_devices = 0, num_groups, param_value;
cl_ulong param_value8, param_value4; cl_ulong param_value8;
cl_platform_id platform_id[MAX_DEVICE], selected_platform; // Intel, AMD, Nvidia などドライバーの提供元 cl_platform_id platform_id[MAX_DEVICE], selected_platform; // Intel, AMD, Nvidia などドライバーの提供元
cl_device_id device_id[MAX_DEVICE], selected_device; // CPU や GPU など cl_device_id device_id[MAX_DEVICE], selected_device; // CPU や GPU など
cl_program program; cl_program program;
@@ -215,6 +221,9 @@ int init_OpenCL(int unit_size, int *src_max)
fn_clGetKernelWorkGroupInfo = (API_clGetKernelWorkGroupInfo)GetProcAddress(hLibOpenCL, "clGetKernelWorkGroupInfo"); fn_clGetKernelWorkGroupInfo = (API_clGetKernelWorkGroupInfo)GetProcAddress(hLibOpenCL, "clGetKernelWorkGroupInfo");
if (fn_clGetKernelWorkGroupInfo == NULL) if (fn_clGetKernelWorkGroupInfo == NULL)
return err; return err;
fn_clReleaseKernel = (API_clReleaseKernel)GetProcAddress(hLibOpenCL, "clReleaseKernel");
if (fn_clReleaseKernel == NULL)
return err;
gfn_clFinish = (API_clFinish)GetProcAddress(hLibOpenCL, "clFinish"); gfn_clFinish = (API_clFinish)GetProcAddress(hLibOpenCL, "clFinish");
if (gfn_clFinish == NULL) if (gfn_clFinish == NULL)
return err; return err;
@@ -226,12 +235,10 @@ int init_OpenCL(int unit_size, int *src_max)
ret = fn_clGetPlatformIDs(MAX_DEVICE, platform_id, &num_platforms); ret = fn_clGetPlatformIDs(MAX_DEVICE, platform_id, &num_platforms);
if (ret != CL_SUCCESS) if (ret != CL_SUCCESS)
return (ret << 8) | 10; return (ret << 8) | 10;
if (OpenCL_method >= 0){ // 選択する順序と初期値を変える if (OpenCL_method & 0x200){ // 選択する順序と初期値を変える
OpenCL_method = 1;
gpu_power = 0;
} else {
OpenCL_method = -1;
gpu_power = INT_MIN; gpu_power = INT_MIN;
} else {
gpu_power = 0;
} }
alloc_max = 0; alloc_max = 0;
@@ -268,20 +275,17 @@ int init_OpenCL(int unit_size, int *src_max)
#endif #endif
// 取得できなくてもエラーにしない // 取得できなくてもエラーにしない
param_value = 0; // CL_DEVICE_HOST_UNIFIED_MEMORY は OpenCL 2.0 以降で非推奨になった
ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_uint), &param_value, NULL); ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_uint), &param_value, NULL);
if (ret == CL_SUCCESS){
if (param_value != 0){
#ifdef DEBUG_OUTPUT #ifdef DEBUG_OUTPUT
if (ret == CL_SUCCESS)
printf("HOST_UNIFIED_MEMORY = %d\n", param_value); printf("HOST_UNIFIED_MEMORY = %d\n", param_value);
#endif #endif
if (param_value != 0)
param_value = 1; param_value = 1;
param_value4 = 0; // local memory が多い時だけ処理を変える }
ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &param_value4, NULL); } else { // CL_DEVICE_HOST_UNIFIED_MEMORY は OpenCL 2.0 以降で非推奨になった
#ifdef DEBUG_OUTPUT param_value = 0;
if (ret == CL_SUCCESS) }
printf("LOCAL_MEM_SIZE = %I64d KB\n", param_value4 >> 10);
#endif
// 取得できない場合はエラーにする // 取得できない場合はエラーにする
ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &param_value8, NULL); ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &param_value8, NULL);
@@ -302,7 +306,8 @@ int init_OpenCL(int unit_size, int *src_max)
#endif #endif
// MAX_COMPUTE_UNITS * MAX_WORK_GROUP_SIZE で計算力を測る、外付けGPUなら値を倍にする // MAX_COMPUTE_UNITS * MAX_WORK_GROUP_SIZE で計算力を測る、外付けGPUなら値を倍にする
count = (2 - param_value) * (int)data_size * num_groups; count = (2 - param_value) * (int)data_size * num_groups;
count *= OpenCL_method; // 符号を変える if (OpenCL_method & 0x200) // Prefer slower device
count *= -1; // 符号を変える
//printf("prev = %d, now = %d\n", gpu_power, count); //printf("prev = %d, now = %d\n", gpu_power, count);
if ((count > gpu_power) && (data_size >= 256) && // 256以上ないとテーブルを作れない if ((count > gpu_power) && (data_size >= 256) && // 256以上ないとテーブルを作れない
(param_value8 / 8 > (cl_ulong)unit_size)){ // CL_DEVICE_MAX_MEM_ALLOC_SIZE に収まるか (param_value8 / 8 > (cl_ulong)unit_size)){ // CL_DEVICE_MAX_MEM_ALLOC_SIZE に収まるか
@@ -311,9 +316,7 @@ int init_OpenCL(int unit_size, int *src_max)
selected_platform = platform_id[i]; selected_platform = platform_id[i];
OpenCL_group_num = num_groups; // ワークグループ数は COMPUTE_UNITS 数にする OpenCL_group_num = num_groups; // ワークグループ数は COMPUTE_UNITS 数にする
alloc_max = (size_t)param_value8; alloc_max = (size_t)param_value8;
gpu_flag = param_value; // 0 = discrete GPU, 1 = integrated GPU unified_memory = param_value; // 0 = discrete GPU, 1 = integrated GPU
if (param_value4 >= 32768)
gpu_flag |= 2; // local memory が 32KB 以上あるかどうか
// AMD や Intel の GPU ではメモリー領域が全体の 1/4 とは限らない // AMD や Intel の GPU ではメモリー領域が全体の 1/4 とは限らない
ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &param_value8, NULL); ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &param_value8, NULL);
@@ -355,49 +358,6 @@ int init_OpenCL(int unit_size, int *src_max)
if (ret != CL_SUCCESS) if (ret != CL_SUCCESS)
return (ret << 8) | 12; return (ret << 8) | 12;
// 計算方式を選択する
if ((((cpu_flag & 0x101) == 1) || ((cpu_flag & 0x110) == 0x10)) && (sse_unit == 32)){
if (gpu_flag & 2){
OpenCL_method = 3; // local memory が 32KB 以上あれば 16-byte ずつアクセスする
} else {
OpenCL_method = 2; // SSSE3 & ALTMAP または AVX2 ならデータの並び替え対応版を使う
}
} else if (((cpu_flag & 128) != 0) && (sse_unit == 256)){
OpenCL_method = 4; // JIT(SSE2) は bit ごとに上位から 16バイトずつ並ぶ
// ローカルのテーブルサイズが異なることに注意
// XOR 方式以外は 2KB (4バイト * 256項目 * 2個) 使う
// XOR (JIT) は 64バイト (4バイト * 16項目) 使う
} else {
OpenCL_method = 1; // 並び替えられてないデータ用
}
// work group 数が必要以上に多い場合は減らす
if (OpenCL_method == 2){
// work item 一個が 8バイトずつ計算する、256個なら work group ごとに 2KB 担当する
data_size = unit_size / 2048;
} else if (OpenCL_method == 3){
// work item 一個が 32バイトずつ計算する、256個なら work group ごとに 8KB 担当する
data_size = unit_size / 8192;
} else {
// work item 一個が 4バイトずつ計算する、256個なら work group ごとに 1KB 担当する
data_size = unit_size / 1024;
}
if (OpenCL_group_num > data_size){
OpenCL_group_num = data_size;
printf("Number of work groups is reduced to %zd\n", OpenCL_group_num);
}
// データへのアクセス方法をデバイスによって変える
if (gpu_flag & 1){
OpenCL_method |= 8; // Integrated GPU なら CL_MEM_USE_HOST_PTR を使う
} else { // Discrete GPU なら NVIDIA のだけ flag を変える
ret = fn_clGetDeviceInfo(selected_device, CL_DEVICE_VERSION, sizeof(buf), buf, NULL);
if (ret == CL_SUCCESS){
if (strstr(buf, "CUDA") != NULL)
OpenCL_method |= 8; // NVIDIA GPU なら CL_MEM_USE_HOST_PTR を使う
}
}
// 最大で何ブロック分のメモリー領域を保持できるのか(ここではまだ確保しない) // 最大で何ブロック分のメモリー領域を保持できるのか(ここではまだ確保しない)
// 後で実際に確保する量はこれよりも少なくなる // 後で実際に確保する量はこれよりも少なくなる
count = (int)(alloc_max / unit_size); // 確保できるメモリー量から逆算する count = (int)(alloc_max / unit_size); // 確保できるメモリー量から逆算する
@@ -409,25 +369,6 @@ int init_OpenCL(int unit_size, int *src_max)
printf("src buf : %zd KB (%d blocks), possible\n", data_size >> 10, count); printf("src buf : %zd KB (%d blocks), possible\n", data_size >> 10, count);
#endif #endif
// 出力先は1ブロック分だけあればいい
// CL_MEM_ALLOC_HOST_PTRを使えばpinned memoryになるらしい
data_size = unit_size;
OpenCL_dst = gfn_clCreateBuffer(OpenCL_context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, data_size, NULL, &ret);
if (ret != CL_SUCCESS)
return (ret << 8) | 13;
#ifdef DEBUG_OUTPUT
printf("dst buf : %zd KB (%zd Bytes), OK\n", data_size >> 10, data_size);
#endif
// factor は最大個数分 (src_max個)
data_size = sizeof(unsigned short) * (*src_max);
OpenCL_buf = gfn_clCreateBuffer(OpenCL_context, CL_MEM_READ_ONLY, data_size, NULL, &ret);
if (ret != CL_SUCCESS)
return (ret << 8) | 14;
#ifdef DEBUG_OUTPUT
printf("factor buf : %zd Bytes (%d factors), OK\n", data_size, (*src_max));
#endif
/* /*
// テキスト形式の OpenCL C ソース・コードを読み込む // テキスト形式の OpenCL C ソース・コードを読み込む
err = 4; err = 4;
@@ -528,17 +469,207 @@ int init_OpenCL(int unit_size, int *src_max)
return (ret << 8) | 21; return (ret << 8) | 21;
} }
// 計算方式を選択する
if ((((cpu_flag & 0x101) == 1) || ((cpu_flag & 0x110) == 0x10)) && (sse_unit == 32)){
int select_method; // SSSE3 & ALTMAP または AVX2 ならデータの並び替え対応版を使う
if (OpenCL_method & 0x80000){ // 16-byte and 2 blocks
select_method = 12;
} else if (OpenCL_method & 0x40000){ // 4-byte and 2 blocks
select_method = 10;
} else if (OpenCL_method & 0x20000){ // 16-byte
select_method = 4;
} else if (OpenCL_method & 0x10000){ // 4-byte
select_method = 2;
} else { // kernel を作って詳細を確かめる
size_t item2, item4, item10, item12;
cl_kernel kernel2, kernel4, kernel10, kernel12;
item2 = item4 = item10 = item12 = 0;
// まずは一番重くて速い奴を調べる
wsprintfA(buf, "method%d", 12);
kernel12 = fn_clCreateKernel(program, buf, &ret);
if (ret == CL_SUCCESS){
ret = fn_clGetKernelWorkGroupInfo(kernel12, selected_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &item12, NULL);
if (ret == CL_SUCCESS){
#ifdef DEBUG_OUTPUT
printf("\nTesting %s\n", buf);
printf("PREFERRED_WORK_GROUP_SIZE_MULTIPLE = %zu\n", item12);
#endif
}
}
if (item12 >= 32){ // 32以上あれば余裕で動くとみなす
select_method = 12;
OpenCL_kernel = kernel12;
#ifdef DEBUG_OUTPUT
printf("\nSelected method%d\n", select_method);
#endif
} else { // 他の奴と比較する
wsprintfA(buf, "method%d", 2);
kernel2 = fn_clCreateKernel(program, buf, &ret);
if (ret == CL_SUCCESS){
ret = fn_clGetKernelWorkGroupInfo(kernel2, selected_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &item2, NULL);
if (ret == CL_SUCCESS){
#ifdef DEBUG_OUTPUT
printf("\nTesting %s\n", buf);
printf("PREFERRED_WORK_GROUP_SIZE_MULTIPLE = %zu\n", item2);
#endif
}
}
if (item12 >= item2){
select_method = 12;
OpenCL_kernel = kernel12;
ret = fn_clReleaseKernel(kernel2);
#ifdef DEBUG_OUTPUT
if (ret != CL_SUCCESS)
printf("clReleaseKernel : Failed\n");
printf("\nSelected method%d\n", select_method);
#endif
} else {
ret = fn_clReleaseKernel(kernel12);
#ifdef DEBUG_OUTPUT
if (ret != CL_SUCCESS)
printf("clReleaseKernel : Failed\n");
#endif
wsprintfA(buf, "method%d", 10);
kernel10 = fn_clCreateKernel(program, buf, &ret);
if (ret == CL_SUCCESS){
ret = fn_clGetKernelWorkGroupInfo(kernel10, selected_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &item10, NULL);
if (ret == CL_SUCCESS){
#ifdef DEBUG_OUTPUT
printf("\nTesting %s\n", buf);
printf("PREFERRED_WORK_GROUP_SIZE_MULTIPLE = %zu\n", item10);
#endif
}
}
if (item10 >= item2){
select_method = 10;
OpenCL_kernel = kernel10;
ret = fn_clReleaseKernel(kernel2);
#ifdef DEBUG_OUTPUT
if (ret != CL_SUCCESS)
printf("clReleaseKernel : Failed\n");
printf("\nSelected method%d\n", select_method);
#endif
} else {
wsprintfA(buf, "method%d", 4);
kernel4 = fn_clCreateKernel(program, buf, &ret);
if (ret == CL_SUCCESS){
ret = fn_clGetKernelWorkGroupInfo(kernel4, selected_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &item4, NULL);
if (ret == CL_SUCCESS){
#ifdef DEBUG_OUTPUT
printf("\nTesting %s\n", buf);
printf("PREFERRED_WORK_GROUP_SIZE_MULTIPLE = %zu\n", item4);
#endif
}
}
if (item4 >= item2){
select_method = 4;
OpenCL_kernel = kernel4;
ret = fn_clReleaseKernel(kernel2);
#ifdef DEBUG_OUTPUT
if (ret != CL_SUCCESS)
printf("clReleaseKernel : Failed\n");
printf("\nSelected method%d\n", select_method);
#endif
} else {
select_method = 2;
OpenCL_kernel = kernel2;
ret = fn_clReleaseKernel(kernel4);
#ifdef DEBUG_OUTPUT
if (ret != CL_SUCCESS)
printf("clReleaseKernel : Failed\n");
printf("\nSelected method%d\n", select_method);
#endif
}
}
}
}
}
OpenCL_method |= select_method;
} else if (((cpu_flag & 128) != 0) && (sse_unit == 256)){
OpenCL_method |= 16; // JIT(SSE2) は bit ごとに上位から 16バイトずつ並ぶ
// ローカルのテーブルサイズが異なることに注意
// XOR 方式以外は 2KB (4バイト * 256項目 * 2個) 使う
// XOR (JIT) は 64バイト (4バイト * 16項目) 使う
} else {
int select_method; // 並び替えられてないデータ用
if (OpenCL_method & 0x40000){ // 4-byte and 2 blocks
select_method = 9;
} else if (OpenCL_method & 0x10000){ // 4-byte
select_method = 1;
} else { // kernel を作って詳細を確かめる
size_t item1, item9;
cl_kernel kernel1, kernel9;
item1 = item9 = 0;
// まずは一番重くて速い奴を調べる
wsprintfA(buf, "method%d", 9);
kernel9 = fn_clCreateKernel(program, buf, &ret);
if (ret == CL_SUCCESS){
ret = fn_clGetKernelWorkGroupInfo(kernel9, selected_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &item9, NULL);
if (ret == CL_SUCCESS){
#ifdef DEBUG_OUTPUT
printf("\nTesting %s\n", buf);
printf("PREFERRED_WORK_GROUP_SIZE_MULTIPLE = %zu\n", item9);
#endif
}
}
if (item9 >= 32){ // 32以上あれば余裕で動くとみなす
select_method = 9;
OpenCL_kernel = kernel9;
#ifdef DEBUG_OUTPUT
printf("\nSelected method%d\n", select_method);
#endif
} else { // 他の奴と比較する
wsprintfA(buf, "method%d", 1);
kernel1 = fn_clCreateKernel(program, buf, &ret);
if (ret == CL_SUCCESS){
ret = fn_clGetKernelWorkGroupInfo(kernel1, selected_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &item1, NULL);
if (ret == CL_SUCCESS){
#ifdef DEBUG_OUTPUT
printf("\nTesting %s\n", buf);
printf("PREFERRED_WORK_GROUP_SIZE_MULTIPLE = %zu\n", item1);
#endif
}
}
if (item9 >= item1){
select_method = 9;
OpenCL_kernel = kernel9;
ret = fn_clReleaseKernel(kernel1);
#ifdef DEBUG_OUTPUT
if (ret != CL_SUCCESS)
printf("clReleaseKernel : Failed\n");
printf("\nSelected method%d\n", select_method);
#endif
} else {
select_method = 1;
OpenCL_kernel = kernel1;
ret = fn_clReleaseKernel(kernel9);
#ifdef DEBUG_OUTPUT
if (ret != CL_SUCCESS)
printf("clReleaseKernel : Failed\n");
printf("\nSelected method%d\n", select_method);
#endif
}
}
}
OpenCL_method |= select_method;
}
// カーネル関数を抽出する // カーネル関数を抽出する
wsprintfA(buf, "method%d", OpenCL_method & 7); if (OpenCL_kernel == NULL){
wsprintfA(buf, "method%d", OpenCL_method & 31);
OpenCL_kernel = fn_clCreateKernel(program, buf, &ret); OpenCL_kernel = fn_clCreateKernel(program, buf, &ret);
if (ret != CL_SUCCESS) if (ret != CL_SUCCESS)
return (ret << 8) | 22; return (ret << 8) | 22;
#ifdef DEBUG_OUTPUT #ifdef DEBUG_OUTPUT
printf("CreateKernel : %s\n", buf); printf("CreateKernel : %s\n", buf);
ret = fn_clGetKernelWorkGroupInfo(OpenCL_kernel, selected_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &data_size, NULL);
if (ret == CL_SUCCESS)
printf("PREFERRED_WORK_GROUP_SIZE_MULTIPLE = %zu\n", data_size);
#endif #endif
}
// カーネルが実行できる work item 数を調べる // カーネルが実行できる work item 数を調べる
ret = fn_clGetKernelWorkGroupInfo(OpenCL_kernel, NULL, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &data_size, NULL); ret = fn_clGetKernelWorkGroupInfo(OpenCL_kernel, selected_device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &data_size, NULL);
if ((ret == CL_SUCCESS) && (data_size < 256)){ // 最低でも 256 以上は必要 if ((ret == CL_SUCCESS) && (data_size < 256)){ // 最低でも 256 以上は必要
#ifdef DEBUG_OUTPUT #ifdef DEBUG_OUTPUT
printf("KERNEL_WORK_GROUP_SIZE = %zd\n", data_size); printf("KERNEL_WORK_GROUP_SIZE = %zd\n", data_size);
@@ -558,6 +689,60 @@ int init_OpenCL(int unit_size, int *src_max)
fn_clUnloadCompiler(); fn_clUnloadCompiler();
} }
// work group 数が必要以上に多い場合は減らす
if (OpenCL_method & 4){
// work item 一個が 32バイトずつ計算する、256個なら work group ごとに 8KB 担当する
data_size = unit_size / 8192;
} else if (OpenCL_method & 2){
// work item 一個が 8バイトずつ計算する、256個なら work group ごとに 2KB 担当する
data_size = unit_size / 2048;
} else {
// work item 一個が 4バイトずつ計算する、256個なら work group ごとに 1KB 担当する
data_size = unit_size / 1024;
}
if (OpenCL_group_num > data_size){
OpenCL_group_num = data_size;
printf("Number of work groups is reduced to %zd\n", OpenCL_group_num);
}
// データへのアクセス方法をデバイスによって変える
if (OpenCL_method & 0x200000){
OpenCL_method |= 32;
} else if ((OpenCL_method & 0x100000) == 0){
if (unified_memory){
OpenCL_method |= 32; // Integrated GPU なら CL_MEM_USE_HOST_PTR を使う
} else { // Discrete GPU でも Nvidia のは動作を変える
ret = fn_clGetDeviceInfo(selected_device, CL_DEVICE_VERSION, sizeof(buf), buf, NULL);
if (ret == CL_SUCCESS){
if (strstr(buf, "CUDA") != NULL)
OpenCL_method |= 32; // NVIDIA GPU なら CL_MEM_USE_HOST_PTR を使う
}
}
}
// 出力先は1ブロック分だけあればいい
// CL_MEM_ALLOC_HOST_PTRを使えばpinned memoryになるらしい
data_size = unit_size;
if (OpenCL_method & 8)
data_size *= 2; // 2ブロックずつ計算できるように、2倍確保しておく
OpenCL_dst = gfn_clCreateBuffer(OpenCL_context, CL_MEM_ALLOC_HOST_PTR, data_size, NULL, &ret);
if (ret != CL_SUCCESS)
return (ret << 8) | 13;
#ifdef DEBUG_OUTPUT
printf("dst buf : %zd KB (%zd Bytes), OK\n", data_size >> 10, data_size);
#endif
// factor は最大個数分 (src_max個)
data_size = sizeof(unsigned short) * (*src_max);
if (OpenCL_method & 8)
data_size *= 2; // 2ブロックずつ計算できるように、2倍確保しておく
OpenCL_buf = gfn_clCreateBuffer(OpenCL_context, CL_MEM_READ_ONLY, data_size, NULL, &ret);
if (ret != CL_SUCCESS)
return (ret << 8) | 14;
#ifdef DEBUG_OUTPUT
printf("factor buf : %zd Bytes (%d factors), OK\n", data_size, (*src_max));
#endif
// カーネル引数を指定する // カーネル引数を指定する
ret = gfn_clSetKernelArg(OpenCL_kernel, 1, sizeof(cl_mem), &OpenCL_dst); ret = gfn_clSetKernelArg(OpenCL_kernel, 1, sizeof(cl_mem), &OpenCL_dst);
if (ret != CL_SUCCESS) if (ret != CL_SUCCESS)
@@ -565,13 +750,12 @@ int init_OpenCL(int unit_size, int *src_max)
ret = gfn_clSetKernelArg(OpenCL_kernel, 2, sizeof(cl_mem), &OpenCL_buf); ret = gfn_clSetKernelArg(OpenCL_kernel, 2, sizeof(cl_mem), &OpenCL_buf);
if (ret != CL_SUCCESS) if (ret != CL_SUCCESS)
return (ret << 8) | 102; return (ret << 8) | 102;
if (ret != CL_SUCCESS)
return (ret << 8) | 103;
#ifdef DEBUG_OUTPUT #ifdef DEBUG_OUTPUT
// ワークアイテム数 // ワークアイテム数
printf("\nMax number of work items = %zd (256 * %zd)\n", OpenCL_group_num * 256, OpenCL_group_num); printf("\nMax number of work items = %zd (256 * %zd)\n", OpenCL_group_num * 256, OpenCL_group_num);
#endif #endif
OpenCL_method &= 0xFF; // 最後に選択設定を消去する
return 0; return 0;
} }
@@ -683,7 +867,7 @@ void info_OpenCL(char *buf, int buf_size)
// ソース・ブロックをデバイス側にコピーする // ソース・ブロックをデバイス側にコピーする
int gpu_copy_blocks( int gpu_copy_blocks(
unsigned char *data, // ( 4096) unsigned char *data, // ( 4096)
int unit_size, // 4096の倍数にすること unsigned int unit_size, // 4096の倍数にすること
int src_num) // 何ブロックをコピーするのか int src_num) // 何ブロックをコピーするのか
{ {
size_t data_size; size_t data_size;
@@ -692,7 +876,7 @@ int gpu_copy_blocks(
// Integrated GPU と Discrete GPU の違いに関係なく、使う分だけ毎回メモリー領域を確保する // Integrated GPU と Discrete GPU の違いに関係なく、使う分だけ毎回メモリー領域を確保する
data_size = (size_t)unit_size * src_num; data_size = (size_t)unit_size * src_num;
if (OpenCL_method & 8){ // AMD's APU や Integrated GPU なら ZeroCopy する if (OpenCL_method & 32){ // AMD's APU や Integrated GPU なら ZeroCopy する
// 実際に比較してみると GeForce GPU でもメモリー消費量が少なくてコピーが速い // 実際に比較してみると GeForce GPU でもメモリー消費量が少なくてコピーが速い
// NVIDIA GPU は CL_MEM_USE_HOST_PTR でも VRAM 上にキャッシュするので速いらしい // NVIDIA GPU は CL_MEM_USE_HOST_PTR でも VRAM 上にキャッシュするので速いらしい
flags = CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR; flags = CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR;
@@ -719,17 +903,31 @@ int gpu_copy_blocks(
int gpu_multiply_blocks( int gpu_multiply_blocks(
int src_num, // Number of multiplying source blocks int src_num, // Number of multiplying source blocks
unsigned short *mat, // Matrix of numbers to multiply by unsigned short *mat, // Matrix of numbers to multiply by
unsigned short *mat2, // Set to calculate 2 blocks at once
unsigned char *buf, // Products go here unsigned char *buf, // Products go here
int len) // Byte length unsigned int len) // Byte length
{ {
unsigned __int64 *vram, *src, *dst; unsigned __int64 *vram, *src, *dst;
size_t global_size, local_size; size_t global_size, local_size;
cl_int ret; cl_int ret;
// 倍率の配列をデバイス側に書き込む // 倍率の配列をデバイス側に書き込む
if (mat2 == NULL){ // 1ブロック分だけコピーする
ret = gfn_clEnqueueWriteBuffer(OpenCL_command, OpenCL_buf, CL_FALSE, 0, sizeof(short) * src_num, mat, 0, NULL, NULL); ret = gfn_clEnqueueWriteBuffer(OpenCL_command, OpenCL_buf, CL_FALSE, 0, sizeof(short) * src_num, mat, 0, NULL, NULL);
} else { // 2ブロックずつ計算する場合は、配列のサイズも倍になる
if ((size_t)mat2 == 1){ // アドレスが 1 になることはあり得ないので、識別できる
ret = gfn_clEnqueueWriteBuffer(OpenCL_command, OpenCL_buf, CL_FALSE, 0, sizeof(short) * src_num * 2, mat, 0, NULL, NULL);
} else { // 2回コピーする
size_t data_size = sizeof(short) * src_num;
ret = gfn_clEnqueueWriteBuffer(OpenCL_command, OpenCL_buf, CL_FALSE, 0, data_size, mat, 0, NULL, NULL);
if (ret != CL_SUCCESS) if (ret != CL_SUCCESS)
return (ret << 8) | 10; return (ret << 8) | 10;
// もう一つの配列は違う場所からコピーする
ret = gfn_clEnqueueWriteBuffer(OpenCL_command, OpenCL_buf, CL_FALSE, data_size, data_size, mat2, 0, NULL, NULL);
}
}
if (ret != CL_SUCCESS)
return (ret << 8) | 11;
// 引数を指定する // 引数を指定する
ret = gfn_clSetKernelArg(OpenCL_kernel, 3, sizeof(int), &src_num); ret = gfn_clSetKernelArg(OpenCL_kernel, 3, sizeof(int), &src_num);
@@ -739,15 +937,15 @@ int gpu_multiply_blocks(
// カーネル並列実行 // カーネル並列実行
local_size = 256; // テーブルやキャッシュのため、work item 数は 256 に固定する local_size = 256; // テーブルやキャッシュのため、work item 数は 256 に固定する
global_size = OpenCL_group_num * 256; global_size = OpenCL_group_num * 256;
//printf("group num = %d, global size = %d, local size = 256 \n", OpenCL_group_num, global_size); //printf("group num = %d, global size = %d, local size = %d \n", OpenCL_group_num, global_size, local_size);
ret = gfn_clEnqueueNDRangeKernel(OpenCL_command, OpenCL_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL); ret = gfn_clEnqueueNDRangeKernel(OpenCL_command, OpenCL_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
if (ret != CL_SUCCESS) if (ret != CL_SUCCESS)
return (ret << 8) | 11; return (ret << 8) | 12;
// 出力内容をホスト側に反映させる // 出力内容をホスト側に反映させる
vram = gfn_clEnqueueMapBuffer(OpenCL_command, OpenCL_dst, CL_TRUE, CL_MAP_READ, 0, len, 0, NULL, NULL, &ret); vram = gfn_clEnqueueMapBuffer(OpenCL_command, OpenCL_dst, CL_TRUE, CL_MAP_READ, 0, len, 0, NULL, NULL, &ret);
if (ret != CL_SUCCESS) if (ret != CL_SUCCESS)
return (ret << 8) | 12; return (ret << 8) | 13;
// 8バイトごとに XOR する (SSE2 で XOR しても速くならず) // 8バイトごとに XOR する (SSE2 で XOR しても速くならず)
src = vram; src = vram;
@@ -762,7 +960,7 @@ int gpu_multiply_blocks(
// ホスト側でデータを変更しなくても、clEnqueueMapBufferと対で呼び出さないといけない // ホスト側でデータを変更しなくても、clEnqueueMapBufferと対で呼び出さないといけない
ret = gfn_clEnqueueUnmapMemObject(OpenCL_command, OpenCL_dst, vram, 0, NULL, NULL); ret = gfn_clEnqueueUnmapMemObject(OpenCL_command, OpenCL_dst, vram, 0, NULL, NULL);
if (ret != CL_SUCCESS) if (ret != CL_SUCCESS)
return (ret << 8) | 13; return (ret << 8) | 14;
return 0; return 0;
} }
@@ -775,12 +973,12 @@ int gpu_finish(void)
// 全ての処理が終わるのを待つ // 全ての処理が終わるのを待つ
ret = gfn_clFinish(OpenCL_command); ret = gfn_clFinish(OpenCL_command);
if (ret != CL_SUCCESS) if (ret != CL_SUCCESS)
return (ret << 8) | 20; return (ret << 8) | 30;
if (OpenCL_src != NULL){ // 確保されてる場合は解除する if (OpenCL_src != NULL){ // 確保されてる場合は解除する
ret = gfn_clReleaseMemObject(OpenCL_src); ret = gfn_clReleaseMemObject(OpenCL_src);
if (ret != CL_SUCCESS) if (ret != CL_SUCCESS)
return (ret << 8) | 21; return (ret << 8) | 31;
OpenCL_src = NULL; OpenCL_src = NULL;
} }

View File

@@ -10,20 +10,21 @@ extern "C" {
extern int OpenCL_method; extern int OpenCL_method;
int init_OpenCL(int unit_size, int *src_max); int init_OpenCL(unsigned int unit_size, int *src_max);
int free_OpenCL(void); int free_OpenCL(void);
void info_OpenCL(char *buf, int buf_size); void info_OpenCL(char *buf, int buf_size);
int gpu_copy_blocks( int gpu_copy_blocks(
unsigned char *data, unsigned char *data,
int unit_size, unsigned int unit_size,
int src_num); int src_num);
int gpu_multiply_blocks( int gpu_multiply_blocks(
int src_num, // Number of multiplying source blocks int src_num, // Number of multiplying source blocks
unsigned short *mat, // Matrix of numbers to multiply by unsigned short *mat, // Matrix of numbers to multiply by
unsigned short *mat2, // Set to calculate 2 blocks at once
unsigned char *buf, // Products go here unsigned char *buf, // Products go here
int len); // Byte length unsigned int len); // Byte length
int gpu_finish(void); int gpu_finish(void);

View File

@@ -1,5 +1,5 @@
// list.c // list.c
// Copyright : 2023-10-15 Yutaka Sawada // Copyright : 2023-12-12 Yutaka Sawada
// License : GPL // License : GPL
#ifndef _UNICODE #ifndef _UNICODE
@@ -26,6 +26,11 @@
//#define TIMER // 実験用 //#define TIMER // 実験用
#ifdef TIMER
#include <time.h>
static double time_sec, time_speed;
#endif
// recovery set のファイルのハッシュ値を調べる (空のファイルは除く) // recovery set のファイルのハッシュ値を調べる (空のファイルは除く)
// 0x00 = ファイルが存在して完全である // 0x00 = ファイルが存在して完全である
// 0x01 = ファイルが存在しない // 0x01 = ファイルが存在しない
@@ -296,7 +301,7 @@ int check_file_complete(
{ {
int i, rv; int i, rv;
#ifdef TIMER #ifdef TIMER
unsigned int time_start = GetTickCount(); clock_t time_start = clock();
#endif #endif
printf("\nVerifying Input File :\n"); printf("\nVerifying Input File :\n");
@@ -332,14 +337,14 @@ unsigned int time_start = GetTickCount();
} }
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount() - time_start; time_start = clock() - time_start;
printf("\n hash %d.%03d sec", time_start / 1000, time_start % 1000); time_sec = (double)time_start / CLOCKS_PER_SEC;
if (time_start > 0){ if (time_sec > 0){
time_start = (int)((total_file_size * 125) / ((__int64)time_start * 131072)); time_speed = (double)total_file_size / (time_sec * 1048576);
printf(", %d MB/s\n", time_start);
} else { } else {
printf("\n"); time_speed = 0;
} }
printf("\n hash %.3f sec, %.0f MB/s\n", time_sec, time_speed);
#endif #endif
return 0; return 0;
} }
@@ -364,7 +369,7 @@ int check_file_complete_multi(
HANDLE hSub[MAX_READ_NUM]; HANDLE hSub[MAX_READ_NUM];
FILE_CHECK_TH th[MAX_READ_NUM]; FILE_CHECK_TH th[MAX_READ_NUM];
#ifdef TIMER #ifdef TIMER
unsigned int time_start = GetTickCount(); clock_t time_start = clock();
#endif #endif
memset(hSub, 0, sizeof(HANDLE) * MAX_READ_NUM); memset(hSub, 0, sizeof(HANDLE) * MAX_READ_NUM);
@@ -630,14 +635,14 @@ unsigned int time_start = GetTickCount();
} }
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount() - time_start; time_start = clock() - time_start;
printf("\n hash %d.%03d sec", time_start / 1000, time_start % 1000); time_sec = (double)time_start / CLOCKS_PER_SEC;
if (time_start > 0){ if (time_sec > 0){
time_start = (int)((total_file_size * 125) / ((__int64)time_start * 131072)); time_speed = (double)total_file_size / (time_sec * 1048576);
printf(", %d MB/s\n", time_start);
} else { } else {
printf("\n"); time_speed = 0;
} }
printf("\n hash %.3f sec, %.0f MB/s\n", time_sec, time_speed);
#endif #endif
error_end: error_end:

View File

@@ -1,5 +1,5 @@
// md5_crc.c // md5_crc.c
// Copyright : 2023-10-29 Yutaka Sawada // Copyright : 2023-12-12 Yutaka Sawada
// License : GPL // License : GPL
#ifndef _UNICODE #ifndef _UNICODE
@@ -21,7 +21,6 @@
#include "phmd5.h" #include "phmd5.h"
#include "md5_crc.h" #include "md5_crc.h"
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
// バイト配列の MD5 ハッシュ値を求める // バイト配列の MD5 ハッシュ値を求める
@@ -200,8 +199,10 @@ int file_md5_crc32_block(
//#define TIMER // 実験用 //#define TIMER // 実験用
#ifdef TIMER #ifdef TIMER
static unsigned int time_start, time1_start; #include <time.h>
static unsigned int time_total = 0, time2_total = 0, time3_total = 0; static double time_sec, time_speed;
static clock_t time_start, time1_start;
static clock_t time_total = 0, time2_total = 0, time3_total = 0;
#endif #endif
#define MAX_BUF_SIZE 2097152 // ヒープ領域を使う場合の最大サイズ #define MAX_BUF_SIZE 2097152 // ヒープ領域を使う場合の最大サイズ
@@ -224,7 +225,7 @@ int file_hash_crc(
HANDLE hFile; HANDLE hFile;
OVERLAPPED ol; OVERLAPPED ol;
#ifdef TIMER #ifdef TIMER
time1_start = GetTickCount(); time1_start = clock();
#endif #endif
// ソース・ファイルを開く // ソース・ファイルを開く
@@ -251,11 +252,11 @@ time1_start = GetTickCount();
if (file_left < IO_SIZE) if (file_left < IO_SIZE)
read_size = (unsigned int)file_left; read_size = (unsigned int)file_left;
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
off = ReadFile(hFile, buf1, read_size, NULL, &ol); off = ReadFile(hFile, buf1, read_size, NULL, &ol);
#ifdef TIMER #ifdef TIMER
time2_total += GetTickCount() - time_start; time2_total += clock() - time_start;
#endif #endif
if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
print_win32_err(); print_win32_err();
@@ -281,11 +282,11 @@ time2_total += GetTickCount() - time_start;
ol.OffsetHigh = (unsigned int)(file_off >> 32); ol.OffsetHigh = (unsigned int)(file_off >> 32);
file_off += IO_SIZE; file_off += IO_SIZE;
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
off = ReadFile(hFile, buf, read_size, NULL, &ol); off = ReadFile(hFile, buf, read_size, NULL, &ol);
#ifdef TIMER #ifdef TIMER
time2_total += GetTickCount() - time_start; time2_total += clock() - time_start;
#endif #endif
if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
print_win32_err(); print_win32_err();
@@ -301,7 +302,7 @@ time2_total += GetTickCount() - time_start;
} }
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
off = 0; // チェックサム計算 off = 0; // チェックサム計算
if (block_left > 0){ // 前回足りなかった分を追加する if (block_left > 0){ // 前回足りなかった分を追加する
@@ -338,7 +339,7 @@ time_start = GetTickCount();
} }
} }
#ifdef TIMER #ifdef TIMER
time3_total += GetTickCount() - time_start; time3_total += clock() - time_start;
#endif #endif
// 経過表示 // 経過表示
@@ -369,16 +370,17 @@ error_end:
CloseHandle(ol.hEvent); CloseHandle(ol.hEvent);
#ifdef TIMER #ifdef TIMER
time_total += GetTickCount() - time1_start; time_total += clock() - time1_start;
if (*prog_now == total_file_size){ if (*prog_now == total_file_size){
printf("\nread %d.%03d sec\n", time2_total / 1000, time2_total % 1000); printf("\nread %.3f sec\n", (double)time2_total / CLOCKS_PER_SEC);
printf("main %d.%03d sec\n", time3_total / 1000, time3_total % 1000); printf("main %.3f sec\n", (double)time3_total / CLOCKS_PER_SEC);
if (time_total > 0){ time_sec = (double)time_total / CLOCKS_PER_SEC;
time_start = (int)((total_file_size * 125) / ((__int64)time_total * 131072)); if (time_sec > 0){
time_speed = (double)total_file_size / (time_sec * 1048576);
} else { } else {
time_start = 0; time_speed = 0;
} }
printf("total %d.%03d sec, %d MB/s\n", time_total / 1000, time_total % 1000, time_start); printf("total %.3f sec, %.0f MB/s\n", time_sec, time_speed);
} }
#endif #endif
return err; return err;
@@ -403,7 +405,7 @@ int file_hash_crc(
HANDLE hFile; HANDLE hFile;
OVERLAPPED ol; OVERLAPPED ol;
#ifdef TIMER #ifdef TIMER
time1_start = GetTickCount(); time1_start = clock();
#endif #endif
// ソース・ファイルを開く // ソース・ファイルを開く
@@ -442,11 +444,11 @@ error_retry_read:
if (file_left < IO_SIZE) if (file_left < IO_SIZE)
read_size = (unsigned int)file_left; read_size = (unsigned int)file_left;
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
off = ReadFile(hFile, buf1, read_size, NULL, &ol); off = ReadFile(hFile, buf1, read_size, NULL, &ol);
#ifdef TIMER #ifdef TIMER
time2_total += GetTickCount() - time_start; time2_total += clock() - time_start;
#endif #endif
if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
print_win32_err(); print_win32_err();
@@ -536,11 +538,11 @@ error_retry_pause:
ol.OffsetHigh = (unsigned int)(file_off >> 32); ol.OffsetHigh = (unsigned int)(file_off >> 32);
file_off += IO_SIZE; file_off += IO_SIZE;
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
off = ReadFile(hFile, buf, read_size, NULL, &ol); off = ReadFile(hFile, buf, read_size, NULL, &ol);
#ifdef TIMER #ifdef TIMER
time2_total += GetTickCount() - time_start; time2_total += clock() - time_start;
#endif #endif
if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
print_win32_err(); print_win32_err();
@@ -557,7 +559,7 @@ time2_total += GetTickCount() - time_start;
} }
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
off = 0; // チェックサム計算 off = 0; // チェックサム計算
if (block_left > 0){ // 前回足りなかった分を追加する if (block_left > 0){ // 前回足りなかった分を追加する
@@ -594,7 +596,7 @@ time_start = GetTickCount();
} }
} }
#ifdef TIMER #ifdef TIMER
time3_total += GetTickCount() - time_start; time3_total += clock() - time_start;
#endif #endif
// 経過表示 // 経過表示
@@ -625,16 +627,17 @@ error_end:
CloseHandle(ol.hEvent); CloseHandle(ol.hEvent);
#ifdef TIMER #ifdef TIMER
time_total += GetTickCount() - time1_start; time_total += clock() - time1_start;
if (*prog_now == total_file_size){ if (*prog_now == total_file_size){
printf("\nread %d.%03d sec\n", time2_total / 1000, time2_total % 1000); printf("\nread %.3f sec\n", (double)time2_total / CLOCKS_PER_SEC);
printf("main %d.%03d sec\n", time3_total / 1000, time3_total % 1000); printf("main %.3f sec\n", (double)time3_total / CLOCKS_PER_SEC);
if (time_total > 0){ time_sec = (double)time_total / CLOCKS_PER_SEC;
time_start = (int)((total_file_size * 125) / ((__int64)time_total * 131072)); if (time_sec > 0){
time_speed = (double)total_file_size / (time_sec * 1048576);
} else { } else {
time_start = 0; time_speed = 0;
} }
printf("total %d.%03d sec, %d MB/s\n", time_total / 1000, time_total % 1000, time_start); printf("total %.3f sec, %.0f MB/s\n", time_sec, time_speed);
} }
#endif #endif
return err; return err;
@@ -660,7 +663,7 @@ int file_hash_crc(
HANDLE hFile; HANDLE hFile;
OVERLAPPED ol; OVERLAPPED ol;
#ifdef TIMER #ifdef TIMER
time1_start = GetTickCount(); time1_start = clock();
#endif #endif
// ソース・ファイルを開く // ソース・ファイルを開く
@@ -699,11 +702,11 @@ time1_start = GetTickCount();
if (file_left < io_size) if (file_left < io_size)
read_size = (unsigned int)file_left; read_size = (unsigned int)file_left;
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
off = ReadFile(hFile, buf1, read_size, NULL, &ol); off = ReadFile(hFile, buf1, read_size, NULL, &ol);
#ifdef TIMER #ifdef TIMER
time2_total += GetTickCount() - time_start; time2_total += clock() - time_start;
#endif #endif
if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
print_win32_err(); print_win32_err();
@@ -729,11 +732,11 @@ time2_total += GetTickCount() - time_start;
ol.OffsetHigh = (unsigned int)(file_off >> 32); ol.OffsetHigh = (unsigned int)(file_off >> 32);
file_off += io_size; file_off += io_size;
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
off = ReadFile(hFile, buf, read_size, NULL, &ol); off = ReadFile(hFile, buf, read_size, NULL, &ol);
#ifdef TIMER #ifdef TIMER
time2_total += GetTickCount() - time_start; time2_total += clock() - time_start;
#endif #endif
if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
print_win32_err(); print_win32_err();
@@ -749,7 +752,7 @@ time2_total += GetTickCount() - time_start;
} }
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
off = 0; // チェックサム計算 off = 0; // チェックサム計算
if (block_left > 0){ // 前回足りなかった分を追加する if (block_left > 0){ // 前回足りなかった分を追加する
@@ -786,7 +789,7 @@ time_start = GetTickCount();
} }
} }
#ifdef TIMER #ifdef TIMER
time3_total += GetTickCount() - time_start; time3_total += clock() - time_start;
#endif #endif
// 経過表示 // 経過表示
@@ -819,16 +822,17 @@ error_end:
_aligned_free(buf1); _aligned_free(buf1);
#ifdef TIMER #ifdef TIMER
time_total += GetTickCount() - time1_start; time_total += clock() - time1_start;
if (*prog_now == total_file_size){ if (*prog_now == total_file_size){
printf("\nread %d.%03d sec\n", time2_total / 1000, time2_total % 1000); printf("\nread %.3f sec\n", (double)time2_total / CLOCKS_PER_SEC);
printf("main %d.%03d sec\n", time3_total / 1000, time3_total % 1000); printf("main %.3f sec\n", (double)time3_total / CLOCKS_PER_SEC);
if (time_total > 0){ time_sec = (double)time_total / CLOCKS_PER_SEC;
time_start = (int)((total_file_size * 125) / ((__int64)time_total * 131072)); if (time_sec > 0){
time_speed = (double)total_file_size / (time_sec * 1048576);
} else { } else {
time_start = 0; time_speed = 0;
} }
printf("total %d.%03d sec, %d MB/s\n", time_total / 1000, time_total % 1000, time_start); printf("total %.3f sec, %.0f MB/s\n", time_sec, time_speed);
} }
#endif #endif
return err; return err;
@@ -1038,7 +1042,7 @@ int file_hash_check(
PHMD5 hash_ctx, block_ctx; PHMD5 hash_ctx, block_ctx;
OVERLAPPED ol; OVERLAPPED ol;
#ifdef TIMER #ifdef TIMER
time1_start = GetTickCount(); time1_start = clock();
#endif #endif
prog_last = -1; // 検証中のファイル名を毎回表示する prog_last = -1; // 検証中のファイル名を毎回表示する
@@ -1062,11 +1066,11 @@ time1_start = GetTickCount();
file_left = file_size - 16384; // 本来のファイル・サイズまでしか検査しない file_left = file_size - 16384; // 本来のファイル・サイズまでしか検査しない
} }
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
off = ReadFile(hFile, buf, len, NULL, &ol); off = ReadFile(hFile, buf, len, NULL, &ol);
#ifdef TIMER #ifdef TIMER
time2_total += GetTickCount() - time_start; time2_total += clock() - time_start;
#endif #endif
if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
print_win32_err(); print_win32_err();
@@ -1141,11 +1145,11 @@ time2_total += GetTickCount() - time_start;
if (file_left < IO_SIZE) if (file_left < IO_SIZE)
read_size = (unsigned int)file_left; read_size = (unsigned int)file_left;
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
off = ReadFile(hFile, buf1, read_size, NULL, &ol); off = ReadFile(hFile, buf1, read_size, NULL, &ol);
#ifdef TIMER #ifdef TIMER
time2_total += GetTickCount() - time_start; time2_total += clock() - time_start;
#endif #endif
if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
print_win32_err(); print_win32_err();
@@ -1168,11 +1172,11 @@ time2_total += GetTickCount() - time_start;
ol.OffsetHigh = (unsigned int)(file_off >> 32); ol.OffsetHigh = (unsigned int)(file_off >> 32);
file_off += IO_SIZE; file_off += IO_SIZE;
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
off = ReadFile(hFile, buf, read_size, NULL, &ol); off = ReadFile(hFile, buf, read_size, NULL, &ol);
#ifdef TIMER #ifdef TIMER
time2_total += GetTickCount() - time_start; time2_total += clock() - time_start;
#endif #endif
if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
print_win32_err(); print_win32_err();
@@ -1187,7 +1191,7 @@ time2_total += GetTickCount() - time_start;
} }
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
if (s_blk != NULL){ if (s_blk != NULL){
off = 0; off = 0;
@@ -1230,7 +1234,7 @@ time_start = GetTickCount();
Phmd5Process(&hash_ctx, buf, len); // MD5 計算 Phmd5Process(&hash_ctx, buf, len); // MD5 計算
} }
#ifdef TIMER #ifdef TIMER
time3_total += GetTickCount() - time_start; time3_total += clock() - time_start;
#endif #endif
// 経過表示 // 経過表示
@@ -1267,15 +1271,16 @@ error_end:
CloseHandle(ol.hEvent); CloseHandle(ol.hEvent);
#ifdef TIMER #ifdef TIMER
time_total += GetTickCount() - time1_start; time_total += clock() - time1_start;
printf("\nread %d.%03d sec\n", time2_total / 1000, time2_total % 1000); printf("\nread %.3f sec\n", (double)time2_total / CLOCKS_PER_SEC);
printf("main %d.%03d sec\n", time3_total / 1000, time3_total % 1000); printf("main %.3f sec\n", (double)time3_total / CLOCKS_PER_SEC);
if (time_total > 0){ time_sec = (double)time_total / CLOCKS_PER_SEC;
time_start = (int)((file_size * 125) / ((__int64)time_total * 131072)); if (time_sec > 0){
time_speed = (double)file_size / (time_sec * 1048576);
} else { } else {
time_start = 0; time_speed = 0;
} }
printf("total %d.%03d sec, %d MB/s\n", time_total / 1000, time_total % 1000, time_start); printf("total %.3f sec, %.0f MB/s\n", time_sec, time_speed);
#endif #endif
return comp_num; return comp_num;
} }
@@ -1536,7 +1541,7 @@ int file_hash_direct(
HANDLE hFile; HANDLE hFile;
OVERLAPPED ol; OVERLAPPED ol;
#ifdef TIMER #ifdef TIMER
time1_start = GetTickCount(); time1_start = clock();
#endif #endif
prog_last = -1; // 検証中のファイル名を毎回表示する prog_last = -1; // 検証中のファイル名を毎回表示する
@@ -1592,11 +1597,11 @@ time1_start = GetTickCount();
file_left = file_size - 16384; // 本来のファイル・サイズまでしか検査しない file_left = file_size - 16384; // 本来のファイル・サイズまでしか検査しない
} }
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
off = ReadFile(hFile, buf, read_size, NULL, &ol); off = ReadFile(hFile, buf, read_size, NULL, &ol);
#ifdef TIMER #ifdef TIMER
time2_total += GetTickCount() - time_start; time2_total += clock() - time_start;
#endif #endif
if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
comp_num = -1; comp_num = -1;
@@ -1679,11 +1684,11 @@ time2_total += GetTickCount() - time_start;
read_size = (read_size + 4095) & ~4095; // 4KB の倍数にする read_size = (read_size + 4095) & ~4095; // 4KB の倍数にする
} }
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
off = ReadFile(hFile, buf1, read_size, NULL, &ol); off = ReadFile(hFile, buf1, read_size, NULL, &ol);
#ifdef TIMER #ifdef TIMER
time2_total += GetTickCount() - time_start; time2_total += clock() - time_start;
#endif #endif
if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
print_win32_err(); print_win32_err();
@@ -1710,11 +1715,11 @@ time2_total += GetTickCount() - time_start;
ol.OffsetHigh = (unsigned int)(file_off >> 32); ol.OffsetHigh = (unsigned int)(file_off >> 32);
file_off += IO_SIZE; file_off += IO_SIZE;
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
off = ReadFile(hFile, buf, read_size, NULL, &ol); off = ReadFile(hFile, buf, read_size, NULL, &ol);
#ifdef TIMER #ifdef TIMER
time2_total += GetTickCount() - time_start; time2_total += clock() - time_start;
#endif #endif
if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
print_win32_err(); print_win32_err();
@@ -1729,7 +1734,7 @@ time2_total += GetTickCount() - time_start;
} }
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
if (s_blk != NULL){ if (s_blk != NULL){
off = 0; off = 0;
@@ -1771,7 +1776,7 @@ time_start = GetTickCount();
Phmd5Process(&hash_ctx, buf, len); // MD5 計算 Phmd5Process(&hash_ctx, buf, len); // MD5 計算
} }
#ifdef TIMER #ifdef TIMER
time3_total += GetTickCount() - time_start; time3_total += clock() - time_start;
#endif #endif
// 経過表示 // 経過表示
@@ -1812,10 +1817,16 @@ error_end:
_aligned_free(buf1); _aligned_free(buf1);
#ifdef TIMER #ifdef TIMER
time_total += GetTickCount() - time1_start; time_total += clock() - time1_start;
printf("\nread %d.%03d sec\n", time2_total / 1000, time2_total % 1000); printf("\nread %.3f sec\n", (double)time2_total / CLOCKS_PER_SEC);
printf("main %d.%03d sec\n", time3_total / 1000, time3_total % 1000); printf("main %.3f sec\n", (double)time3_total / CLOCKS_PER_SEC);
printf("total %d.%03d sec\n", time_total / 1000, time_total % 1000); time_sec = (double)time_total / CLOCKS_PER_SEC;
if (time_sec > 0){
time_speed = (double)file_size / (time_sec * 1048576);
} else {
time_speed = 0;
}
printf("total %.3f sec, %.0f MB/s\n", time_sec, time_speed);
#endif #endif
return comp_num; return comp_num;
} }

View File

@@ -1,5 +1,5 @@
// par2_cmd.c // par2_cmd.c
// Copyright : 2023-10-15 Yutaka Sawada // Copyright : 2023-12-09 Yutaka Sawada
// License : GPL // License : GPL
#ifndef _UNICODE #ifndef _UNICODE
@@ -1479,14 +1479,12 @@ ri= switch_set & 0x00040000
} else if (wcsncmp(tmp_p, L"lc", 2) == 0){ } else if (wcsncmp(tmp_p, L"lc", 2) == 0){
k = 0; k = 0;
j = 2; j = 2;
while ((j < 2 + 5) && (tmp_p[j] >= '0') && (tmp_p[j] <= '9')){ while ((j < 2 + 7) && (tmp_p[j] >= '0') && (tmp_p[j] <= '9')){
k = (k * 10) + (tmp_p[j] - '0'); k = (k * 10) + (tmp_p[j] - '0');
j++; j++;
} }
if (k & 256){ // GPU を使う if (k & 0x300){ // GPU を使う
OpenCL_method = 1; // Faster GPU OpenCL_method = k & 0x003F0300;
} else if (k & 512){
OpenCL_method = -1; // Slower GPU
} }
if (k & 1024) // CLMUL と ALTMAP を使わない if (k & 1024) // CLMUL と ALTMAP を使わない
cpu_flag = (cpu_flag & 0xFFFFFFF7) | 256; cpu_flag = (cpu_flag & 0xFFFFFFF7) | 256;

View File

@@ -1,5 +1,5 @@
// reedsolomon.c // reedsolomon.c
// Copyright : 2023-10-26 Yutaka Sawada // Copyright : 2023-12-12 Yutaka Sawada
// License : GPL // License : GPL
#ifndef _UNICODE #ifndef _UNICODE
@@ -27,6 +27,9 @@
#include "rs_decode.h" #include "rs_decode.h"
#include "reedsolomon.h" #include "reedsolomon.h"
#ifdef TIMER
#include <time.h>
#endif
// GPU を使う最小データサイズ (MB 単位) // GPU を使う最小データサイズ (MB 単位)
// GPU の起動には時間がかかるので、データが小さすぎると逆に遅くなる // GPU の起動には時間がかかるので、データが小さすぎると逆に遅くなる
@@ -739,7 +742,7 @@ int rs_encode(
int err = 0; int err = 0;
unsigned int len; unsigned int len;
#ifdef TIMER #ifdef TIMER
unsigned int time_total = GetTickCount(); clock_t time_total = clock();
#endif #endif
if (galois_create_table()){ if (galois_create_table()){
@@ -755,7 +758,7 @@ unsigned int time_total = GetTickCount();
// パリティ計算用の行列演算の準備をする // パリティ計算用の行列演算の準備をする
len = sizeof(unsigned short) * source_num; len = sizeof(unsigned short) * source_num;
if (OpenCL_method != 0) if (OpenCL_method != 0)
len *= 2; // GPU の作業領域も確保しておく len *= 3; // GPU の作業領域も確保しておく
constant = malloc(len); constant = malloc(len);
if (constant == NULL){ if (constant == NULL){
printf("malloc, %d\n", len); printf("malloc, %d\n", len);
@@ -799,8 +802,8 @@ unsigned int time_total = GetTickCount();
err = encode_method2(file_path, header_buf, rcv_hFile, files, s_blk, p_blk, constant); err = encode_method2(file_path, header_buf, rcv_hFile, files, s_blk, p_blk, constant);
#ifdef TIMER #ifdef TIMER
if (err != 1){ if (err != 1){
time_total = GetTickCount() - time_total; time_total = clock() - time_total;
printf("total %d.%03d sec\n", time_total / 1000, time_total % 1000); printf("total %.3f sec\n", (double)time_total / CLOCKS_PER_SEC);
} }
#endif #endif
@@ -830,7 +833,7 @@ int rs_encode_1pass(
int err = 0; int err = 0;
unsigned int len; unsigned int len;
#ifdef TIMER #ifdef TIMER
unsigned int time_total = GetTickCount(); clock_t time_total = clock();
#endif #endif
if (galois_create_table()){ if (galois_create_table()){
@@ -841,7 +844,7 @@ unsigned int time_total = GetTickCount();
// パリティ計算用の行列演算の準備をする // パリティ計算用の行列演算の準備をする
len = sizeof(unsigned short) * source_num; len = sizeof(unsigned short) * source_num;
if (OpenCL_method != 0) if (OpenCL_method != 0)
len *= 2; // GPU の作業領域も確保しておく len *= 3; // GPU の作業領域も確保しておく
constant = malloc(len); constant = malloc(len);
if (constant == NULL){ if (constant == NULL){
printf("malloc, %d\n", len); printf("malloc, %d\n", len);
@@ -888,8 +891,8 @@ unsigned int time_total = GetTickCount();
if (err < 0){ if (err < 0){
printf("switching to 2-pass processing, %d\n", err); printf("switching to 2-pass processing, %d\n", err);
} else if (err != 1){ } else if (err != 1){
time_total = GetTickCount() - time_total; time_total = clock() - time_total;
printf("total %d.%03d sec\n", time_total / 1000, time_total % 1000); printf("total %.3f sec\n", (double)time_total / CLOCKS_PER_SEC);
} }
#endif #endif
@@ -913,7 +916,7 @@ int rs_decode(
int err = 0, i, j, k; int err = 0, i, j, k;
unsigned int len; unsigned int len;
#ifdef TIMER #ifdef TIMER
unsigned int time_matrix = 0, time_total = GetTickCount(); clock_t time_matrix = 0, time_total = clock();
#endif #endif
if (galois_create_table()){ if (galois_create_table()){
@@ -948,7 +951,7 @@ unsigned int time_matrix = 0, time_total = GetTickCount();
id = mat + (block_lost * source_num); id = mat + (block_lost * source_num);
#ifdef TIMER #ifdef TIMER
time_matrix = GetTickCount(); time_matrix = clock();
#endif #endif
// 復元用の行列を計算する // 復元用の行列を計算する
print_progress_text(0, "Computing matrix"); print_progress_text(0, "Computing matrix");
@@ -989,7 +992,7 @@ time_matrix = GetTickCount();
//for (i = 0; i < block_lost; i++) //for (i = 0; i < block_lost; i++)
// printf("id[%d] = %d\n", i, id[i]); // printf("id[%d] = %d\n", i, id[i]);
#ifdef TIMER #ifdef TIMER
time_matrix = GetTickCount() - time_matrix; time_matrix = clock() - time_matrix;
#endif #endif
#ifdef TIMER #ifdef TIMER
@@ -1032,9 +1035,9 @@ time_matrix = GetTickCount() - time_matrix;
err = decode_method2(file_path, block_lost, rcv_hFile, files, s_blk, p_blk, mat); err = decode_method2(file_path, block_lost, rcv_hFile, files, s_blk, p_blk, mat);
#ifdef TIMER #ifdef TIMER
if (err != 1){ if (err != 1){
time_total = GetTickCount() - time_total; time_total = clock() - time_total;
printf("total %d.%03d sec\n", time_total / 1000, time_total % 1000); printf("total %.3f sec\n", (double)time_total / CLOCKS_PER_SEC);
printf("matrix %d.%03d sec\n", time_matrix / 1000, time_matrix % 1000); printf("matrix %.3f sec\n", (double)time_matrix / CLOCKS_PER_SEC);
} }
#endif #endif

View File

@@ -1,5 +1,5 @@
// rs_decode.c // rs_decode.c
// Copyright : 2023-11-27 Yutaka Sawada // Copyright : 2023-12-13 Yutaka Sawada
// License : GPL // License : GPL
#ifndef _UNICODE #ifndef _UNICODE
@@ -28,7 +28,9 @@
#ifdef TIMER #ifdef TIMER
static unsigned int time_start, time_read = 0, time_write = 0, time_calc = 0; #include <time.h>
static double time_sec, time_speed;
static clock_t time_start, time_read = 0, time_write = 0, time_calc = 0;
static unsigned int read_count, write_count = 0, skip_count; static unsigned int read_count, write_count = 0, skip_count;
#endif #endif
@@ -60,7 +62,7 @@ static DWORD WINAPI thread_decode2(LPVOID lpParameter)
RS_TH *th; RS_TH *th;
#ifdef TIMER #ifdef TIMER
unsigned int loop_count2a = 0, loop_count2b = 0; unsigned int loop_count2a = 0, loop_count2b = 0;
unsigned int time_start2, time_encode2a = 0, time_encode2b = 0; clock_t time_start2, time_encode2a = 0, time_encode2b = 0;
#endif #endif
th = (RS_TH *)lpParameter; th = (RS_TH *)lpParameter;
@@ -78,7 +80,7 @@ unsigned int time_start2, time_encode2a = 0, time_encode2b = 0;
WaitForSingleObject(hRun, INFINITE); // 計算開始の合図を待つ WaitForSingleObject(hRun, INFINITE); // 計算開始の合図を待つ
while (th->now < INT_MAX / 2){ while (th->now < INT_MAX / 2){
#ifdef TIMER #ifdef TIMER
time_start2 = GetTickCount(); time_start2 = clock();
#endif #endif
s_buf = th->buf; s_buf = th->buf;
factor = th->mat; factor = th->mat;
@@ -95,7 +97,7 @@ loop_count2a++;
#endif #endif
} }
#ifdef TIMER #ifdef TIMER
time_encode2a += GetTickCount() - time_start2; time_encode2a += clock() - time_start2;
#endif #endif
} else { // 消失ブロックを部分的に保持する場合 } else { // 消失ブロックを部分的に保持する場合
// スレッドごとに復元する消失ブロックの chunk を変える // スレッドごとに復元する消失ブロックの chunk を変える
@@ -136,7 +138,7 @@ loop_count2b += src_num;
#endif #endif
} }
#ifdef TIMER #ifdef TIMER
time_encode2b += GetTickCount() - time_start2; time_encode2b += clock() - time_start2;
#endif #endif
} }
//_mm_sfence(); // メモリーへの書き込みを完了する //_mm_sfence(); // メモリーへの書き込みを完了する
@@ -146,19 +148,21 @@ time_encode2b += GetTickCount() - time_start2;
#ifdef TIMER #ifdef TIMER
loop_count2b /= chunk_num; // chunk数で割ってブロック数にする loop_count2b /= chunk_num; // chunk数で割ってブロック数にする
printf("sub-thread : total loop = %d\n", loop_count2a + loop_count2b); printf("sub-thread : total loop = %d\n", loop_count2a + loop_count2b);
if (time_encode2a > 0){ time_sec = (double)time_encode2a / CLOCKS_PER_SEC;
i = (int)((__int64)loop_count2a * unit_size * 125 / ((__int64)time_encode2a * 131072)); if (time_sec > 0){
time_speed = ((double)loop_count2a * unit_size) / (time_sec * 1048576);
} else { } else {
i = 0; time_speed = 0;
} }
if (loop_count2a > 0) if (loop_count2a > 0)
printf(" 1st decode %d.%03d sec, %d loop, %d MB/s\n", time_encode2a / 1000, time_encode2a % 1000, loop_count2a, i); printf(" 1st decode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2a, time_speed);
if (time_encode2b > 0){ time_sec = (double)time_encode2b / CLOCKS_PER_SEC;
i = (int)((__int64)loop_count2b * unit_size * 125 / ((__int64)time_encode2b * 131072)); if (time_sec > 0){
time_speed = ((double)loop_count2b * unit_size) / (time_sec * 1048576);
} else { } else {
i = 0; time_speed = 0;
} }
printf(" 2nd decode %d.%03d sec, %d loop, %d MB/s\n", time_encode2b / 1000, time_encode2b % 1000, loop_count2b, i); printf(" 2nd decode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2b, time_speed);
#endif #endif
// 終了処理 // 終了処理
@@ -178,7 +182,7 @@ static DWORD WINAPI thread_decode3(LPVOID lpParameter)
RS_TH *th; RS_TH *th;
#ifdef TIMER #ifdef TIMER
unsigned int loop_count2a = 0, loop_count2b = 0; unsigned int loop_count2a = 0, loop_count2b = 0;
unsigned int time_start2, time_encode2a = 0, time_encode2b = 0; clock_t time_start2, time_encode2a = 0, time_encode2b = 0;
#endif #endif
th = (RS_TH *)lpParameter; th = (RS_TH *)lpParameter;
@@ -197,7 +201,7 @@ unsigned int time_start2, time_encode2a = 0, time_encode2b = 0;
WaitForSingleObject(hRun, INFINITE); // 計算開始の合図を待つ WaitForSingleObject(hRun, INFINITE); // 計算開始の合図を待つ
while (th->now < INT_MAX / 2){ while (th->now < INT_MAX / 2){
#ifdef TIMER #ifdef TIMER
time_start2 = GetTickCount(); time_start2 = clock();
#endif #endif
s_buf = th->buf; s_buf = th->buf;
factor = th->mat; factor = th->mat;
@@ -214,7 +218,7 @@ loop_count2a++;
#endif #endif
} }
#ifdef TIMER #ifdef TIMER
time_encode2a += GetTickCount() - time_start2; time_encode2a += clock() - time_start2;
#endif #endif
} else { // 全ての消失ブロックを保持する場合 } else { // 全ての消失ブロックを保持する場合
// スレッドごとに復元する消失ブロックの chunk を変える // スレッドごとに復元する消失ブロックの chunk を変える
@@ -250,7 +254,7 @@ loop_count2b += src_num;
#endif #endif
} }
#ifdef TIMER #ifdef TIMER
time_encode2b += GetTickCount() - time_start2; time_encode2b += clock() - time_start2;
#endif #endif
} }
//_mm_sfence(); // メモリーへの書き込みを完了する //_mm_sfence(); // メモリーへの書き込みを完了する
@@ -260,19 +264,21 @@ time_encode2b += GetTickCount() - time_start2;
#ifdef TIMER #ifdef TIMER
loop_count2b /= chunk_num; // chunk数で割ってブロック数にする loop_count2b /= chunk_num; // chunk数で割ってブロック数にする
printf("sub-thread : total loop = %d\n", loop_count2a + loop_count2b); printf("sub-thread : total loop = %d\n", loop_count2a + loop_count2b);
if (time_encode2a > 0){ time_sec = (double)time_encode2a / CLOCKS_PER_SEC;
i = (int)((__int64)loop_count2a * unit_size * 125 / ((__int64)time_encode2a * 131072)); if (time_sec > 0){
time_speed = ((double)loop_count2a * unit_size) / (time_sec * 1048576);
} else { } else {
i = 0; time_speed = 0;
} }
if (loop_count2a > 0) if (loop_count2a > 0)
printf(" 1st decode %d.%03d sec, %d loop, %d MB/s\n", time_encode2a / 1000, time_encode2a % 1000, loop_count2a, i); printf(" 1st decode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2a, time_speed);
if (time_encode2b > 0){ time_sec = (double)time_encode2b / CLOCKS_PER_SEC;
i = (int)((__int64)loop_count2b * unit_size * 125 / ((__int64)time_encode2b * 131072)); if (time_sec > 0){
time_speed = ((double)loop_count2b * unit_size) / (time_sec * 1048576);
} else { } else {
i = 0; time_speed = 0;
} }
printf(" 2nd decode %d.%03d sec, %d loop, %d MB/s\n", time_encode2b / 1000, time_encode2b % 1000, loop_count2b, i); printf(" 2nd decode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2b, time_speed);
#endif #endif
// 終了処理 // 終了処理
@@ -292,7 +298,8 @@ static DWORD WINAPI thread_decode_gpu(LPVOID lpParameter)
HANDLE hRun, hEnd; HANDLE hRun, hEnd;
RS_TH *th; RS_TH *th;
#ifdef TIMER #ifdef TIMER
unsigned int time_start2, time_encode2 = 0, loop_count2 = 0; unsigned int loop_count2 = 0;
clock_t time_start2, time_encode2 = 0;
#endif #endif
th = (RS_TH *)lpParameter; th = (RS_TH *)lpParameter;
@@ -307,7 +314,7 @@ unsigned int time_start2, time_encode2 = 0, loop_count2 = 0;
WaitForSingleObject(hRun, INFINITE); // 計算開始の合図を待つ WaitForSingleObject(hRun, INFINITE); // 計算開始の合図を待つ
while (th->now < INT_MAX / 2){ while (th->now < INT_MAX / 2){
#ifdef TIMER #ifdef TIMER
time_start2 = GetTickCount(); time_start2 = clock();
#endif #endif
// GPUはソース・ブロック読み込み中に呼ばれない // GPUはソース・ブロック読み込み中に呼ばれない
s_buf = th->buf; s_buf = th->buf;
@@ -321,10 +328,14 @@ time_start2 = GetTickCount();
InterlockedExchange(&(th->now), INT_MAX / 3); // サブ・スレッドの計算を中断する InterlockedExchange(&(th->now), INT_MAX / 3); // サブ・スレッドの計算を中断する
} }
// スレッドごとに復元する消失ブロックを変え // 一つの GPUスレッドが全ての消失ブロックを処理す
while ((j = InterlockedIncrement(&(th->now))) < block_lost){ // j = ++th_now if (OpenCL_method & 8){ // 2ブロックずつ計算する
// 消失ブロック数が奇数なら、最初の一個だけ別に計算する
if (block_lost & 1){
InterlockedIncrement(&(th->now)); // 常に j = 0 となる
// 倍率は逆行列から部分的にコピーする // 倍率は逆行列から部分的にコピーする
i = gpu_multiply_blocks(src_num, factor + source_num * j, g_buf + (size_t)unit_size * j, unit_size); i = gpu_multiply_blocks(src_num, factor, NULL, g_buf, unit_size);
if (i != 0){ if (i != 0){
th->len = i; th->len = i;
InterlockedExchange(&(th->now), INT_MAX / 3); // サブ・スレッドの計算を中断する InterlockedExchange(&(th->now), INT_MAX / 3); // サブ・スレッドの計算を中断する
@@ -335,8 +346,40 @@ time_start2 = GetTickCount();
loop_count2 += src_num; loop_count2 += src_num;
#endif #endif
} }
// 残りのブロックは二個ずつ計算する
while ((j = InterlockedAdd(&(th->now), 2)) < block_lost){ // th_now += 2, j = th_now
j--; // +2 してるから、最初のブロックは -1 する
// 倍率は逆行列から部分的に2回コピーする
i = gpu_multiply_blocks(src_num, factor + source_num * j, factor + source_num * (j + 1), g_buf + (size_t)unit_size * j, unit_size * 2);
if (i != 0){
th->len = i;
InterlockedExchange(&(th->now), INT_MAX / 3); // サブ・スレッドの計算を中断する
break;
}
#ifdef TIMER #ifdef TIMER
time_encode2 += GetTickCount() - time_start2; loop_count2 += src_num * 2;
#endif
}
} else { // 以前からの1ブロックずつ計算する方式
while ((j = InterlockedIncrement(&(th->now))) < block_lost){ // j = ++th_now
// 倍率は逆行列から部分的にコピーする(2ブロックずつの場合はブロック数をマイナスにする)
i = gpu_multiply_blocks(src_num, factor + source_num * j, NULL, g_buf + (size_t)unit_size * j, unit_size);
if (i != 0){
th->len = i;
InterlockedExchange(&(th->now), INT_MAX / 3); // サブ・スレッドの計算を中断する
break;
}
#ifdef TIMER
loop_count2 += src_num;
#endif
}
}
#ifdef TIMER
time_encode2 += clock() - time_start2;
#endif #endif
// 最後にVRAMを解放する // 最後にVRAMを解放する
i = gpu_finish(); i = gpu_finish();
@@ -349,12 +392,13 @@ time_encode2 += GetTickCount() - time_start2;
} }
#ifdef TIMER #ifdef TIMER
printf("gpu-thread :\n"); printf("gpu-thread :\n");
if (time_encode2 > 0){ time_sec = (double)time_encode2 / CLOCKS_PER_SEC;
i = (int)((__int64)loop_count2 * unit_size * 125 / ((__int64)time_encode2 * 131072)); if (time_sec > 0){
time_speed = ((double)loop_count2 * unit_size) / (time_sec * 1048576);
} else { } else {
i = 0; time_speed = 0;
} }
printf(" 2nd decode %d.%03d sec, %d loop, %d MB/s\n", time_encode2 / 1000, time_encode2 % 1000, loop_count2, i); printf(" 2nd decode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2, time_speed);
#endif #endif
// 終了処理 // 終了処理
@@ -430,7 +474,7 @@ int decode_method1( // ソース・ブロックが一個だけの場合
block_off = 0; block_off = 0;
while (block_off < block_size){ while (block_off < block_size){
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
// パリティ・ブロックを読み込む // パリティ・ブロックを読み込む
len = block_size - block_off; len = block_size - block_off;
@@ -447,18 +491,18 @@ time_start = GetTickCount();
// パリティ・ブロックのチェックサムを計算する // パリティ・ブロックのチェックサムを計算する
checksum16_altmap(buf, buf + io_size, io_size); checksum16_altmap(buf, buf + io_size, io_size);
#ifdef TIMER #ifdef TIMER
time_read += GetTickCount() - time_start; time_read += clock() - time_start;
#endif #endif
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
// 失われたソース・ブロックを復元する // 失われたソース・ブロックを復元する
memset(work_buf, 0, unit_size); memset(work_buf, 0, unit_size);
// factor で割ると元に戻る // factor で割ると元に戻る
galois_align_multiply(buf, work_buf, unit_size, galois_divide(1, galois_power(2, id))); galois_align_multiply(buf, work_buf, unit_size, galois_divide(1, galois_power(2, id)));
#ifdef TIMER #ifdef TIMER
time_calc += GetTickCount() - time_start; time_calc += clock() - time_start;
#endif #endif
// 経過表示 // 経過表示
@@ -472,7 +516,7 @@ time_calc += GetTickCount() - time_start;
} }
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
// 復元されたソース・ブロックのチェックサムを検証する // 復元されたソース・ブロックのチェックサムを検証する
checksum16_return(work_buf, hash, io_size); checksum16_return(work_buf, hash, io_size);
@@ -491,7 +535,7 @@ time_start = GetTickCount();
goto error_end; goto error_end;
} }
#ifdef TIMER #ifdef TIMER
time_write += GetTickCount() - time_start; time_write += clock() - time_start;
#endif #endif
block_off += io_size; block_off += io_size;
@@ -499,9 +543,9 @@ time_write += GetTickCount() - time_start;
print_progress_done(); // 末尾ブロックの断片化によっては 100% で完了するとは限らない print_progress_done(); // 末尾ブロックの断片化によっては 100% で完了するとは限らない
#ifdef TIMER #ifdef TIMER
printf("read %d.%03d sec\n", time_read / 1000, time_read % 1000); printf("read %.3f sec\n", (double)time_read / CLOCKS_PER_SEC);
printf("write %d.%03d sec\n", time_write / 1000, time_write % 1000); printf("write %.3f sec\n", (double)time_write / CLOCKS_PER_SEC);
printf("decode %d.%03d sec\n", time_calc / 1000, time_calc % 1000); printf("decode %.3f sec\n", (double)time_calc / CLOCKS_PER_SEC);
#endif #endif
error_end: error_end:
@@ -623,7 +667,7 @@ int decode_method2( // ソース・データを全て読み込む場合
#ifdef TIMER #ifdef TIMER
read_count = 0; read_count = 0;
skip_count = 0; skip_count = 0;
time_start = GetTickCount(); time_start = clock();
#endif #endif
last_file = -1; last_file = -1;
recv_now = 0; // 何番目の代替ブロックか recv_now = 0; // 何番目の代替ブロックか
@@ -760,7 +804,7 @@ skip_count++;
hFile = NULL; hFile = NULL;
} }
#ifdef TIMER #ifdef TIMER
time_read += GetTickCount() - time_start; time_read += clock() - time_start;
#endif #endif
WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE); // サブ・スレッドの計算終了の合図を待つ WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE); // サブ・スレッドの計算終了の合図を待つ
@@ -845,7 +889,7 @@ skip_count++;
} }
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
// 復元されたブロックを書き込む // 復元されたブロックを書き込む
work_buf = p_buf; work_buf = p_buf;
@@ -916,7 +960,7 @@ write_count++;
} }
} }
#ifdef TIMER #ifdef TIMER
time_write += GetTickCount() - time_start; time_write += clock() - time_start;
#endif #endif
part_off += part_num; // 次の消失ブロック位置にする part_off += part_num; // 次の消失ブロック位置にする
@@ -930,9 +974,9 @@ time_write += GetTickCount() - time_start;
print_progress_done(); print_progress_done();
#ifdef TIMER #ifdef TIMER
printf("read %d.%03d sec\n", time_read / 1000, time_read % 1000); printf("read %.3f sec\n", (double)time_read / CLOCKS_PER_SEC);
j = ((block_size + io_size - 1) / io_size) * block_lost; j = ((block_size + io_size - 1) / io_size) * block_lost;
printf("write %d.%03d sec, count = %d/%d\n", time_write / 1000, time_write % 1000, write_count, j); printf("write %.3f sec, count = %d/%d\n", (double)time_write / CLOCKS_PER_SEC, write_count, j);
if (prog_num != prog_base) if (prog_num != prog_base)
printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base); printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base);
#endif #endif
@@ -1063,7 +1107,7 @@ int decode_method3( // 復元するブロックを全て保持できる場合
#ifdef TIMER #ifdef TIMER
read_count = 0; read_count = 0;
time_start = GetTickCount(); time_start = clock();
#endif #endif
last_file = -1; last_file = -1;
for (i = 0; i < read_num; i++){ // スライスを一個ずつ読み込んでメモリー上に配置していく for (i = 0; i < read_num; i++){ // スライスを一個ずつ読み込んでメモリー上に配置していく
@@ -1173,7 +1217,7 @@ read_count++;
hFile = NULL; hFile = NULL;
} }
#ifdef TIMER #ifdef TIMER
time_read += GetTickCount() - time_start; time_read += clock() - time_start;
#endif #endif
WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE); // サブ・スレッドの計算終了の合図を待つ WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE); // サブ・スレッドの計算終了の合図を待つ
@@ -1238,7 +1282,7 @@ time_read += GetTickCount() - time_start;
} }
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
// 復元されたブロックを書き込む // 復元されたブロックを書き込む
work_buf = p_buf; work_buf = p_buf;
@@ -1297,7 +1341,7 @@ time_start = GetTickCount();
} }
} }
#ifdef TIMER #ifdef TIMER
time_write += GetTickCount() - time_start; time_write += clock() - time_start;
#endif #endif
// 最後の書き込みファイルを閉じる // 最後の書き込みファイルを閉じる
CloseHandle(hFile); CloseHandle(hFile);
@@ -1305,8 +1349,8 @@ time_write += GetTickCount() - time_start;
print_progress_done(); print_progress_done();
#ifdef TIMER #ifdef TIMER
printf("read %d.%03d sec\n", time_read / 1000, time_read % 1000); printf("read %.3f sec\n", (double)time_read / CLOCKS_PER_SEC);
printf("write %d.%03d sec\n", time_write / 1000, time_write % 1000); printf("write %.3f sec\n", (double)time_write / CLOCKS_PER_SEC);
if (prog_num != prog_base) if (prog_num != prog_base)
printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base); printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base);
#endif #endif
@@ -1463,7 +1507,7 @@ int decode_method4( // 全てのブロックを断片的に保持する場合 (G
#ifdef TIMER #ifdef TIMER
read_count = 0; read_count = 0;
skip_count = 0; skip_count = 0;
time_start = GetTickCount(); time_start = clock();
#endif #endif
last_file = -1; last_file = -1;
recv_now = 0; // 何番目の代替ブロックか recv_now = 0; // 何番目の代替ブロックか
@@ -1600,7 +1644,7 @@ skip_count++;
hFile = NULL; hFile = NULL;
} }
#ifdef TIMER #ifdef TIMER
time_read += GetTickCount() - time_start; time_read += clock() - time_start;
#endif #endif
memset(g_buf, 0, (size_t)unit_size * block_lost); // 待機中に GPU用の領域をゼロ埋めしておく memset(g_buf, 0, (size_t)unit_size * block_lost); // 待機中に GPU用の領域をゼロ埋めしておく
@@ -1845,7 +1889,7 @@ skip_count++;
prog_num += th->size * block_lost; prog_num += th->size * block_lost;
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
// 復元されたブロックを書き込む // 復元されたブロックを書き込む
work_buf = p_buf; work_buf = p_buf;
@@ -1918,7 +1962,7 @@ write_count++;
} }
} }
#ifdef TIMER #ifdef TIMER
time_write += GetTickCount() - time_start; time_write += clock() - time_start;
#endif #endif
block_off += io_size; block_off += io_size;
@@ -1929,9 +1973,9 @@ time_write += GetTickCount() - time_start;
print_progress_done(); print_progress_done();
#ifdef TIMER #ifdef TIMER
printf("read %d.%03d sec\n", time_read / 1000, time_read % 1000); printf("read %.3f sec\n", (double)time_read / CLOCKS_PER_SEC);
j = ((block_size + io_size - 1) / io_size) * block_lost; j = ((block_size + io_size - 1) / io_size) * block_lost;
printf("write %d.%03d sec, count = %d/%d\n", time_write / 1000, time_write % 1000, write_count, j); printf("write %.3f sec, count = %d/%d\n", (double)time_write / CLOCKS_PER_SEC, write_count, j);
if (prog_num != prog_base) if (prog_num != prog_base)
printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base); printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base);
#endif #endif
@@ -2096,7 +2140,7 @@ int decode_method5( // 復元するブロックだけ保持する場合 (GPU対
#ifdef TIMER #ifdef TIMER
read_count = 0; read_count = 0;
time_start = GetTickCount(); time_start = clock();
#endif #endif
last_file = -1; last_file = -1;
for (i = 0; i < read_num; i++){ // スライスを一個ずつ読み込んでメモリー上に配置していく for (i = 0; i < read_num; i++){ // スライスを一個ずつ読み込んでメモリー上に配置していく
@@ -2206,7 +2250,7 @@ read_count++;
hFile = NULL; hFile = NULL;
} }
#ifdef TIMER #ifdef TIMER
time_read += GetTickCount() - time_start; time_read += clock() - time_start;
#endif #endif
if (source_off == 0) if (source_off == 0)
@@ -2446,7 +2490,7 @@ time_read += GetTickCount() - time_start;
} }
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
// 復元されたブロックを書き込む // 復元されたブロックを書き込む
work_buf = p_buf; work_buf = p_buf;
@@ -2507,7 +2551,7 @@ time_start = GetTickCount();
} }
} }
#ifdef TIMER #ifdef TIMER
time_write += GetTickCount() - time_start; time_write += clock() - time_start;
#endif #endif
// 最後の書き込みファイルを閉じる // 最後の書き込みファイルを閉じる
CloseHandle(hFile); CloseHandle(hFile);
@@ -2515,8 +2559,8 @@ time_write += GetTickCount() - time_start;
print_progress_done(); print_progress_done();
#ifdef TIMER #ifdef TIMER
printf("read %d.%03d sec\n", time_read / 1000, time_read % 1000); printf("read %.3f sec\n", (double)time_read / CLOCKS_PER_SEC);
printf("write %d.%03d sec\n", time_write / 1000, time_write % 1000); printf("write %.3f sec\n", (double)time_write / CLOCKS_PER_SEC);
if (prog_num != prog_base) if (prog_num != prog_base)
printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base); printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base);
#endif #endif

View File

@@ -1,5 +1,5 @@
// rs_encode.c // rs_encode.c
// Copyright : 2023-11-25 Yutaka Sawada // Copyright : 2023-12-18 Yutaka Sawada
// License : GPL // License : GPL
#ifndef _UNICODE #ifndef _UNICODE
@@ -29,7 +29,9 @@
#ifdef TIMER #ifdef TIMER
static unsigned int time_start, time_read = 0, time_write = 0, time_calc = 0; #include <time.h>
static double time_sec, time_speed;
static clock_t time_start, time_read = 0, time_write = 0, time_calc = 0;
static unsigned int read_count, skip_count; static unsigned int read_count, skip_count;
#endif #endif
@@ -61,7 +63,7 @@ static DWORD WINAPI thread_encode2(LPVOID lpParameter)
RS_TH *th; RS_TH *th;
#ifdef TIMER #ifdef TIMER
unsigned int loop_count2a = 0, loop_count2b = 0; unsigned int loop_count2a = 0, loop_count2b = 0;
unsigned int time_start2, time_encode2a = 0, time_encode2b = 0; clock_t time_start2, time_encode2a = 0, time_encode2b = 0;
#endif #endif
th = (RS_TH *)lpParameter; th = (RS_TH *)lpParameter;
@@ -80,7 +82,7 @@ unsigned int time_start2, time_encode2a = 0, time_encode2b = 0;
WaitForSingleObject(hRun, INFINITE); // 計算開始の合図を待つ WaitForSingleObject(hRun, INFINITE); // 計算開始の合図を待つ
while (th->now < INT_MAX / 2){ while (th->now < INT_MAX / 2){
#ifdef TIMER #ifdef TIMER
time_start2 = GetTickCount(); time_start2 = clock();
#endif #endif
s_buf = th->buf; s_buf = th->buf;
src_off = th->off; // ソース・ブロック番号 src_off = th->off; // ソース・ブロック番号
@@ -98,7 +100,7 @@ loop_count2a++;
} }
#ifdef TIMER #ifdef TIMER
time_encode2a += GetTickCount() - time_start2; time_encode2a += clock() - time_start2;
#endif #endif
} else { // パリティ・ブロックを部分的に保持する場合 } else { // パリティ・ブロックを部分的に保持する場合
// スレッドごとに作成するパリティ・ブロックの chunk を変える // スレッドごとに作成するパリティ・ブロックの chunk を変える
@@ -143,7 +145,7 @@ loop_count2b += src_num;
#endif #endif
} }
#ifdef TIMER #ifdef TIMER
time_encode2b += GetTickCount() - time_start2; time_encode2b += clock() - time_start2;
#endif #endif
} }
//_mm_sfence(); // メモリーへの書き込みを完了する //_mm_sfence(); // メモリーへの書き込みを完了する
@@ -153,19 +155,21 @@ time_encode2b += GetTickCount() - time_start2;
#ifdef TIMER #ifdef TIMER
loop_count2b /= chunk_num; // chunk数で割ってブロック数にする loop_count2b /= chunk_num; // chunk数で割ってブロック数にする
printf("sub-thread : total loop = %d\n", loop_count2a + loop_count2b); printf("sub-thread : total loop = %d\n", loop_count2a + loop_count2b);
if (time_encode2a > 0){ time_sec = (double)time_encode2a / CLOCKS_PER_SEC;
i = (int)((__int64)loop_count2a * unit_size * 125 / ((__int64)time_encode2a * 131072)); if (time_sec > 0){
time_speed = ((double)loop_count2a * unit_size) / (time_sec * 1048576);
} else { } else {
i = 0; time_speed = 0;
} }
if (loop_count2a > 0) if (loop_count2a > 0)
printf(" 1st encode %d.%03d sec, %d loop, %d MB/s\n", time_encode2a / 1000, time_encode2a % 1000, loop_count2a, i); printf(" 1st encode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2a, time_speed);
if (time_encode2b > 0){ time_sec = (double)time_encode2b / CLOCKS_PER_SEC;
i = (int)((__int64)loop_count2b * unit_size * 125 / ((__int64)time_encode2b * 131072)); if (time_sec > 0){
time_speed = ((double)loop_count2b * unit_size) / (time_sec * 1048576);
} else { } else {
i = 0; time_speed = 0;
} }
printf(" 2nd encode %d.%03d sec, %d loop, %d MB/s\n", time_encode2b / 1000, time_encode2b % 1000, loop_count2b, i); printf(" 2nd encode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2b, time_speed);
#endif #endif
// 終了処理 // 終了処理
@@ -185,7 +189,7 @@ static DWORD WINAPI thread_encode3(LPVOID lpParameter)
RS_TH *th; RS_TH *th;
#ifdef TIMER #ifdef TIMER
unsigned int loop_count2a = 0, loop_count2b = 0; unsigned int loop_count2a = 0, loop_count2b = 0;
unsigned int time_start2, time_encode2a = 0, time_encode2b = 0; clock_t time_start2, time_encode2a = 0, time_encode2b = 0;
#endif #endif
th = (RS_TH *)lpParameter; th = (RS_TH *)lpParameter;
@@ -204,7 +208,7 @@ unsigned int time_start2, time_encode2a = 0, time_encode2b = 0;
WaitForSingleObject(hRun, INFINITE); // 計算開始の合図を待つ WaitForSingleObject(hRun, INFINITE); // 計算開始の合図を待つ
while (th->now < INT_MAX / 2){ while (th->now < INT_MAX / 2){
#ifdef TIMER #ifdef TIMER
time_start2 = GetTickCount(); time_start2 = clock();
#endif #endif
s_buf = th->buf; s_buf = th->buf;
src_off = th->off; // ソース・ブロック番号 src_off = th->off; // ソース・ブロック番号
@@ -221,7 +225,7 @@ loop_count2a++;
#endif #endif
} }
#ifdef TIMER #ifdef TIMER
time_encode2a += GetTickCount() - time_start2; time_encode2a += clock() - time_start2;
#endif #endif
} else { // 全てのパリティ・ブロックを保持する場合 } else { // 全てのパリティ・ブロックを保持する場合
// スレッドごとに作成するパリティ・ブロックの chunk を変える // スレッドごとに作成するパリティ・ブロックの chunk を変える
@@ -261,7 +265,7 @@ loop_count2b += src_num;
#endif #endif
} }
#ifdef TIMER #ifdef TIMER
time_encode2b += GetTickCount() - time_start2; time_encode2b += clock() - time_start2;
#endif #endif
} }
//_mm_sfence(); // メモリーへの書き込みを完了する //_mm_sfence(); // メモリーへの書き込みを完了する
@@ -271,19 +275,21 @@ time_encode2b += GetTickCount() - time_start2;
#ifdef TIMER #ifdef TIMER
loop_count2b /= chunk_num; // chunk数で割ってブロック数にする loop_count2b /= chunk_num; // chunk数で割ってブロック数にする
printf("sub-thread : total loop = %d\n", loop_count2a + loop_count2b); printf("sub-thread : total loop = %d\n", loop_count2a + loop_count2b);
if (time_encode2a > 0){ time_sec = (double)time_encode2a / CLOCKS_PER_SEC;
i = (int)((__int64)loop_count2a * unit_size * 125 / ((__int64)time_encode2a * 131072)); if (time_sec > 0){
time_speed = ((double)loop_count2a * unit_size) / (time_sec * 1048576);
} else { } else {
i = 0; time_speed = 0;
} }
if (loop_count2a > 0) if (loop_count2a > 0)
printf(" 1st encode %d.%03d sec, %d loop, %d MB/s\n", time_encode2a / 1000, time_encode2a % 1000, loop_count2a, i); printf(" 1st encode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2a, time_speed);
if (time_encode2b > 0){ time_sec = (double)time_encode2b / CLOCKS_PER_SEC;
i = (int)((__int64)loop_count2b * unit_size * 125 / ((__int64)time_encode2b * 131072)); if (time_sec > 0){
time_speed = ((double)loop_count2b * unit_size) / (time_sec * 1048576);
} else { } else {
i = 0; time_speed = 0;
} }
printf(" 2nd encode %d.%03d sec, %d loop, %d MB/s\n", time_encode2b / 1000, time_encode2b % 1000, loop_count2b, i); printf(" 2nd encode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2b, time_speed);
#endif #endif
// 終了処理 // 終了処理
@@ -303,7 +309,8 @@ static DWORD WINAPI thread_encode_gpu(LPVOID lpParameter)
HANDLE hRun, hEnd; HANDLE hRun, hEnd;
RS_TH *th; RS_TH *th;
#ifdef TIMER #ifdef TIMER
unsigned int time_start2, time_encode2 = 0, loop_count2 = 0; unsigned int loop_count2 = 0;
clock_t time_start2, time_encode2 = 0;
#endif #endif
th = (RS_TH *)lpParameter; th = (RS_TH *)lpParameter;
@@ -320,7 +327,7 @@ unsigned int time_start2, time_encode2 = 0, loop_count2 = 0;
WaitForSingleObject(hRun, INFINITE); // 計算開始の合図を待つ WaitForSingleObject(hRun, INFINITE); // 計算開始の合図を待つ
while (th->now < INT_MAX / 2){ while (th->now < INT_MAX / 2){
#ifdef TIMER #ifdef TIMER
time_start2 = GetTickCount(); time_start2 = clock();
#endif #endif
// GPUはソース・ブロック読み込み中に呼ばれない // GPUはソース・ブロック読み込み中に呼ばれない
s_buf = th->buf; s_buf = th->buf;
@@ -335,13 +342,17 @@ time_start2 = GetTickCount();
} }
// 一つの GPUスレッドが全てのパリティ・ブロックを処理する // 一つの GPUスレッドが全てのパリティ・ブロックを処理する
while ((j = InterlockedIncrement(&(th->now))) < parity_num){ // j = ++th_now if (OpenCL_method & 8){ // 2ブロックずつ計算する
// パリティ・ブロック数が奇数なら、最初の一個だけ別に計算する
if (parity_num & 1){
InterlockedIncrement(&(th->now)); // 常に j = 0 となる
// factor は定数行列の乗数になる // factor は定数行列の乗数になる
for (i = 0; i < src_num; i++) for (i = 0; i < src_num; i++)
factor[i] = galois_power(constant[src_off + i], first_num + j); factor[i] = galois_power(constant[src_off + i], first_num);
// VRAM上のソース・ブロックごとにパリティを追加していく // VRAM上のソース・ブロックごとにパリティを追加していく
i = gpu_multiply_blocks(src_num, factor, g_buf + (size_t)unit_size * j, unit_size); i = gpu_multiply_blocks(src_num, factor, NULL, g_buf, unit_size);
if (i != 0){ if (i != 0){
th->len = i; th->len = i;
InterlockedExchange(&(th->now), INT_MAX / 3); // サブ・スレッドの計算を中断する InterlockedExchange(&(th->now), INT_MAX / 3); // サブ・スレッドの計算を中断する
@@ -351,8 +362,51 @@ time_start2 = GetTickCount();
loop_count2 += src_num; loop_count2 += src_num;
#endif #endif
} }
// 残りのブロックは二個ずつ計算する
while ((j = InterlockedAdd(&(th->now), 2)) < parity_num){ // th_now += 2, j = th_now
j--; // +2 してるから、最初のブロックは -1 する
// factor は定数行列の乗数になる
for (i = 0; i < src_num; i++){
int c = constant[src_off + i]; // 同じ定数だけど、何乗するかが異なる
factor[i] = galois_power(c, first_num + j);
factor[src_num + i] = galois_power(c, first_num + j + 1);
}
// VRAM上のソース・ブロックごとにパリティを追加していく
i = gpu_multiply_blocks(src_num, factor, (void *)1, g_buf + (size_t)unit_size * j, unit_size * 2);
if (i != 0){
th->len = i;
InterlockedExchange(&(th->now), INT_MAX / 3); // サブ・スレッドの計算を中断する
break;
}
#ifdef TIMER #ifdef TIMER
time_encode2 += GetTickCount() - time_start2; loop_count2 += src_num * 2;
#endif
}
} else { // 以前からの1ブロックずつ計算する方式
while ((j = InterlockedIncrement(&(th->now))) < parity_num){ // j = ++th_now
// factor は定数行列の乗数になる
for (i = 0; i < src_num; i++)
factor[i] = galois_power(constant[src_off + i], first_num + j);
// VRAM上のソース・ブロックごとにパリティを追加していく
i = gpu_multiply_blocks(src_num, factor, NULL, g_buf + (size_t)unit_size * j, unit_size);
if (i != 0){
th->len = i;
InterlockedExchange(&(th->now), INT_MAX / 3); // サブ・スレッドの計算を中断する
break;
}
#ifdef TIMER
loop_count2 += src_num;
#endif
}
}
#ifdef TIMER
time_encode2 += clock() - time_start2;
#endif #endif
// 最後にVRAMを解放する // 最後にVRAMを解放する
i = gpu_finish(); i = gpu_finish();
@@ -365,12 +419,13 @@ time_encode2 += GetTickCount() - time_start2;
} }
#ifdef TIMER #ifdef TIMER
printf("gpu-thread :\n"); printf("gpu-thread :\n");
if (time_encode2 > 0){ time_sec = (double)time_encode2 / CLOCKS_PER_SEC;
i = (int)((__int64)loop_count2 * unit_size * 125 / ((__int64)time_encode2 * 131072)); if (time_sec > 0){
time_speed = ((double)loop_count2 * unit_size) / (time_sec * 1048576);
} else { } else {
i = 0; time_speed = 0;
} }
printf(" 2nd encode %d.%03d sec, %d loop, %d MB/s\n", time_encode2 / 1000, time_encode2 % 1000, loop_count2, i); printf(" 2nd encode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2, time_speed);
#endif #endif
// 終了処理 // 終了処理
@@ -452,7 +507,7 @@ int encode_method1( // ソース・ブロックが一個だけの場合
block_off = 0; block_off = 0;
while (block_off < block_size){ while (block_off < block_size){
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
// ソース・ブロックを読み込む // ソース・ブロックを読み込む
len = s_blk[0].size - block_off; len = s_blk[0].size - block_off;
@@ -469,7 +524,7 @@ time_start = GetTickCount();
s_blk[0].crc = crc_update(s_blk[0].crc, buf, len); // without pad s_blk[0].crc = crc_update(s_blk[0].crc, buf, len); // without pad
checksum16_altmap(buf, buf + io_size, io_size); checksum16_altmap(buf, buf + io_size, io_size);
#ifdef TIMER #ifdef TIMER
time_read += GetTickCount() - time_start; time_read += clock() - time_start;
#endif #endif
// リカバリ・ファイルに書き込むサイズ // リカバリ・ファイルに書き込むサイズ
@@ -482,13 +537,13 @@ time_read += GetTickCount() - time_start;
// パリティ・ブロックごとに // パリティ・ブロックごとに
for (i = 0; i < parity_num; i++){ for (i = 0; i < parity_num; i++){
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
memset(work_buf, 0, unit_size); memset(work_buf, 0, unit_size);
// factor は 2の乗数になる // factor は 2の乗数になる
galois_align_multiply(buf, work_buf, unit_size, galois_power(2, first_num + i)); galois_align_multiply(buf, work_buf, unit_size, galois_power(2, first_num + i));
#ifdef TIMER #ifdef TIMER
time_calc += GetTickCount() - time_start; time_calc += clock() - time_start;
#endif #endif
// 経過表示 // 経過表示
@@ -502,7 +557,7 @@ time_calc += GetTickCount() - time_start;
} }
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
// パリティ・ブロックのチェックサムを検証する // パリティ・ブロックのチェックサムを検証する
checksum16_return(work_buf, hash, io_size); checksum16_return(work_buf, hash, io_size);
@@ -535,7 +590,7 @@ time_start = GetTickCount();
goto error_end; goto error_end;
} }
#ifdef TIMER #ifdef TIMER
time_write += GetTickCount() - time_start; time_write += clock() - time_start;
#endif #endif
} }
@@ -565,7 +620,7 @@ time_write += GetTickCount() - time_start;
} }
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
// 最後に Recovery Slice packet のヘッダーを書き込む // 最後に Recovery Slice packet のヘッダーを書き込む
for (i = 0; i < parity_num; i++){ for (i = 0; i < parity_num; i++){
@@ -581,14 +636,14 @@ time_start = GetTickCount();
} }
} }
#ifdef TIMER #ifdef TIMER
time_write += GetTickCount() - time_start; time_write += clock() - time_start;
#endif #endif
} }
#ifdef TIMER #ifdef TIMER
printf("read %d.%03d sec\n", time_read / 1000, time_read % 1000); printf("read %.3f sec\n", (double)time_read / CLOCKS_PER_SEC);
printf("write %d.%03d sec\n", time_write / 1000, time_write % 1000); printf("write %.3f sec\n", (double)time_write / CLOCKS_PER_SEC);
printf("encode %d.%03d sec\n", time_calc / 1000, time_calc % 1000); printf("encode %.3f sec\n", (double)time_calc / CLOCKS_PER_SEC);
#endif #endif
error_end: error_end:
@@ -729,7 +784,7 @@ int encode_method2( // ソース・データを全て読み込む場合
#ifdef TIMER #ifdef TIMER
read_count = 0; read_count = 0;
skip_count = 0; skip_count = 0;
time_start = GetTickCount(); time_start = clock();
#endif #endif
last_file = -1; last_file = -1;
for (i = 0; i < source_num; i++){ for (i = 0; i < source_num; i++){
@@ -830,7 +885,7 @@ skip_count++;
CloseHandle(hFile); CloseHandle(hFile);
hFile = NULL; hFile = NULL;
#ifdef TIMER #ifdef TIMER
time_read += GetTickCount() - time_start; time_read += clock() - time_start;
#endif #endif
WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE); // サブ・スレッドの計算終了の合図を待つ WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE); // サブ・スレッドの計算終了の合図を待つ
@@ -930,7 +985,7 @@ skip_count++;
} }
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
// パリティ・ブロックを書き込む // パリティ・ブロックを書き込む
work_buf = p_buf; work_buf = p_buf;
@@ -979,7 +1034,7 @@ time_start = GetTickCount();
} }
} }
#ifdef TIMER #ifdef TIMER
time_write += GetTickCount() - time_start; time_write += clock() - time_start;
#endif #endif
part_off += part_num; // 次のパリティ位置にする part_off += part_num; // 次のパリティ位置にする
@@ -1025,7 +1080,7 @@ time_write += GetTickCount() - time_start;
} }
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
// 最後に Recovery Slice packet のヘッダーを書き込む // 最後に Recovery Slice packet のヘッダーを書き込む
for (i = 0; i < parity_num; i++){ for (i = 0; i < parity_num; i++){
@@ -1041,13 +1096,13 @@ time_start = GetTickCount();
} }
} }
#ifdef TIMER #ifdef TIMER
time_write += GetTickCount() - time_start; time_write += clock() - time_start;
#endif #endif
} }
#ifdef TIMER #ifdef TIMER
printf("read %d.%03d sec\n", time_read / 1000, time_read % 1000); printf("read %.3f sec\n", (double)time_read / CLOCKS_PER_SEC);
printf("write %d.%03d sec\n", time_write / 1000, time_write % 1000); printf("write %.3f sec\n", (double)time_write / CLOCKS_PER_SEC);
if (prog_num != prog_base) if (prog_num != prog_base)
printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base); printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base);
#endif #endif
@@ -1186,7 +1241,7 @@ int encode_method3( // パリティ・ブロックを全て保持して、一度
src_off = source_off - 1; // まだ計算して無い印 src_off = source_off - 1; // まだ計算して無い印
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
for (i = 0; i < read_num; i++){ // スライスを一個ずつ読み込んでメモリー上に配置していく for (i = 0; i < read_num; i++){ // スライスを一個ずつ読み込んでメモリー上に配置していく
// ソース・ブロックを読み込む // ソース・ブロックを読み込む
@@ -1318,7 +1373,7 @@ time_start = GetTickCount();
memcpy(common_buf + packet_off + 16, file_md_ctx.hash, 16); memcpy(common_buf + packet_off + 16, file_md_ctx.hash, 16);
} }
#ifdef TIMER #ifdef TIMER
time_read += GetTickCount() - time_start; time_read += clock() - time_start;
#endif #endif
WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE); // サブ・スレッドの計算終了の合図を待つ WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE); // サブ・スレッドの計算終了の合図を待つ
@@ -1393,19 +1448,19 @@ time_read += GetTickCount() - time_start;
} }
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
memcpy(common_buf + common_size, common_buf, common_size); // 後の半分に前半のをコピーする memcpy(common_buf + common_size, common_buf, common_size); // 後の半分に前半のをコピーする
// 最後にパリティ・ブロックのチェックサムを検証して、リカバリ・ファイルに書き込む // 最後にパリティ・ブロックのチェックサムを検証して、リカバリ・ファイルに書き込む
err = create_recovery_file_1pass(file_path, recovery_path, packet_limit, block_distri, err = create_recovery_file_1pass(file_path, recovery_path, packet_limit, block_distri,
packet_num, common_buf, common_size, footer_buf, footer_size, rcv_hFile, p_buf, NULL, unit_size); packet_num, common_buf, common_size, footer_buf, footer_size, rcv_hFile, p_buf, NULL, unit_size);
#ifdef TIMER #ifdef TIMER
time_write = GetTickCount() - time_start; time_write = clock() - time_start;
#endif #endif
#ifdef TIMER #ifdef TIMER
printf("read %d.%03d sec\n", time_read / 1000, time_read % 1000); printf("read %.3f sec\n", (double)time_read / CLOCKS_PER_SEC);
printf("write %d.%03d sec\n", time_write / 1000, time_write % 1000); printf("write %.3f sec\n", (double)time_write / CLOCKS_PER_SEC);
if (prog_num != prog_base - prog_write * parity_num) if (prog_num != prog_base - prog_write * parity_num)
printf(" prog_num = %I64d != %I64d\n", prog_num, prog_base - prog_write * parity_num); printf(" prog_num = %I64d != %I64d\n", prog_num, prog_base - prog_write * parity_num);
#endif #endif
@@ -1577,7 +1632,7 @@ int encode_method4( // 全てのブロックを断片的に保持する場合 (G
#ifdef TIMER #ifdef TIMER
read_count = 0; read_count = 0;
skip_count = 0; skip_count = 0;
time_start = GetTickCount(); time_start = clock();
#endif #endif
last_file = -1; last_file = -1;
for (i = 0; i < source_num; i++){ for (i = 0; i < source_num; i++){
@@ -1678,7 +1733,7 @@ skip_count++;
CloseHandle(hFile); CloseHandle(hFile);
hFile = NULL; hFile = NULL;
#ifdef TIMER #ifdef TIMER
time_read += GetTickCount() - time_start; time_read += clock() - time_start;
#endif #endif
memset(g_buf, 0, (size_t)unit_size * parity_num); // 待機中に GPU用の領域をゼロ埋めしておく memset(g_buf, 0, (size_t)unit_size * parity_num); // 待機中に GPU用の領域をゼロ埋めしておく
@@ -1931,7 +1986,7 @@ skip_count++;
prog_num += th->size * parity_num; prog_num += th->size * parity_num;
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
// パリティ・ブロックを書き込む // パリティ・ブロックを書き込む
work_buf = p_buf; work_buf = p_buf;
@@ -1982,7 +2037,7 @@ time_start = GetTickCount();
} }
} }
#ifdef TIMER #ifdef TIMER
time_write += GetTickCount() - time_start; time_write += clock() - time_start;
#endif #endif
block_off += io_size; block_off += io_size;
@@ -2025,7 +2080,7 @@ time_write += GetTickCount() - time_start;
} }
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
// 最後に Recovery Slice packet のヘッダーを書き込む // 最後に Recovery Slice packet のヘッダーを書き込む
for (i = 0; i < parity_num; i++){ for (i = 0; i < parity_num; i++){
@@ -2041,13 +2096,13 @@ time_start = GetTickCount();
} }
} }
#ifdef TIMER #ifdef TIMER
time_write += GetTickCount() - time_start; time_write += clock() - time_start;
#endif #endif
} }
#ifdef TIMER #ifdef TIMER
printf("read %d.%03d sec\n", time_read / 1000, time_read % 1000); printf("read %.3f sec\n", (double)time_read / CLOCKS_PER_SEC);
printf("write %d.%03d sec\n", time_write / 1000, time_write % 1000); printf("write %.3f sec\n", (double)time_write / CLOCKS_PER_SEC);
if (prog_num != prog_base) if (prog_num != prog_base)
printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base); printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base);
#endif #endif
@@ -2220,7 +2275,7 @@ int encode_method5( // ソース・ブロックの一部とパリティ・ブロ
src_off = source_off - 1; // まだ計算して無い印 src_off = source_off - 1; // まだ計算して無い印
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
for (i = 0; i < read_num; i++){ // スライスを一個ずつ読み込んでメモリー上に配置していく for (i = 0; i < read_num; i++){ // スライスを一個ずつ読み込んでメモリー上に配置していく
// ソース・ブロックを読み込む // ソース・ブロックを読み込む
@@ -2351,7 +2406,7 @@ time_start = GetTickCount();
memcpy(common_buf + packet_off + 16, file_md_ctx.hash, 16); memcpy(common_buf + packet_off + 16, file_md_ctx.hash, 16);
} }
#ifdef TIMER #ifdef TIMER
time_read += GetTickCount() - time_start; time_read += clock() - time_start;
#endif #endif
if (source_off == 0) if (source_off == 0)
@@ -2589,19 +2644,19 @@ time_read += GetTickCount() - time_start;
} }
#ifdef TIMER #ifdef TIMER
time_start = GetTickCount(); time_start = clock();
#endif #endif
memcpy(common_buf + common_size, common_buf, common_size); // 後の半分に前半のをコピーする memcpy(common_buf + common_size, common_buf, common_size); // 後の半分に前半のをコピーする
// 最後にパリティ・ブロックのチェックサムを検証して、リカバリ・ファイルに書き込む // 最後にパリティ・ブロックのチェックサムを検証して、リカバリ・ファイルに書き込む
err = create_recovery_file_1pass(file_path, recovery_path, packet_limit, block_distri, err = create_recovery_file_1pass(file_path, recovery_path, packet_limit, block_distri,
packet_num, common_buf, common_size, footer_buf, footer_size, rcv_hFile, p_buf, g_buf, unit_size); packet_num, common_buf, common_size, footer_buf, footer_size, rcv_hFile, p_buf, g_buf, unit_size);
#ifdef TIMER #ifdef TIMER
time_write = GetTickCount() - time_start; time_write = clock() - time_start;
#endif #endif
#ifdef TIMER #ifdef TIMER
printf("read %d.%03d sec\n", time_read / 1000, time_read % 1000); printf("read %.3f sec\n", (double)time_read / CLOCKS_PER_SEC);
printf("write %d.%03d sec\n", time_write / 1000, time_write % 1000); printf("write %.3f sec\n", (double)time_write / CLOCKS_PER_SEC);
if (prog_num != prog_base - prog_write * parity_num) if (prog_num != prog_base - prog_write * parity_num)
printf(" prog_num = %I64d != %I64d\n", prog_num, prog_base - prog_write * parity_num); printf(" prog_num = %I64d != %I64d\n", prog_num, prog_base - prog_write * parity_num);
#endif #endif

View File

@@ -1,10 +1,14 @@
void calc_table(__local uint *mtab, int id, int factor) void calc_table(__local uint *mtab, int id, int factor)
{ {
int i, sum = 0; int i, sum, mask;
for (i = 0; i < 8; i++){ mask = (id & 1) ? 0xFFFF : 0;
sum = (id & (1 << i)) ? (sum ^ factor) : sum; sum = mask & factor;
factor = (factor & 0x8000) ? ((factor << 1) ^ 0x1100B) : (factor << 1); for (i = 1; i < 8; i++){
mask = (factor & 0x8000) ? 0x1100B : 0;
factor = (factor << 1) ^ mask;
mask = (id & (1 << i)) ? 0xFFFF : 0;
sum ^= mask & factor;
} }
mtab[id] = sum; mtab[id] = sum;
@@ -14,6 +18,32 @@ void calc_table(__local uint *mtab, int id, int factor)
mtab[id + 256] = sum; mtab[id + 256] = sum;
} }
void calc_table2(__local uint *mtab, int id, int factor, int factor2)
{
int i, sum, sum2, mask;
mask = (id & 1) ? 0xFFFF : 0;
sum = mask & factor;
sum2 = mask & factor2;
for (i = 1; i < 8; i++){
mask = (factor & 0x8000) ? 0x1100B : 0;
factor = (factor << 1) ^ mask;
mask = (factor2 & 0x8000) ? 0x1100B : 0;
factor2 = (factor2 << 1) ^ mask;
mask = (id & (1 << i)) ? 0xFFFF : 0;
sum ^= mask & factor;
sum2 ^= mask & factor2;
}
mtab[id] = sum | (sum2 << 16);
sum = (sum << 4) ^ (((sum << 16) >> 31) & 0x88058) ^ (((sum << 17) >> 31) & 0x4402C) ^ (((sum << 18) >> 31) & 0x22016) ^ (((sum << 19) >> 31) & 0x1100B);
sum = (sum << 4) ^ (((sum << 16) >> 31) & 0x88058) ^ (((sum << 17) >> 31) & 0x4402C) ^ (((sum << 18) >> 31) & 0x22016) ^ (((sum << 19) >> 31) & 0x1100B);
sum2 = (sum2 << 4) ^ (((sum2 << 16) >> 31) & 0x88058) ^ (((sum2 << 17) >> 31) & 0x4402C) ^ (((sum2 << 18) >> 31) & 0x22016) ^ (((sum2 << 19) >> 31) & 0x1100B);
sum2 = (sum2 << 4) ^ (((sum2 << 16) >> 31) & 0x88058) ^ (((sum2 << 17) >> 31) & 0x4402C) ^ (((sum2 << 18) >> 31) & 0x22016) ^ (((sum2 << 19) >> 31) & 0x1100B);
mtab[id + 256] = sum | (sum2 << 16);
}
__kernel void method1( __kernel void method1(
__global uint *src, __global uint *src,
__global uint *dst, __global uint *dst,
@@ -31,6 +61,7 @@ __kernel void method1(
dst[i] = 0; dst[i] = 0;
for (blk = 0; blk < blk_num; blk++){ for (blk = 0; blk < blk_num; blk++){
barrier(CLK_LOCAL_MEM_FENCE);
calc_table(mtab, table_id, factors[blk]); calc_table(mtab, table_id, factors[blk]);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@@ -42,7 +73,6 @@ __kernel void method1(
dst[i] ^= sum; dst[i] ^= sum;
} }
src += BLK_SIZE; src += BLK_SIZE;
barrier(CLK_LOCAL_MEM_FENCE);
} }
} }
@@ -65,6 +95,7 @@ __kernel void method2(
} }
for (blk = 0; blk < blk_num; blk++){ for (blk = 0; blk < blk_num; blk++){
barrier(CLK_LOCAL_MEM_FENCE);
calc_table(mtab, table_id, factors[blk]); calc_table(mtab, table_id, factors[blk]);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@@ -82,11 +113,10 @@ __kernel void method2(
dst[pos + 4] ^= ((sum1 & 0xFF00FF00) >> 8) | (sum2 & 0xFF00FF00); dst[pos + 4] ^= ((sum1 & 0xFF00FF00) >> 8) | (sum2 & 0xFF00FF00);
} }
src += BLK_SIZE; src += BLK_SIZE;
barrier(CLK_LOCAL_MEM_FENCE);
} }
} }
__kernel void method3( __kernel void method4(
__global uint4 *src, __global uint4 *src,
__global uint4 *dst, __global uint4 *dst,
__global ushort *factors, __global ushort *factors,
@@ -106,6 +136,7 @@ __kernel void method3(
} }
for (blk = 0; blk < blk_num; blk++){ for (blk = 0; blk < blk_num; blk++){
barrier(CLK_LOCAL_MEM_FENCE);
calc_table(mtab, table_id, factors[blk]); calc_table(mtab, table_id, factors[blk]);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@@ -124,11 +155,140 @@ __kernel void method3(
dst[i + 1] ^= as_uint4((uchar16)(r0.y, r0.w, r1.y, r1.w, r2.y, r2.w, r3.y, r3.w, r4.y, r4.w, r5.y, r5.w, r6.y, r6.w, r7.y, r7.w)); dst[i + 1] ^= as_uint4((uchar16)(r0.y, r0.w, r1.y, r1.w, r2.y, r2.w, r3.y, r3.w, r4.y, r4.w, r5.y, r5.w, r6.y, r6.w, r7.y, r7.w));
} }
src += BLK_SIZE / 4; src += BLK_SIZE / 4;
barrier(CLK_LOCAL_MEM_FENCE);
} }
} }
__kernel void method4( __kernel void method9(
__global uint *src,
__global uint *dst,
__global ushort *factors,
int blk_num)
{
__local uint mtab[512];
int i, blk;
uint v, sum, sum2;
const int work_id = get_global_id(0);
const int work_size = get_global_size(0);
const int table_id = get_local_id(0);
for (i = work_id; i < BLK_SIZE; i += work_size){
dst[i] = 0;
dst[i + BLK_SIZE] = 0;
}
for (blk = 0; blk < blk_num; blk++){
barrier(CLK_LOCAL_MEM_FENCE);
calc_table2(mtab, table_id, factors[blk], factors[blk_num + blk]);
barrier(CLK_LOCAL_MEM_FENCE);
for (i = work_id; i < BLK_SIZE; i += work_size){
v = src[i];
sum = mtab[(uchar)v] ^ mtab[256 + (uchar)(v >> 8)];
sum2 = mtab[(uchar)(v >> 16)] ^ mtab[256 + (v >> 24)];
dst[i] ^= (sum & 0xFFFF) | (sum2 << 16);
dst[i + BLK_SIZE] ^= (sum >> 16) | (sum2 & 0xFFFF0000);
}
src += BLK_SIZE;
}
}
__kernel void method10(
__global uint *src,
__global uint *dst,
__global ushort *factors,
int blk_num)
{
__local uint mtab[512];
int i, blk, pos;
uint lo, hi, sum1, sum2, sum3, sum4;
const int work_id = get_global_id(0) * 2;
const int work_size = get_global_size(0) * 2;
const int table_id = get_local_id(0);
for (i = work_id; i < BLK_SIZE; i += work_size){
dst[i ] = 0;
dst[i + 1] = 0;
dst[i + BLK_SIZE ] = 0;
dst[i + BLK_SIZE + 1] = 0;
}
for (blk = 0; blk < blk_num; blk++){
barrier(CLK_LOCAL_MEM_FENCE);
calc_table2(mtab, table_id, factors[blk], factors[blk_num + blk]);
barrier(CLK_LOCAL_MEM_FENCE);
for (i = work_id; i < BLK_SIZE; i += work_size){
pos = (i & ~7) + ((i & 7) >> 1);
lo = src[pos ];
hi = src[pos + 4];
sum1 = mtab[(uchar)lo] ^ mtab[256 + (uchar)hi];
sum2 = mtab[(uchar)(lo >> 8)] ^ mtab[256 + (uchar)(hi >> 8)];
sum3 = mtab[(uchar)(lo >> 16)] ^ mtab[256 + (uchar)(hi >> 16)];
sum4 = mtab[lo >> 24] ^ mtab[256 + (hi >> 24)];
dst[pos ] ^= (sum1 & 0xFF) | ((sum2 & 0xFF) << 8) | ((sum3 & 0xFF) << 16) | (sum4 << 24);
dst[pos + 4] ^= ((sum1 >> 8) & 0xFF) | (sum2 & 0xFF00) | ((sum3 & 0xFF00) << 8) | ((sum4 & 0xFF00) << 16);
dst[pos + BLK_SIZE ] ^= ((sum1 >> 16) & 0xFF) | ((sum2 >> 8) & 0xFF00) | (sum3 & 0xFF0000) | ((sum4 & 0xFF0000) << 8);
dst[pos + BLK_SIZE + 4] ^= (sum1 >> 24) | ((sum2 >> 16) & 0xFF00) | ((sum3 >> 8) & 0xFF0000) | (sum4 & 0xFF000000);
}
src += BLK_SIZE;
}
}
__kernel void method12(
__global uint4 *src,
__global uint4 *dst,
__global ushort *factors,
int blk_num)
{
__local uint mtab[512];
int i, blk;
uchar4 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, rA, rB, rC, rD, rE, rF;
uchar16 lo, hi;
const int work_id = get_global_id(0) * 2;
const int work_size = get_global_size(0) * 2;
const int table_id = get_local_id(0);
for (i = work_id; i < BLK_SIZE / 4; i += work_size){
dst[i ] = 0;
dst[i + 1] = 0;
dst[i + BLK_SIZE / 4 ] = 0;
dst[i + BLK_SIZE / 4 + 1] = 0;
}
for (blk = 0; blk < blk_num; blk++){
barrier(CLK_LOCAL_MEM_FENCE);
calc_table2(mtab, table_id, factors[blk], factors[blk_num + blk]);
barrier(CLK_LOCAL_MEM_FENCE);
for (i = work_id; i < BLK_SIZE / 4; i += work_size){
lo = as_uchar16(src[i ]);
hi = as_uchar16(src[i + 1]);
r0 = as_uchar4(mtab[lo.s0] ^ mtab[256 + hi.s0]);
r1 = as_uchar4(mtab[lo.s1] ^ mtab[256 + hi.s1]);
r2 = as_uchar4(mtab[lo.s2] ^ mtab[256 + hi.s2]);
r3 = as_uchar4(mtab[lo.s3] ^ mtab[256 + hi.s3]);
r4 = as_uchar4(mtab[lo.s4] ^ mtab[256 + hi.s4]);
r5 = as_uchar4(mtab[lo.s5] ^ mtab[256 + hi.s5]);
r6 = as_uchar4(mtab[lo.s6] ^ mtab[256 + hi.s6]);
r7 = as_uchar4(mtab[lo.s7] ^ mtab[256 + hi.s7]);
r8 = as_uchar4(mtab[lo.s8] ^ mtab[256 + hi.s8]);
r9 = as_uchar4(mtab[lo.s9] ^ mtab[256 + hi.s9]);
rA = as_uchar4(mtab[lo.sa] ^ mtab[256 + hi.sa]);
rB = as_uchar4(mtab[lo.sb] ^ mtab[256 + hi.sb]);
rC = as_uchar4(mtab[lo.sc] ^ mtab[256 + hi.sc]);
rD = as_uchar4(mtab[lo.sd] ^ mtab[256 + hi.sd]);
rE = as_uchar4(mtab[lo.se] ^ mtab[256 + hi.se]);
rF = as_uchar4(mtab[lo.sf] ^ mtab[256 + hi.sf]);
dst[i ] ^= as_uint4((uchar16)(r0.x, r1.x, r2.x, r3.x, r4.x, r5.x, r6.x, r7.x, r8.x, r9.x, rA.x, rB.x, rC.x, rD.x, rE.x, rF.x));
dst[i + 1] ^= as_uint4((uchar16)(r0.y, r1.y, r2.y, r3.y, r4.y, r5.y, r6.y, r7.y, r8.y, r9.y, rA.y, rB.y, rC.y, rD.y, rE.y, rF.y));
dst[i + BLK_SIZE / 4 ] ^= as_uint4((uchar16)(r0.z, r1.z, r2.z, r3.z, r4.z, r5.z, r6.z, r7.z, r8.z, r9.z, rA.z, rB.z, rC.z, rD.z, rE.z, rF.z));
dst[i + BLK_SIZE / 4 + 1] ^= as_uint4((uchar16)(r0.w, r1.w, r2.w, r3.w, r4.w, r5.w, r6.w, r7.w, r8.w, r9.w, rA.w, rB.w, rC.w, rD.w, rE.w, rF.w));
}
src += BLK_SIZE / 4;
}
}
__kernel void method16(
__global uint *src, __global uint *src,
__global uint *dst, __global uint *dst,
__global ushort *factors, __global ushort *factors,
@@ -136,7 +296,7 @@ __kernel void method4(
{ {
__local int table[16]; __local int table[16];
__local uint cache[256]; __local uint cache[256];
int i, j, blk, pos, sht, mask; int i, j, blk, pos, mask, tmp;
uint sum; uint sum;
const int work_id = get_global_id(0); const int work_id = get_global_id(0);
const int work_size = get_global_size(0); const int work_size = get_global_size(0);
@@ -146,11 +306,12 @@ __kernel void method4(
for (blk = 0; blk < blk_num; blk++){ for (blk = 0; blk < blk_num; blk++){
if (get_local_id(0) == 0){ if (get_local_id(0) == 0){
pos = factors[blk] << 16; tmp = factors[blk];
table[0] = pos; table[0] = tmp;
for (j = 1; j < 16; j++){ for (j = 1; j < 16; j++){
pos = (pos << 1) ^ ((pos >> 31) & 0x100B0000); mask = (tmp & 0x8000) ? 0x1100B : 0;
table[j] = pos; tmp = (tmp << 1) ^ mask;
table[j] = tmp;
} }
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@@ -161,10 +322,11 @@ __kernel void method4(
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
sum = 0; sum = 0;
sht = (i & 60) >> 2; tmp = (i & 60) >> 2;
tmp = 0x8000 >> tmp;
pos &= ~60; pos &= ~60;
for (j = 15; j >= 0; j--){ for (j = 15; j >= 0; j--){
mask = (table[j] << sht) >> 31; mask = (table[j] & tmp) ? 0xFFFFFFFF : 0;
sum ^= mask & cache[pos]; sum ^= mask & cache[pos];
pos += 4; pos += 4;
} }