From 1552fb8ec8a5d1fe8e698552aa8dae4569eab523 Mon Sep 17 00:00:00 2001 From: Yutaka Sawada <60930312+Yutaka-Sawada@users.noreply.github.com> Date: Tue, 26 Dec 2023 18:53:12 +0900 Subject: [PATCH] Add files via upload --- source/par2j/Command_par2j.txt | 24 +- source/par2j/create.c | 47 ++-- source/par2j/gf16.c | 2 +- source/par2j/gf16.h | 2 +- source/par2j/lib_opencl.c | 418 ++++++++++++++++++++++++--------- source/par2j/lib_opencl.h | 7 +- source/par2j/list.c | 35 +-- source/par2j/md5_crc.c | 161 +++++++------ source/par2j/par2_cmd.c | 10 +- source/par2j/reedsolomon.c | 33 +-- source/par2j/rs_decode.c | 194 +++++++++------ source/par2j/rs_encode.c | 223 +++++++++++------- source/par2j/source.cl | 194 +++++++++++++-- 13 files changed, 920 insertions(+), 430 deletions(-) diff --git a/source/par2j/Command_par2j.txt b/source/par2j/Command_par2j.txt index 22e761f..bb52812 100644 --- a/source/par2j/Command_par2j.txt +++ b/source/par2j/Command_par2j.txt @@ -1,4 +1,4 @@ -[ par2j.exe - version 1.3.3.1 or later ] +[ par2j.exe - version 1.3.3.2 or later ] Type "par2j.exe" to see version, test integrity, and show usage below. @@ -369,14 +369,22 @@ The format is "/lc#" (# is from 1 to 32 as the number of using threads). 0: It uses the number of physical Cores. 255: It tries to use more threads than number of physical Cores. - You may set additional combinations; -+1024 to disable CLMUL (and use slower SSSE3 code), -+2048 to disable JIT (for SSE2), -+4096 to disable SSSE3, -+8192 to disable AVX2, -+256 or +512 (slower device) to enable GPU acceleration. + You may set additional combinations for CPU feature; ++1024 to disable CLMUL (and use slower SSSE3 code) ++2048 to disable JIT (for SSE2) ++4096 to disable SSSE3 ++8192 to disable AVX2 - for example, /lc1 to use single Core, /lc45 to use half Cores and GPU + You may set additional combinations for GPU control; ++256 or +512 (slower device) to enable GPU acceleration ++65536 for classic method ++131072 for 16-byte memory access ++262144 for 4-byte memory access and calculate 2 blocks at once ++524288 for 16-byte memory access and calculate 2 blocks at once ++1048576 for CL_MEM_COPY_HOST_PTR or +2097152 for CL_MEM_USE_HOST_PTR +(When you set exclusive bits, larger value will be used.) + + for example, /lc1 to use single Core, /lc508 to use half Cores and GPU /m : Set this, if you want to set memory usage. diff --git a/source/par2j/create.c b/source/par2j/create.c index 113239b..54ebd88 100644 --- a/source/par2j/create.c +++ b/source/par2j/create.c @@ -1,5 +1,5 @@ // create.c -// Copyright : 2023-10-22 Yutaka Sawada +// Copyright : 2023-12-12 Yutaka Sawada // License : GPL #ifndef _UNICODE @@ -26,6 +26,11 @@ //#define TIMER // 実験用 +#ifdef TIMER +#include +static double time_sec, time_speed; +#endif + // ソート時に項目を比較する static int sort_cmp(const void *elem1, const void *elem2) { @@ -196,7 +201,7 @@ int set_common_packet( __int64 prog_now = 0; #ifdef TIMER -unsigned int time_start = GetTickCount(); +clock_t time_start = clock(); #endif print_progress_text(0, "Computing file hash"); @@ -305,14 +310,14 @@ unsigned int time_start = GetTickCount(); off += (64 + main_packet_size); #ifdef TIMER -time_start = GetTickCount() - time_start; -printf("hash %d.%03d sec", time_start / 1000, time_start % 1000); -if (time_start > 0){ - time_start = (int)((total_file_size * 125) / ((__int64)time_start * 131072)); - printf(", %d MB/s\n", time_start); +time_start = clock() - time_start; +time_sec = (double)time_start / CLOCKS_PER_SEC; +if (time_sec > 0){ + time_speed = (double)total_file_size / (time_sec * 1048576); } else { - printf("\n"); + time_speed = 0; } +printf("hash %.3f sec, %.0f MB/s\n", time_sec, time_speed); #endif error_end: @@ -341,7 +346,7 @@ int set_common_packet_multi( FILE_HASH_TH th[MAX_MULTI_READ]; #ifdef TIMER -unsigned int time_start = GetTickCount(); +clock_t time_start = clock(); #endif memset(hSub, 0, sizeof(HANDLE) * MAX_MULTI_READ); @@ -545,14 +550,14 @@ unsigned int time_start = GetTickCount(); } print_progress_done(); // 改行して行の先頭に戻しておく #ifdef TIMER -time_start = GetTickCount() - time_start; -printf("hash %d.%03d sec", time_start / 1000, time_start % 1000); -if (time_start > 0){ - time_start = (int)((total_file_size * 125) / ((__int64)time_start * 131072)); - printf(", %d MB/s\n", time_start); +time_start = clock() - time_start; +time_sec = (double)time_start / CLOCKS_PER_SEC; +if (time_sec > 0){ + time_speed = (double)total_file_size / (time_sec * 1048576); } else { - printf("\n"); + time_speed = 0; } +printf("hash %.3f sec, %.0f MB/s\n", time_sec, time_speed); #endif error_end: @@ -700,7 +705,7 @@ int set_common_packet_hash( __int64 prog_now = 0; #ifdef TIMER -unsigned int time_start = GetTickCount(); +clock_t time_start = clock(); #endif print_progress_text(0, "Computing file hash"); @@ -740,8 +745,8 @@ unsigned int time_start = GetTickCount(); print_progress_done(); // 改行して行の先頭に戻しておく #ifdef TIMER -time_start = GetTickCount() - time_start; -printf("hash %d.%03d sec\n", time_start / 1000, time_start % 1000); +time_start = clock() - time_start; +printf("hash %.3f sec\n", (double)time_start / CLOCKS_PER_SEC); #endif return 0; } @@ -1065,7 +1070,7 @@ int create_recovery_file( #endif #ifdef TIMER -unsigned int time_start = GetTickCount(); +clock_t time_start = clock(); #endif print_progress_text(0, "Constructing recovery file"); time_last = GetTickCount(); @@ -1258,8 +1263,8 @@ unsigned int time_start = GetTickCount(); print_progress_done(); // 改行して行の先頭に戻しておく #ifdef TIMER -time_start = GetTickCount() - time_start; -printf("write %d.%03d sec\n", time_start / 1000, time_start % 1000); +time_start = clock() - time_start; +printf("write %.3f sec\n", (double)time_start / CLOCKS_PER_SEC); #endif return 0; diff --git a/source/par2j/gf16.c b/source/par2j/gf16.c index 754794d..36ad81a 100644 --- a/source/par2j/gf16.c +++ b/source/par2j/gf16.c @@ -2795,7 +2795,7 @@ void galois_align_xor( void galois_align16_multiply( unsigned char *r1, // Region to multiply (must be aligned by 16) unsigned char *r2, // Products go here - unsigned int len, // Byte length (must be multiple of 32) + unsigned int len, // Byte length (must be multiple of 16) int factor) // Number to multiply by { if (factor <= 1){ diff --git a/source/par2j/gf16.h b/source/par2j/gf16.h index 52e95b8..d797f7e 100644 --- a/source/par2j/gf16.h +++ b/source/par2j/gf16.h @@ -6,7 +6,7 @@ extern "C" { #endif -extern unsigned short *galois_log_table; +//extern unsigned short *galois_log_table; extern unsigned int cpu_flag; int galois_create_table(void); // Returns 0 on success, -1 on failure diff --git a/source/par2j/lib_opencl.c b/source/par2j/lib_opencl.c index ea4411a..455cd17 100644 --- a/source/par2j/lib_opencl.c +++ b/source/par2j/lib_opencl.c @@ -1,5 +1,5 @@ // lib_opencl.c -// Copyright : 2023-11-27 Yutaka Sawada +// Copyright : 2023-12-26 Yutaka Sawada // License : GPL #ifndef _WIN32_WINNT @@ -84,7 +84,7 @@ cl_command_queue OpenCL_command = NULL; cl_kernel OpenCL_kernel = NULL; cl_mem OpenCL_src = NULL, OpenCL_dst = NULL, OpenCL_buf = NULL; size_t OpenCL_group_num; -int OpenCL_method = 0; // 正=速い機器を選ぶ, 負=遅い機器を選ぶ +int OpenCL_method = 0; // 標準では GPU を使わず、動作は自動選択される API_clCreateBuffer gfn_clCreateBuffer; API_clReleaseMemObject gfn_clReleaseMemObject; @@ -100,7 +100,11 @@ API_clEnqueueNDRangeKernel gfn_clEnqueueNDRangeKernel; /* 入力 -OpenCL_method : どのデバイスを選ぶか +OpenCL_method : どのデバイスや関数を選ぶか + 0x100 = 速い機器を選ぶ, 0x200 = 遅い機器を選ぶ + 0x10000 = 1ブロックずつ計算する, 0x20000 = 2ブロックずつ計算しようとする + 0x40000 = 4-byte memory access, 0x80000 = try 16-byte memory access + 0x100000 = CL_MEM_COPY_HOST_PTR, 0x200000 = CL_MEM_USE_HOST_PTR unit_size : ブロックの単位サイズ src_max : ソース・ブロック個数 @@ -111,11 +115,12 @@ OpenCL_method : 動作フラグいろいろ */ // 0=成功, 1~エラー番号 -int init_OpenCL(int unit_size, int *src_max) +int init_OpenCL(unsigned int unit_size, int *src_max) { char buf[2048], *p_source; int err = 0, i, j; - int gpu_power, count, gpu_flag; + int gpu_power, count; + int unified_memory; // non zero = Integrated GPU size_t data_size, alloc_max; //FILE *fp; HRSRC res; @@ -136,9 +141,10 @@ int init_OpenCL(int unit_size, int *src_max) API_clReleaseProgram fn_clReleaseProgram; API_clCreateKernel fn_clCreateKernel; API_clGetKernelWorkGroupInfo fn_clGetKernelWorkGroupInfo; + API_clReleaseKernel fn_clReleaseKernel; cl_int ret; cl_uint num_platforms = 0, num_devices = 0, num_groups, param_value; - cl_ulong param_value8, param_value4; + cl_ulong param_value8; cl_platform_id platform_id[MAX_DEVICE], selected_platform; // Intel, AMD, Nvidia などドライバーの提供元 cl_device_id device_id[MAX_DEVICE], selected_device; // CPU や GPU など cl_program program; @@ -215,6 +221,9 @@ int init_OpenCL(int unit_size, int *src_max) fn_clGetKernelWorkGroupInfo = (API_clGetKernelWorkGroupInfo)GetProcAddress(hLibOpenCL, "clGetKernelWorkGroupInfo"); if (fn_clGetKernelWorkGroupInfo == NULL) return err; + fn_clReleaseKernel = (API_clReleaseKernel)GetProcAddress(hLibOpenCL, "clReleaseKernel"); + if (fn_clReleaseKernel == NULL) + return err; gfn_clFinish = (API_clFinish)GetProcAddress(hLibOpenCL, "clFinish"); if (gfn_clFinish == NULL) return err; @@ -226,12 +235,10 @@ int init_OpenCL(int unit_size, int *src_max) ret = fn_clGetPlatformIDs(MAX_DEVICE, platform_id, &num_platforms); if (ret != CL_SUCCESS) return (ret << 8) | 10; - if (OpenCL_method >= 0){ // 選択する順序と初期値を変える - OpenCL_method = 1; - gpu_power = 0; - } else { - OpenCL_method = -1; + if (OpenCL_method & 0x200){ // 選択する順序と初期値を変える gpu_power = INT_MIN; + } else { + gpu_power = 0; } alloc_max = 0; @@ -268,20 +275,17 @@ int init_OpenCL(int unit_size, int *src_max) #endif // 取得できなくてもエラーにしない - param_value = 0; // CL_DEVICE_HOST_UNIFIED_MEMORY は OpenCL 2.0 以降で非推奨になった ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_uint), ¶m_value, NULL); + if (ret == CL_SUCCESS){ + if (param_value != 0){ #ifdef DEBUG_OUTPUT - if (ret == CL_SUCCESS) - printf("HOST_UNIFIED_MEMORY = %d\n", param_value); -#endif - if (param_value != 0) - param_value = 1; - param_value4 = 0; // local memory が多い時だけ処理を変える - ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), ¶m_value4, NULL); -#ifdef DEBUG_OUTPUT - if (ret == CL_SUCCESS) - printf("LOCAL_MEM_SIZE = %I64d KB\n", param_value4 >> 10); + printf("HOST_UNIFIED_MEMORY = %d\n", param_value); #endif + param_value = 1; + } + } else { // CL_DEVICE_HOST_UNIFIED_MEMORY は OpenCL 2.0 以降で非推奨になった + param_value = 0; + } // 取得できない場合はエラーにする ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), ¶m_value8, NULL); @@ -302,7 +306,8 @@ int init_OpenCL(int unit_size, int *src_max) #endif // MAX_COMPUTE_UNITS * MAX_WORK_GROUP_SIZE で計算力を測る、外付けGPUなら値を倍にする count = (2 - param_value) * (int)data_size * num_groups; - count *= OpenCL_method; // 符号を変える + if (OpenCL_method & 0x200) // Prefer slower device + count *= -1; // 符号を変える //printf("prev = %d, now = %d\n", gpu_power, count); if ((count > gpu_power) && (data_size >= 256) && // 256以上ないとテーブルを作れない (param_value8 / 8 > (cl_ulong)unit_size)){ // CL_DEVICE_MAX_MEM_ALLOC_SIZE に収まるか @@ -311,9 +316,7 @@ int init_OpenCL(int unit_size, int *src_max) selected_platform = platform_id[i]; OpenCL_group_num = num_groups; // ワークグループ数は COMPUTE_UNITS 数にする alloc_max = (size_t)param_value8; - gpu_flag = param_value; // 0 = discrete GPU, 1 = integrated GPU - if (param_value4 >= 32768) - gpu_flag |= 2; // local memory が 32KB 以上あるかどうか + unified_memory = param_value; // 0 = discrete GPU, 1 = integrated GPU // AMD や Intel の GPU ではメモリー領域が全体の 1/4 とは限らない ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), ¶m_value8, NULL); @@ -355,49 +358,6 @@ int init_OpenCL(int unit_size, int *src_max) if (ret != CL_SUCCESS) return (ret << 8) | 12; - // 計算方式を選択する - if ((((cpu_flag & 0x101) == 1) || ((cpu_flag & 0x110) == 0x10)) && (sse_unit == 32)){ - if (gpu_flag & 2){ - OpenCL_method = 3; // local memory が 32KB 以上あれば 16-byte ずつアクセスする - } else { - OpenCL_method = 2; // SSSE3 & ALTMAP または AVX2 ならデータの並び替え対応版を使う - } - } else if (((cpu_flag & 128) != 0) && (sse_unit == 256)){ - OpenCL_method = 4; // JIT(SSE2) は bit ごとに上位から 16バイトずつ並ぶ - // ローカルのテーブルサイズが異なることに注意 - // XOR 方式以外は 2KB (4バイト * 256項目 * 2個) 使う - // XOR (JIT) は 64バイト (4バイト * 16項目) 使う - } else { - OpenCL_method = 1; // 並び替えられてないデータ用 - } - - // work group 数が必要以上に多い場合は減らす - if (OpenCL_method == 2){ - // work item 一個が 8バイトずつ計算する、256個なら work group ごとに 2KB 担当する - data_size = unit_size / 2048; - } else if (OpenCL_method == 3){ - // work item 一個が 32バイトずつ計算する、256個なら work group ごとに 8KB 担当する - data_size = unit_size / 8192; - } else { - // work item 一個が 4バイトずつ計算する、256個なら work group ごとに 1KB 担当する - data_size = unit_size / 1024; - } - if (OpenCL_group_num > data_size){ - OpenCL_group_num = data_size; - printf("Number of work groups is reduced to %zd\n", OpenCL_group_num); - } - - // データへのアクセス方法をデバイスによって変える - if (gpu_flag & 1){ - OpenCL_method |= 8; // Integrated GPU なら CL_MEM_USE_HOST_PTR を使う - } else { // Discrete GPU なら NVIDIA のだけ flag を変える - ret = fn_clGetDeviceInfo(selected_device, CL_DEVICE_VERSION, sizeof(buf), buf, NULL); - if (ret == CL_SUCCESS){ - if (strstr(buf, "CUDA") != NULL) - OpenCL_method |= 8; // NVIDIA GPU なら CL_MEM_USE_HOST_PTR を使う - } - } - // 最大で何ブロック分のメモリー領域を保持できるのか(ここではまだ確保しない) // 後で実際に確保する量はこれよりも少なくなる count = (int)(alloc_max / unit_size); // 確保できるメモリー量から逆算する @@ -409,25 +369,6 @@ int init_OpenCL(int unit_size, int *src_max) printf("src buf : %zd KB (%d blocks), possible\n", data_size >> 10, count); #endif - // 出力先は1ブロック分だけあればいい - // CL_MEM_ALLOC_HOST_PTRを使えばpinned memoryになるらしい - data_size = unit_size; - OpenCL_dst = gfn_clCreateBuffer(OpenCL_context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, data_size, NULL, &ret); - if (ret != CL_SUCCESS) - return (ret << 8) | 13; -#ifdef DEBUG_OUTPUT - printf("dst buf : %zd KB (%zd Bytes), OK\n", data_size >> 10, data_size); -#endif - - // factor は最大個数分 (src_max個) - data_size = sizeof(unsigned short) * (*src_max); - OpenCL_buf = gfn_clCreateBuffer(OpenCL_context, CL_MEM_READ_ONLY, data_size, NULL, &ret); - if (ret != CL_SUCCESS) - return (ret << 8) | 14; -#ifdef DEBUG_OUTPUT - printf("factor buf : %zd Bytes (%d factors), OK\n", data_size, (*src_max)); -#endif - /* // テキスト形式の OpenCL C ソース・コードを読み込む err = 4; @@ -528,18 +469,208 @@ int init_OpenCL(int unit_size, int *src_max) return (ret << 8) | 21; } - // カーネル関数を抽出する - wsprintfA(buf, "method%d", OpenCL_method & 7); - OpenCL_kernel = fn_clCreateKernel(program, buf, &ret); - if (ret != CL_SUCCESS) - return (ret << 8) | 22; + // 計算方式を選択する + if ((((cpu_flag & 0x101) == 1) || ((cpu_flag & 0x110) == 0x10)) && (sse_unit == 32)){ + int select_method; // SSSE3 & ALTMAP または AVX2 ならデータの並び替え対応版を使う + if (OpenCL_method & 0x80000){ // 16-byte and 2 blocks + select_method = 12; + } else if (OpenCL_method & 0x40000){ // 4-byte and 2 blocks + select_method = 10; + } else if (OpenCL_method & 0x20000){ // 16-byte + select_method = 4; + } else if (OpenCL_method & 0x10000){ // 4-byte + select_method = 2; + } else { // kernel を作って詳細を確かめる + size_t item2, item4, item10, item12; + cl_kernel kernel2, kernel4, kernel10, kernel12; + item2 = item4 = item10 = item12 = 0; + // まずは一番重くて速い奴を調べる + wsprintfA(buf, "method%d", 12); + kernel12 = fn_clCreateKernel(program, buf, &ret); + if (ret == CL_SUCCESS){ + ret = fn_clGetKernelWorkGroupInfo(kernel12, selected_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &item12, NULL); + if (ret == CL_SUCCESS){ #ifdef DEBUG_OUTPUT - printf("CreateKernel : %s\n", buf); + printf("\nTesting %s\n", buf); + printf("PREFERRED_WORK_GROUP_SIZE_MULTIPLE = %zu\n", item12); #endif + } + } + if (item12 >= 32){ // 32以上あれば余裕で動くとみなす + select_method = 12; + OpenCL_kernel = kernel12; +#ifdef DEBUG_OUTPUT + printf("\nSelected method%d\n", select_method); +#endif + } else { // 他の奴と比較する + wsprintfA(buf, "method%d", 2); + kernel2 = fn_clCreateKernel(program, buf, &ret); + if (ret == CL_SUCCESS){ + ret = fn_clGetKernelWorkGroupInfo(kernel2, selected_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &item2, NULL); + if (ret == CL_SUCCESS){ +#ifdef DEBUG_OUTPUT + printf("\nTesting %s\n", buf); + printf("PREFERRED_WORK_GROUP_SIZE_MULTIPLE = %zu\n", item2); +#endif + } + } + if (item12 >= item2){ + select_method = 12; + OpenCL_kernel = kernel12; + ret = fn_clReleaseKernel(kernel2); +#ifdef DEBUG_OUTPUT + if (ret != CL_SUCCESS) + printf("clReleaseKernel : Failed\n"); + printf("\nSelected method%d\n", select_method); +#endif + } else { + ret = fn_clReleaseKernel(kernel12); +#ifdef DEBUG_OUTPUT + if (ret != CL_SUCCESS) + printf("clReleaseKernel : Failed\n"); +#endif + wsprintfA(buf, "method%d", 10); + kernel10 = fn_clCreateKernel(program, buf, &ret); + if (ret == CL_SUCCESS){ + ret = fn_clGetKernelWorkGroupInfo(kernel10, selected_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &item10, NULL); + if (ret == CL_SUCCESS){ +#ifdef DEBUG_OUTPUT + printf("\nTesting %s\n", buf); + printf("PREFERRED_WORK_GROUP_SIZE_MULTIPLE = %zu\n", item10); +#endif + } + } + if (item10 >= item2){ + select_method = 10; + OpenCL_kernel = kernel10; + ret = fn_clReleaseKernel(kernel2); +#ifdef DEBUG_OUTPUT + if (ret != CL_SUCCESS) + printf("clReleaseKernel : Failed\n"); + printf("\nSelected method%d\n", select_method); +#endif + } else { + wsprintfA(buf, "method%d", 4); + kernel4 = fn_clCreateKernel(program, buf, &ret); + if (ret == CL_SUCCESS){ + ret = fn_clGetKernelWorkGroupInfo(kernel4, selected_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &item4, NULL); + if (ret == CL_SUCCESS){ +#ifdef DEBUG_OUTPUT + printf("\nTesting %s\n", buf); + printf("PREFERRED_WORK_GROUP_SIZE_MULTIPLE = %zu\n", item4); +#endif + } + } + if (item4 >= item2){ + select_method = 4; + OpenCL_kernel = kernel4; + ret = fn_clReleaseKernel(kernel2); +#ifdef DEBUG_OUTPUT + if (ret != CL_SUCCESS) + printf("clReleaseKernel : Failed\n"); + printf("\nSelected method%d\n", select_method); +#endif + } else { + select_method = 2; + OpenCL_kernel = kernel2; + ret = fn_clReleaseKernel(kernel4); +#ifdef DEBUG_OUTPUT + if (ret != CL_SUCCESS) + printf("clReleaseKernel : Failed\n"); + printf("\nSelected method%d\n", select_method); +#endif + } + } + } + } + } + OpenCL_method |= select_method; + } else if (((cpu_flag & 128) != 0) && (sse_unit == 256)){ + OpenCL_method |= 16; // JIT(SSE2) は bit ごとに上位から 16バイトずつ並ぶ + // ローカルのテーブルサイズが異なることに注意 + // XOR 方式以外は 2KB (4バイト * 256項目 * 2個) 使う + // XOR (JIT) は 64バイト (4バイト * 16項目) 使う + } else { + int select_method; // 並び替えられてないデータ用 + if (OpenCL_method & 0x40000){ // 4-byte and 2 blocks + select_method = 9; + } else if (OpenCL_method & 0x10000){ // 4-byte + select_method = 1; + } else { // kernel を作って詳細を確かめる + size_t item1, item9; + cl_kernel kernel1, kernel9; + item1 = item9 = 0; + // まずは一番重くて速い奴を調べる + wsprintfA(buf, "method%d", 9); + kernel9 = fn_clCreateKernel(program, buf, &ret); + if (ret == CL_SUCCESS){ + ret = fn_clGetKernelWorkGroupInfo(kernel9, selected_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &item9, NULL); + if (ret == CL_SUCCESS){ +#ifdef DEBUG_OUTPUT + printf("\nTesting %s\n", buf); + printf("PREFERRED_WORK_GROUP_SIZE_MULTIPLE = %zu\n", item9); +#endif + } + } + if (item9 >= 32){ // 32以上あれば余裕で動くとみなす + select_method = 9; + OpenCL_kernel = kernel9; +#ifdef DEBUG_OUTPUT + printf("\nSelected method%d\n", select_method); +#endif + } else { // 他の奴と比較する + wsprintfA(buf, "method%d", 1); + kernel1 = fn_clCreateKernel(program, buf, &ret); + if (ret == CL_SUCCESS){ + ret = fn_clGetKernelWorkGroupInfo(kernel1, selected_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &item1, NULL); + if (ret == CL_SUCCESS){ +#ifdef DEBUG_OUTPUT + printf("\nTesting %s\n", buf); + printf("PREFERRED_WORK_GROUP_SIZE_MULTIPLE = %zu\n", item1); +#endif + } + } + if (item9 >= item1){ + select_method = 9; + OpenCL_kernel = kernel9; + ret = fn_clReleaseKernel(kernel1); +#ifdef DEBUG_OUTPUT + if (ret != CL_SUCCESS) + printf("clReleaseKernel : Failed\n"); + printf("\nSelected method%d\n", select_method); +#endif + } else { + select_method = 1; + OpenCL_kernel = kernel1; + ret = fn_clReleaseKernel(kernel9); +#ifdef DEBUG_OUTPUT + if (ret != CL_SUCCESS) + printf("clReleaseKernel : Failed\n"); + printf("\nSelected method%d\n", select_method); +#endif + } + } + } + OpenCL_method |= select_method; + } + + // カーネル関数を抽出する + if (OpenCL_kernel == NULL){ + wsprintfA(buf, "method%d", OpenCL_method & 31); + OpenCL_kernel = fn_clCreateKernel(program, buf, &ret); + if (ret != CL_SUCCESS) + return (ret << 8) | 22; +#ifdef DEBUG_OUTPUT + printf("CreateKernel : %s\n", buf); + ret = fn_clGetKernelWorkGroupInfo(OpenCL_kernel, selected_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &data_size, NULL); + if (ret == CL_SUCCESS) + printf("PREFERRED_WORK_GROUP_SIZE_MULTIPLE = %zu\n", data_size); +#endif + } // カーネルが実行できる work item 数を調べる - ret = fn_clGetKernelWorkGroupInfo(OpenCL_kernel, NULL, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &data_size, NULL); - if ((ret == CL_SUCCESS) && (data_size < 256)){ // 最低でも 256以上は必要 + ret = fn_clGetKernelWorkGroupInfo(OpenCL_kernel, selected_device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &data_size, NULL); + if ((ret == CL_SUCCESS) && (data_size < 256)){ // 最低でも 256 以上は必要 #ifdef DEBUG_OUTPUT printf("KERNEL_WORK_GROUP_SIZE = %zd\n", data_size); #endif @@ -558,6 +689,60 @@ int init_OpenCL(int unit_size, int *src_max) fn_clUnloadCompiler(); } + // work group 数が必要以上に多い場合は減らす + if (OpenCL_method & 4){ + // work item 一個が 32バイトずつ計算する、256個なら work group ごとに 8KB 担当する + data_size = unit_size / 8192; + } else if (OpenCL_method & 2){ + // work item 一個が 8バイトずつ計算する、256個なら work group ごとに 2KB 担当する + data_size = unit_size / 2048; + } else { + // work item 一個が 4バイトずつ計算する、256個なら work group ごとに 1KB 担当する + data_size = unit_size / 1024; + } + if (OpenCL_group_num > data_size){ + OpenCL_group_num = data_size; + printf("Number of work groups is reduced to %zd\n", OpenCL_group_num); + } + + // データへのアクセス方法をデバイスによって変える + if (OpenCL_method & 0x200000){ + OpenCL_method |= 32; + } else if ((OpenCL_method & 0x100000) == 0){ + if (unified_memory){ + OpenCL_method |= 32; // Integrated GPU なら CL_MEM_USE_HOST_PTR を使う + } else { // Discrete GPU でも Nvidia のは動作を変える + ret = fn_clGetDeviceInfo(selected_device, CL_DEVICE_VERSION, sizeof(buf), buf, NULL); + if (ret == CL_SUCCESS){ + if (strstr(buf, "CUDA") != NULL) + OpenCL_method |= 32; // NVIDIA GPU なら CL_MEM_USE_HOST_PTR を使う + } + } + } + + // 出力先は1ブロック分だけあればいい + // CL_MEM_ALLOC_HOST_PTRを使えばpinned memoryになるらしい + data_size = unit_size; + if (OpenCL_method & 8) + data_size *= 2; // 2ブロックずつ計算できるように、2倍確保しておく + OpenCL_dst = gfn_clCreateBuffer(OpenCL_context, CL_MEM_ALLOC_HOST_PTR, data_size, NULL, &ret); + if (ret != CL_SUCCESS) + return (ret << 8) | 13; +#ifdef DEBUG_OUTPUT + printf("dst buf : %zd KB (%zd Bytes), OK\n", data_size >> 10, data_size); +#endif + + // factor は最大個数分 (src_max個) + data_size = sizeof(unsigned short) * (*src_max); + if (OpenCL_method & 8) + data_size *= 2; // 2ブロックずつ計算できるように、2倍確保しておく + OpenCL_buf = gfn_clCreateBuffer(OpenCL_context, CL_MEM_READ_ONLY, data_size, NULL, &ret); + if (ret != CL_SUCCESS) + return (ret << 8) | 14; +#ifdef DEBUG_OUTPUT + printf("factor buf : %zd Bytes (%d factors), OK\n", data_size, (*src_max)); +#endif + // カーネル引数を指定する ret = gfn_clSetKernelArg(OpenCL_kernel, 1, sizeof(cl_mem), &OpenCL_dst); if (ret != CL_SUCCESS) @@ -565,13 +750,12 @@ int init_OpenCL(int unit_size, int *src_max) ret = gfn_clSetKernelArg(OpenCL_kernel, 2, sizeof(cl_mem), &OpenCL_buf); if (ret != CL_SUCCESS) return (ret << 8) | 102; - if (ret != CL_SUCCESS) - return (ret << 8) | 103; #ifdef DEBUG_OUTPUT // ワークアイテム数 printf("\nMax number of work items = %zd (256 * %zd)\n", OpenCL_group_num * 256, OpenCL_group_num); #endif + OpenCL_method &= 0xFF; // 最後に選択設定を消去する return 0; } @@ -683,7 +867,7 @@ void info_OpenCL(char *buf, int buf_size) // ソース・ブロックをデバイス側にコピーする int gpu_copy_blocks( unsigned char *data, // ブロックのバッファー (境界は 4096にすること) - int unit_size, // 4096の倍数にすること + unsigned int unit_size, // 4096の倍数にすること int src_num) // 何ブロックをコピーするのか { size_t data_size; @@ -692,7 +876,7 @@ int gpu_copy_blocks( // Integrated GPU と Discrete GPU の違いに関係なく、使う分だけ毎回メモリー領域を確保する data_size = (size_t)unit_size * src_num; - if (OpenCL_method & 8){ // AMD's APU や Integrated GPU なら ZeroCopy する + if (OpenCL_method & 32){ // AMD's APU や Integrated GPU なら ZeroCopy する // 実際に比較してみると GeForce GPU でもメモリー消費量が少なくてコピーが速い // NVIDIA GPU は CL_MEM_USE_HOST_PTR でも VRAM 上にキャッシュするので速いらしい flags = CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR; @@ -719,17 +903,31 @@ int gpu_copy_blocks( int gpu_multiply_blocks( int src_num, // Number of multiplying source blocks unsigned short *mat, // Matrix of numbers to multiply by + unsigned short *mat2, // Set to calculate 2 blocks at once unsigned char *buf, // Products go here - int len) // Byte length + unsigned int len) // Byte length { unsigned __int64 *vram, *src, *dst; size_t global_size, local_size; cl_int ret; // 倍率の配列をデバイス側に書き込む - ret = gfn_clEnqueueWriteBuffer(OpenCL_command, OpenCL_buf, CL_FALSE, 0, sizeof(short) * src_num, mat, 0, NULL, NULL); + if (mat2 == NULL){ // 1ブロック分だけコピーする + ret = gfn_clEnqueueWriteBuffer(OpenCL_command, OpenCL_buf, CL_FALSE, 0, sizeof(short) * src_num, mat, 0, NULL, NULL); + } else { // 2ブロックずつ計算する場合は、配列のサイズも2倍になる + if ((size_t)mat2 == 1){ // アドレスが 1 になることはあり得ないので、識別できる + ret = gfn_clEnqueueWriteBuffer(OpenCL_command, OpenCL_buf, CL_FALSE, 0, sizeof(short) * src_num * 2, mat, 0, NULL, NULL); + } else { // 2回コピーする + size_t data_size = sizeof(short) * src_num; + ret = gfn_clEnqueueWriteBuffer(OpenCL_command, OpenCL_buf, CL_FALSE, 0, data_size, mat, 0, NULL, NULL); + if (ret != CL_SUCCESS) + return (ret << 8) | 10; + // もう一つの配列は違う場所からコピーする + ret = gfn_clEnqueueWriteBuffer(OpenCL_command, OpenCL_buf, CL_FALSE, data_size, data_size, mat2, 0, NULL, NULL); + } + } if (ret != CL_SUCCESS) - return (ret << 8) | 10; + return (ret << 8) | 11; // 引数を指定する ret = gfn_clSetKernelArg(OpenCL_kernel, 3, sizeof(int), &src_num); @@ -737,17 +935,17 @@ int gpu_multiply_blocks( return (ret << 8) | 103; // カーネル並列実行 - local_size = 256; // テーブルやキャッシュのため、work item 数は 256に固定する + local_size = 256; // テーブルやキャッシュのため、work item 数は 256 に固定する global_size = OpenCL_group_num * 256; - //printf("group num = %d, global size = %d, local size = 256 \n", OpenCL_group_num, global_size); + //printf("group num = %d, global size = %d, local size = %d \n", OpenCL_group_num, global_size, local_size); ret = gfn_clEnqueueNDRangeKernel(OpenCL_command, OpenCL_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL); if (ret != CL_SUCCESS) - return (ret << 8) | 11; + return (ret << 8) | 12; // 出力内容をホスト側に反映させる vram = gfn_clEnqueueMapBuffer(OpenCL_command, OpenCL_dst, CL_TRUE, CL_MAP_READ, 0, len, 0, NULL, NULL, &ret); if (ret != CL_SUCCESS) - return (ret << 8) | 12; + return (ret << 8) | 13; // 8バイトごとに XOR する (SSE2 で XOR しても速くならず) src = vram; @@ -762,7 +960,7 @@ int gpu_multiply_blocks( // ホスト側でデータを変更しなくても、clEnqueueMapBufferと対で呼び出さないといけない ret = gfn_clEnqueueUnmapMemObject(OpenCL_command, OpenCL_dst, vram, 0, NULL, NULL); if (ret != CL_SUCCESS) - return (ret << 8) | 13; + return (ret << 8) | 14; return 0; } @@ -775,12 +973,12 @@ int gpu_finish(void) // 全ての処理が終わるのを待つ ret = gfn_clFinish(OpenCL_command); if (ret != CL_SUCCESS) - return (ret << 8) | 20; + return (ret << 8) | 30; if (OpenCL_src != NULL){ // 確保されてる場合は解除する ret = gfn_clReleaseMemObject(OpenCL_src); if (ret != CL_SUCCESS) - return (ret << 8) | 21; + return (ret << 8) | 31; OpenCL_src = NULL; } diff --git a/source/par2j/lib_opencl.h b/source/par2j/lib_opencl.h index ddde655..6da24b8 100644 --- a/source/par2j/lib_opencl.h +++ b/source/par2j/lib_opencl.h @@ -10,20 +10,21 @@ extern "C" { extern int OpenCL_method; -int init_OpenCL(int unit_size, int *src_max); +int init_OpenCL(unsigned int unit_size, int *src_max); int free_OpenCL(void); void info_OpenCL(char *buf, int buf_size); int gpu_copy_blocks( unsigned char *data, - int unit_size, + unsigned int unit_size, int src_num); int gpu_multiply_blocks( int src_num, // Number of multiplying source blocks unsigned short *mat, // Matrix of numbers to multiply by + unsigned short *mat2, // Set to calculate 2 blocks at once unsigned char *buf, // Products go here - int len); // Byte length + unsigned int len); // Byte length int gpu_finish(void); diff --git a/source/par2j/list.c b/source/par2j/list.c index cf826c7..7551ecc 100644 --- a/source/par2j/list.c +++ b/source/par2j/list.c @@ -1,5 +1,5 @@ // list.c -// Copyright : 2023-10-15 Yutaka Sawada +// Copyright : 2023-12-12 Yutaka Sawada // License : GPL #ifndef _UNICODE @@ -26,6 +26,11 @@ //#define TIMER // 実験用 +#ifdef TIMER +#include +static double time_sec, time_speed; +#endif + // recovery set のファイルのハッシュ値を調べる (空のファイルは除く) // 0x00 = ファイルが存在して完全である // 0x01 = ファイルが存在しない @@ -296,7 +301,7 @@ int check_file_complete( { int i, rv; #ifdef TIMER -unsigned int time_start = GetTickCount(); +clock_t time_start = clock(); #endif printf("\nVerifying Input File :\n"); @@ -332,14 +337,14 @@ unsigned int time_start = GetTickCount(); } #ifdef TIMER -time_start = GetTickCount() - time_start; -printf("\n hash %d.%03d sec", time_start / 1000, time_start % 1000); -if (time_start > 0){ - time_start = (int)((total_file_size * 125) / ((__int64)time_start * 131072)); - printf(", %d MB/s\n", time_start); +time_start = clock() - time_start; +time_sec = (double)time_start / CLOCKS_PER_SEC; +if (time_sec > 0){ + time_speed = (double)total_file_size / (time_sec * 1048576); } else { - printf("\n"); + time_speed = 0; } +printf("\n hash %.3f sec, %.0f MB/s\n", time_sec, time_speed); #endif return 0; } @@ -364,7 +369,7 @@ int check_file_complete_multi( HANDLE hSub[MAX_READ_NUM]; FILE_CHECK_TH th[MAX_READ_NUM]; #ifdef TIMER -unsigned int time_start = GetTickCount(); +clock_t time_start = clock(); #endif memset(hSub, 0, sizeof(HANDLE) * MAX_READ_NUM); @@ -630,14 +635,14 @@ unsigned int time_start = GetTickCount(); } #ifdef TIMER -time_start = GetTickCount() - time_start; -printf("\n hash %d.%03d sec", time_start / 1000, time_start % 1000); -if (time_start > 0){ - time_start = (int)((total_file_size * 125) / ((__int64)time_start * 131072)); - printf(", %d MB/s\n", time_start); +time_start = clock() - time_start; +time_sec = (double)time_start / CLOCKS_PER_SEC; +if (time_sec > 0){ + time_speed = (double)total_file_size / (time_sec * 1048576); } else { - printf("\n"); + time_speed = 0; } +printf("\n hash %.3f sec, %.0f MB/s\n", time_sec, time_speed); #endif error_end: diff --git a/source/par2j/md5_crc.c b/source/par2j/md5_crc.c index ea7e729..f1940ee 100644 --- a/source/par2j/md5_crc.c +++ b/source/par2j/md5_crc.c @@ -1,5 +1,5 @@ // md5_crc.c -// Copyright : 2023-10-29 Yutaka Sawada +// Copyright : 2023-12-12 Yutaka Sawada // License : GPL #ifndef _UNICODE @@ -21,7 +21,6 @@ #include "phmd5.h" #include "md5_crc.h" - /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ // バイト配列の MD5 ハッシュ値を求める @@ -200,8 +199,10 @@ int file_md5_crc32_block( //#define TIMER // 実験用 #ifdef TIMER -static unsigned int time_start, time1_start; -static unsigned int time_total = 0, time2_total = 0, time3_total = 0; +#include +static double time_sec, time_speed; +static clock_t time_start, time1_start; +static clock_t time_total = 0, time2_total = 0, time3_total = 0; #endif #define MAX_BUF_SIZE 2097152 // ヒープ領域を使う場合の最大サイズ @@ -224,7 +225,7 @@ int file_hash_crc( HANDLE hFile; OVERLAPPED ol; #ifdef TIMER -time1_start = GetTickCount(); +time1_start = clock(); #endif // ソース・ファイルを開く @@ -251,11 +252,11 @@ time1_start = GetTickCount(); if (file_left < IO_SIZE) read_size = (unsigned int)file_left; #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif off = ReadFile(hFile, buf1, read_size, NULL, &ol); #ifdef TIMER -time2_total += GetTickCount() - time_start; +time2_total += clock() - time_start; #endif if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ print_win32_err(); @@ -281,11 +282,11 @@ time2_total += GetTickCount() - time_start; ol.OffsetHigh = (unsigned int)(file_off >> 32); file_off += IO_SIZE; #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif off = ReadFile(hFile, buf, read_size, NULL, &ol); #ifdef TIMER -time2_total += GetTickCount() - time_start; +time2_total += clock() - time_start; #endif if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ print_win32_err(); @@ -301,7 +302,7 @@ time2_total += GetTickCount() - time_start; } #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif off = 0; // チェックサム計算 if (block_left > 0){ // 前回足りなかった分を追加する @@ -338,7 +339,7 @@ time_start = GetTickCount(); } } #ifdef TIMER -time3_total += GetTickCount() - time_start; +time3_total += clock() - time_start; #endif // 経過表示 @@ -369,16 +370,17 @@ error_end: CloseHandle(ol.hEvent); #ifdef TIMER -time_total += GetTickCount() - time1_start; +time_total += clock() - time1_start; if (*prog_now == total_file_size){ - printf("\nread %d.%03d sec\n", time2_total / 1000, time2_total % 1000); - printf("main %d.%03d sec\n", time3_total / 1000, time3_total % 1000); - if (time_total > 0){ - time_start = (int)((total_file_size * 125) / ((__int64)time_total * 131072)); + printf("\nread %.3f sec\n", (double)time2_total / CLOCKS_PER_SEC); + printf("main %.3f sec\n", (double)time3_total / CLOCKS_PER_SEC); + time_sec = (double)time_total / CLOCKS_PER_SEC; + if (time_sec > 0){ + time_speed = (double)total_file_size / (time_sec * 1048576); } else { - time_start = 0; + time_speed = 0; } - printf("total %d.%03d sec, %d MB/s\n", time_total / 1000, time_total % 1000, time_start); + printf("total %.3f sec, %.0f MB/s\n", time_sec, time_speed); } #endif return err; @@ -403,7 +405,7 @@ int file_hash_crc( HANDLE hFile; OVERLAPPED ol; #ifdef TIMER -time1_start = GetTickCount(); +time1_start = clock(); #endif // ソース・ファイルを開く @@ -442,11 +444,11 @@ error_retry_read: if (file_left < IO_SIZE) read_size = (unsigned int)file_left; #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif off = ReadFile(hFile, buf1, read_size, NULL, &ol); #ifdef TIMER -time2_total += GetTickCount() - time_start; +time2_total += clock() - time_start; #endif if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ print_win32_err(); @@ -536,11 +538,11 @@ error_retry_pause: ol.OffsetHigh = (unsigned int)(file_off >> 32); file_off += IO_SIZE; #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif off = ReadFile(hFile, buf, read_size, NULL, &ol); #ifdef TIMER -time2_total += GetTickCount() - time_start; +time2_total += clock() - time_start; #endif if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ print_win32_err(); @@ -557,7 +559,7 @@ time2_total += GetTickCount() - time_start; } #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif off = 0; // チェックサム計算 if (block_left > 0){ // 前回足りなかった分を追加する @@ -594,7 +596,7 @@ time_start = GetTickCount(); } } #ifdef TIMER -time3_total += GetTickCount() - time_start; +time3_total += clock() - time_start; #endif // 経過表示 @@ -625,16 +627,17 @@ error_end: CloseHandle(ol.hEvent); #ifdef TIMER -time_total += GetTickCount() - time1_start; +time_total += clock() - time1_start; if (*prog_now == total_file_size){ - printf("\nread %d.%03d sec\n", time2_total / 1000, time2_total % 1000); - printf("main %d.%03d sec\n", time3_total / 1000, time3_total % 1000); - if (time_total > 0){ - time_start = (int)((total_file_size * 125) / ((__int64)time_total * 131072)); + printf("\nread %.3f sec\n", (double)time2_total / CLOCKS_PER_SEC); + printf("main %.3f sec\n", (double)time3_total / CLOCKS_PER_SEC); + time_sec = (double)time_total / CLOCKS_PER_SEC; + if (time_sec > 0){ + time_speed = (double)total_file_size / (time_sec * 1048576); } else { - time_start = 0; + time_speed = 0; } - printf("total %d.%03d sec, %d MB/s\n", time_total / 1000, time_total % 1000, time_start); + printf("total %.3f sec, %.0f MB/s\n", time_sec, time_speed); } #endif return err; @@ -660,7 +663,7 @@ int file_hash_crc( HANDLE hFile; OVERLAPPED ol; #ifdef TIMER -time1_start = GetTickCount(); +time1_start = clock(); #endif // ソース・ファイルを開く @@ -699,11 +702,11 @@ time1_start = GetTickCount(); if (file_left < io_size) read_size = (unsigned int)file_left; #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif off = ReadFile(hFile, buf1, read_size, NULL, &ol); #ifdef TIMER -time2_total += GetTickCount() - time_start; +time2_total += clock() - time_start; #endif if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ print_win32_err(); @@ -729,11 +732,11 @@ time2_total += GetTickCount() - time_start; ol.OffsetHigh = (unsigned int)(file_off >> 32); file_off += io_size; #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif off = ReadFile(hFile, buf, read_size, NULL, &ol); #ifdef TIMER -time2_total += GetTickCount() - time_start; +time2_total += clock() - time_start; #endif if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ print_win32_err(); @@ -749,7 +752,7 @@ time2_total += GetTickCount() - time_start; } #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif off = 0; // チェックサム計算 if (block_left > 0){ // 前回足りなかった分を追加する @@ -786,7 +789,7 @@ time_start = GetTickCount(); } } #ifdef TIMER -time3_total += GetTickCount() - time_start; +time3_total += clock() - time_start; #endif // 経過表示 @@ -819,16 +822,17 @@ error_end: _aligned_free(buf1); #ifdef TIMER -time_total += GetTickCount() - time1_start; +time_total += clock() - time1_start; if (*prog_now == total_file_size){ - printf("\nread %d.%03d sec\n", time2_total / 1000, time2_total % 1000); - printf("main %d.%03d sec\n", time3_total / 1000, time3_total % 1000); - if (time_total > 0){ - time_start = (int)((total_file_size * 125) / ((__int64)time_total * 131072)); + printf("\nread %.3f sec\n", (double)time2_total / CLOCKS_PER_SEC); + printf("main %.3f sec\n", (double)time3_total / CLOCKS_PER_SEC); + time_sec = (double)time_total / CLOCKS_PER_SEC; + if (time_sec > 0){ + time_speed = (double)total_file_size / (time_sec * 1048576); } else { - time_start = 0; + time_speed = 0; } - printf("total %d.%03d sec, %d MB/s\n", time_total / 1000, time_total % 1000, time_start); + printf("total %.3f sec, %.0f MB/s\n", time_sec, time_speed); } #endif return err; @@ -1038,7 +1042,7 @@ int file_hash_check( PHMD5 hash_ctx, block_ctx; OVERLAPPED ol; #ifdef TIMER -time1_start = GetTickCount(); +time1_start = clock(); #endif prog_last = -1; // 検証中のファイル名を毎回表示する @@ -1062,11 +1066,11 @@ time1_start = GetTickCount(); file_left = file_size - 16384; // 本来のファイル・サイズまでしか検査しない } #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif off = ReadFile(hFile, buf, len, NULL, &ol); #ifdef TIMER -time2_total += GetTickCount() - time_start; +time2_total += clock() - time_start; #endif if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ print_win32_err(); @@ -1141,11 +1145,11 @@ time2_total += GetTickCount() - time_start; if (file_left < IO_SIZE) read_size = (unsigned int)file_left; #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif off = ReadFile(hFile, buf1, read_size, NULL, &ol); #ifdef TIMER -time2_total += GetTickCount() - time_start; +time2_total += clock() - time_start; #endif if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ print_win32_err(); @@ -1168,11 +1172,11 @@ time2_total += GetTickCount() - time_start; ol.OffsetHigh = (unsigned int)(file_off >> 32); file_off += IO_SIZE; #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif off = ReadFile(hFile, buf, read_size, NULL, &ol); #ifdef TIMER -time2_total += GetTickCount() - time_start; +time2_total += clock() - time_start; #endif if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ print_win32_err(); @@ -1187,7 +1191,7 @@ time2_total += GetTickCount() - time_start; } #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif if (s_blk != NULL){ off = 0; @@ -1230,7 +1234,7 @@ time_start = GetTickCount(); Phmd5Process(&hash_ctx, buf, len); // MD5 計算 } #ifdef TIMER -time3_total += GetTickCount() - time_start; +time3_total += clock() - time_start; #endif // 経過表示 @@ -1267,15 +1271,16 @@ error_end: CloseHandle(ol.hEvent); #ifdef TIMER -time_total += GetTickCount() - time1_start; - printf("\nread %d.%03d sec\n", time2_total / 1000, time2_total % 1000); - printf("main %d.%03d sec\n", time3_total / 1000, time3_total % 1000); - if (time_total > 0){ - time_start = (int)((file_size * 125) / ((__int64)time_total * 131072)); +time_total += clock() - time1_start; + printf("\nread %.3f sec\n", (double)time2_total / CLOCKS_PER_SEC); + printf("main %.3f sec\n", (double)time3_total / CLOCKS_PER_SEC); + time_sec = (double)time_total / CLOCKS_PER_SEC; + if (time_sec > 0){ + time_speed = (double)file_size / (time_sec * 1048576); } else { - time_start = 0; + time_speed = 0; } - printf("total %d.%03d sec, %d MB/s\n", time_total / 1000, time_total % 1000, time_start); + printf("total %.3f sec, %.0f MB/s\n", time_sec, time_speed); #endif return comp_num; } @@ -1536,7 +1541,7 @@ int file_hash_direct( HANDLE hFile; OVERLAPPED ol; #ifdef TIMER -time1_start = GetTickCount(); +time1_start = clock(); #endif prog_last = -1; // 検証中のファイル名を毎回表示する @@ -1592,11 +1597,11 @@ time1_start = GetTickCount(); file_left = file_size - 16384; // 本来のファイル・サイズまでしか検査しない } #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif off = ReadFile(hFile, buf, read_size, NULL, &ol); #ifdef TIMER -time2_total += GetTickCount() - time_start; +time2_total += clock() - time_start; #endif if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ comp_num = -1; @@ -1679,11 +1684,11 @@ time2_total += GetTickCount() - time_start; read_size = (read_size + 4095) & ~4095; // 4KB の倍数にする } #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif off = ReadFile(hFile, buf1, read_size, NULL, &ol); #ifdef TIMER -time2_total += GetTickCount() - time_start; +time2_total += clock() - time_start; #endif if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ print_win32_err(); @@ -1710,11 +1715,11 @@ time2_total += GetTickCount() - time_start; ol.OffsetHigh = (unsigned int)(file_off >> 32); file_off += IO_SIZE; #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif off = ReadFile(hFile, buf, read_size, NULL, &ol); #ifdef TIMER -time2_total += GetTickCount() - time_start; +time2_total += clock() - time_start; #endif if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){ print_win32_err(); @@ -1729,7 +1734,7 @@ time2_total += GetTickCount() - time_start; } #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif if (s_blk != NULL){ off = 0; @@ -1771,7 +1776,7 @@ time_start = GetTickCount(); Phmd5Process(&hash_ctx, buf, len); // MD5 計算 } #ifdef TIMER -time3_total += GetTickCount() - time_start; +time3_total += clock() - time_start; #endif // 経過表示 @@ -1812,10 +1817,16 @@ error_end: _aligned_free(buf1); #ifdef TIMER -time_total += GetTickCount() - time1_start; - printf("\nread %d.%03d sec\n", time2_total / 1000, time2_total % 1000); - printf("main %d.%03d sec\n", time3_total / 1000, time3_total % 1000); - printf("total %d.%03d sec\n", time_total / 1000, time_total % 1000); +time_total += clock() - time1_start; + printf("\nread %.3f sec\n", (double)time2_total / CLOCKS_PER_SEC); + printf("main %.3f sec\n", (double)time3_total / CLOCKS_PER_SEC); + time_sec = (double)time_total / CLOCKS_PER_SEC; + if (time_sec > 0){ + time_speed = (double)file_size / (time_sec * 1048576); + } else { + time_speed = 0; + } + printf("total %.3f sec, %.0f MB/s\n", time_sec, time_speed); #endif return comp_num; } diff --git a/source/par2j/par2_cmd.c b/source/par2j/par2_cmd.c index df433bc..37fdf07 100644 --- a/source/par2j/par2_cmd.c +++ b/source/par2j/par2_cmd.c @@ -1,5 +1,5 @@ // par2_cmd.c -// Copyright : 2023-10-15 Yutaka Sawada +// Copyright : 2023-12-09 Yutaka Sawada // License : GPL #ifndef _UNICODE @@ -1479,14 +1479,12 @@ ri= switch_set & 0x00040000 } else if (wcsncmp(tmp_p, L"lc", 2) == 0){ k = 0; j = 2; - while ((j < 2 + 5) && (tmp_p[j] >= '0') && (tmp_p[j] <= '9')){ + while ((j < 2 + 7) && (tmp_p[j] >= '0') && (tmp_p[j] <= '9')){ k = (k * 10) + (tmp_p[j] - '0'); j++; } - if (k & 256){ // GPU を使う - OpenCL_method = 1; // Faster GPU - } else if (k & 512){ - OpenCL_method = -1; // Slower GPU + if (k & 0x300){ // GPU を使う + OpenCL_method = k & 0x003F0300; } if (k & 1024) // CLMUL と ALTMAP を使わない cpu_flag = (cpu_flag & 0xFFFFFFF7) | 256; diff --git a/source/par2j/reedsolomon.c b/source/par2j/reedsolomon.c index 0a91b69..75b15be 100644 --- a/source/par2j/reedsolomon.c +++ b/source/par2j/reedsolomon.c @@ -1,5 +1,5 @@ // reedsolomon.c -// Copyright : 2023-10-26 Yutaka Sawada +// Copyright : 2023-12-12 Yutaka Sawada // License : GPL #ifndef _UNICODE @@ -27,6 +27,9 @@ #include "rs_decode.h" #include "reedsolomon.h" +#ifdef TIMER +#include +#endif // GPU を使う最小データサイズ (MB 単位) // GPU の起動には時間がかかるので、データが小さすぎると逆に遅くなる @@ -739,7 +742,7 @@ int rs_encode( int err = 0; unsigned int len; #ifdef TIMER -unsigned int time_total = GetTickCount(); +clock_t time_total = clock(); #endif if (galois_create_table()){ @@ -755,7 +758,7 @@ unsigned int time_total = GetTickCount(); // パリティ計算用の行列演算の準備をする len = sizeof(unsigned short) * source_num; if (OpenCL_method != 0) - len *= 2; // GPU の作業領域も確保しておく + len *= 3; // GPU の作業領域も確保しておく constant = malloc(len); if (constant == NULL){ printf("malloc, %d\n", len); @@ -799,8 +802,8 @@ unsigned int time_total = GetTickCount(); err = encode_method2(file_path, header_buf, rcv_hFile, files, s_blk, p_blk, constant); #ifdef TIMER if (err != 1){ - time_total = GetTickCount() - time_total; - printf("total %d.%03d sec\n", time_total / 1000, time_total % 1000); + time_total = clock() - time_total; + printf("total %.3f sec\n", (double)time_total / CLOCKS_PER_SEC); } #endif @@ -830,7 +833,7 @@ int rs_encode_1pass( int err = 0; unsigned int len; #ifdef TIMER -unsigned int time_total = GetTickCount(); +clock_t time_total = clock(); #endif if (galois_create_table()){ @@ -841,7 +844,7 @@ unsigned int time_total = GetTickCount(); // パリティ計算用の行列演算の準備をする len = sizeof(unsigned short) * source_num; if (OpenCL_method != 0) - len *= 2; // GPU の作業領域も確保しておく + len *= 3; // GPU の作業領域も確保しておく constant = malloc(len); if (constant == NULL){ printf("malloc, %d\n", len); @@ -888,8 +891,8 @@ unsigned int time_total = GetTickCount(); if (err < 0){ printf("switching to 2-pass processing, %d\n", err); } else if (err != 1){ - time_total = GetTickCount() - time_total; - printf("total %d.%03d sec\n", time_total / 1000, time_total % 1000); + time_total = clock() - time_total; + printf("total %.3f sec\n", (double)time_total / CLOCKS_PER_SEC); } #endif @@ -913,7 +916,7 @@ int rs_decode( int err = 0, i, j, k; unsigned int len; #ifdef TIMER -unsigned int time_matrix = 0, time_total = GetTickCount(); +clock_t time_matrix = 0, time_total = clock(); #endif if (galois_create_table()){ @@ -948,7 +951,7 @@ unsigned int time_matrix = 0, time_total = GetTickCount(); id = mat + (block_lost * source_num); #ifdef TIMER -time_matrix = GetTickCount(); +time_matrix = clock(); #endif // 復元用の行列を計算する print_progress_text(0, "Computing matrix"); @@ -989,7 +992,7 @@ time_matrix = GetTickCount(); //for (i = 0; i < block_lost; i++) // printf("id[%d] = %d\n", i, id[i]); #ifdef TIMER -time_matrix = GetTickCount() - time_matrix; +time_matrix = clock() - time_matrix; #endif #ifdef TIMER @@ -1032,9 +1035,9 @@ time_matrix = GetTickCount() - time_matrix; err = decode_method2(file_path, block_lost, rcv_hFile, files, s_blk, p_blk, mat); #ifdef TIMER if (err != 1){ - time_total = GetTickCount() - time_total; - printf("total %d.%03d sec\n", time_total / 1000, time_total % 1000); - printf("matrix %d.%03d sec\n", time_matrix / 1000, time_matrix % 1000); + time_total = clock() - time_total; + printf("total %.3f sec\n", (double)time_total / CLOCKS_PER_SEC); + printf("matrix %.3f sec\n", (double)time_matrix / CLOCKS_PER_SEC); } #endif diff --git a/source/par2j/rs_decode.c b/source/par2j/rs_decode.c index cf41a88..59dbc18 100644 --- a/source/par2j/rs_decode.c +++ b/source/par2j/rs_decode.c @@ -1,5 +1,5 @@ // rs_decode.c -// Copyright : 2023-11-27 Yutaka Sawada +// Copyright : 2023-12-13 Yutaka Sawada // License : GPL #ifndef _UNICODE @@ -28,7 +28,9 @@ #ifdef TIMER -static unsigned int time_start, time_read = 0, time_write = 0, time_calc = 0; +#include +static double time_sec, time_speed; +static clock_t time_start, time_read = 0, time_write = 0, time_calc = 0; static unsigned int read_count, write_count = 0, skip_count; #endif @@ -60,7 +62,7 @@ static DWORD WINAPI thread_decode2(LPVOID lpParameter) RS_TH *th; #ifdef TIMER unsigned int loop_count2a = 0, loop_count2b = 0; -unsigned int time_start2, time_encode2a = 0, time_encode2b = 0; +clock_t time_start2, time_encode2a = 0, time_encode2b = 0; #endif th = (RS_TH *)lpParameter; @@ -78,7 +80,7 @@ unsigned int time_start2, time_encode2a = 0, time_encode2b = 0; WaitForSingleObject(hRun, INFINITE); // 計算開始の合図を待つ while (th->now < INT_MAX / 2){ #ifdef TIMER -time_start2 = GetTickCount(); +time_start2 = clock(); #endif s_buf = th->buf; factor = th->mat; @@ -95,7 +97,7 @@ loop_count2a++; #endif } #ifdef TIMER -time_encode2a += GetTickCount() - time_start2; +time_encode2a += clock() - time_start2; #endif } else { // 消失ブロックを部分的に保持する場合 // スレッドごとに復元する消失ブロックの chunk を変える @@ -136,7 +138,7 @@ loop_count2b += src_num; #endif } #ifdef TIMER -time_encode2b += GetTickCount() - time_start2; +time_encode2b += clock() - time_start2; #endif } //_mm_sfence(); // メモリーへの書き込みを完了する @@ -146,19 +148,21 @@ time_encode2b += GetTickCount() - time_start2; #ifdef TIMER loop_count2b /= chunk_num; // chunk数で割ってブロック数にする printf("sub-thread : total loop = %d\n", loop_count2a + loop_count2b); -if (time_encode2a > 0){ - i = (int)((__int64)loop_count2a * unit_size * 125 / ((__int64)time_encode2a * 131072)); +time_sec = (double)time_encode2a / CLOCKS_PER_SEC; +if (time_sec > 0){ + time_speed = ((double)loop_count2a * unit_size) / (time_sec * 1048576); } else { - i = 0; + time_speed = 0; } if (loop_count2a > 0) - printf(" 1st decode %d.%03d sec, %d loop, %d MB/s\n", time_encode2a / 1000, time_encode2a % 1000, loop_count2a, i); -if (time_encode2b > 0){ - i = (int)((__int64)loop_count2b * unit_size * 125 / ((__int64)time_encode2b * 131072)); + printf(" 1st decode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2a, time_speed); +time_sec = (double)time_encode2b / CLOCKS_PER_SEC; +if (time_sec > 0){ + time_speed = ((double)loop_count2b * unit_size) / (time_sec * 1048576); } else { - i = 0; + time_speed = 0; } -printf(" 2nd decode %d.%03d sec, %d loop, %d MB/s\n", time_encode2b / 1000, time_encode2b % 1000, loop_count2b, i); +printf(" 2nd decode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2b, time_speed); #endif // 終了処理 @@ -178,7 +182,7 @@ static DWORD WINAPI thread_decode3(LPVOID lpParameter) RS_TH *th; #ifdef TIMER unsigned int loop_count2a = 0, loop_count2b = 0; -unsigned int time_start2, time_encode2a = 0, time_encode2b = 0; +clock_t time_start2, time_encode2a = 0, time_encode2b = 0; #endif th = (RS_TH *)lpParameter; @@ -197,7 +201,7 @@ unsigned int time_start2, time_encode2a = 0, time_encode2b = 0; WaitForSingleObject(hRun, INFINITE); // 計算開始の合図を待つ while (th->now < INT_MAX / 2){ #ifdef TIMER -time_start2 = GetTickCount(); +time_start2 = clock(); #endif s_buf = th->buf; factor = th->mat; @@ -214,7 +218,7 @@ loop_count2a++; #endif } #ifdef TIMER -time_encode2a += GetTickCount() - time_start2; +time_encode2a += clock() - time_start2; #endif } else { // 全ての消失ブロックを保持する場合 // スレッドごとに復元する消失ブロックの chunk を変える @@ -250,7 +254,7 @@ loop_count2b += src_num; #endif } #ifdef TIMER -time_encode2b += GetTickCount() - time_start2; +time_encode2b += clock() - time_start2; #endif } //_mm_sfence(); // メモリーへの書き込みを完了する @@ -260,19 +264,21 @@ time_encode2b += GetTickCount() - time_start2; #ifdef TIMER loop_count2b /= chunk_num; // chunk数で割ってブロック数にする printf("sub-thread : total loop = %d\n", loop_count2a + loop_count2b); -if (time_encode2a > 0){ - i = (int)((__int64)loop_count2a * unit_size * 125 / ((__int64)time_encode2a * 131072)); +time_sec = (double)time_encode2a / CLOCKS_PER_SEC; +if (time_sec > 0){ + time_speed = ((double)loop_count2a * unit_size) / (time_sec * 1048576); } else { - i = 0; + time_speed = 0; } if (loop_count2a > 0) - printf(" 1st decode %d.%03d sec, %d loop, %d MB/s\n", time_encode2a / 1000, time_encode2a % 1000, loop_count2a, i); -if (time_encode2b > 0){ - i = (int)((__int64)loop_count2b * unit_size * 125 / ((__int64)time_encode2b * 131072)); + printf(" 1st decode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2a, time_speed); +time_sec = (double)time_encode2b / CLOCKS_PER_SEC; +if (time_sec > 0){ + time_speed = ((double)loop_count2b * unit_size) / (time_sec * 1048576); } else { - i = 0; + time_speed = 0; } -printf(" 2nd decode %d.%03d sec, %d loop, %d MB/s\n", time_encode2b / 1000, time_encode2b % 1000, loop_count2b, i); +printf(" 2nd decode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2b, time_speed); #endif // 終了処理 @@ -292,7 +298,8 @@ static DWORD WINAPI thread_decode_gpu(LPVOID lpParameter) HANDLE hRun, hEnd; RS_TH *th; #ifdef TIMER -unsigned int time_start2, time_encode2 = 0, loop_count2 = 0; +unsigned int loop_count2 = 0; +clock_t time_start2, time_encode2 = 0; #endif th = (RS_TH *)lpParameter; @@ -307,7 +314,7 @@ unsigned int time_start2, time_encode2 = 0, loop_count2 = 0; WaitForSingleObject(hRun, INFINITE); // 計算開始の合図を待つ while (th->now < INT_MAX / 2){ #ifdef TIMER -time_start2 = GetTickCount(); +time_start2 = clock(); #endif // GPUはソース・ブロック読み込み中に呼ばれない s_buf = th->buf; @@ -321,22 +328,58 @@ time_start2 = GetTickCount(); InterlockedExchange(&(th->now), INT_MAX / 3); // サブ・スレッドの計算を中断する } - // スレッドごとに復元する消失ブロックを変える - while ((j = InterlockedIncrement(&(th->now))) < block_lost){ // j = ++th_now - // 倍率は逆行列から部分的にコピーする - i = gpu_multiply_blocks(src_num, factor + source_num * j, g_buf + (size_t)unit_size * j, unit_size); - if (i != 0){ - th->len = i; - InterlockedExchange(&(th->now), INT_MAX / 3); // サブ・スレッドの計算を中断する - break; - } + // 一つの GPUスレッドが全ての消失ブロックを処理する + if (OpenCL_method & 8){ // 2ブロックずつ計算する + // 消失ブロック数が奇数なら、最初の一個だけ別に計算する + if (block_lost & 1){ + InterlockedIncrement(&(th->now)); // 常に j = 0 となる + + // 倍率は逆行列から部分的にコピーする + i = gpu_multiply_blocks(src_num, factor, NULL, g_buf, unit_size); + if (i != 0){ + th->len = i; + InterlockedExchange(&(th->now), INT_MAX / 3); // サブ・スレッドの計算を中断する + break; + } #ifdef TIMER loop_count2 += src_num; #endif + } + + // 残りのブロックは二個ずつ計算する + while ((j = InterlockedAdd(&(th->now), 2)) < block_lost){ // th_now += 2, j = th_now + j--; // +2 してるから、最初のブロックは -1 する + + // 倍率は逆行列から部分的に2回コピーする + i = gpu_multiply_blocks(src_num, factor + source_num * j, factor + source_num * (j + 1), g_buf + (size_t)unit_size * j, unit_size * 2); + if (i != 0){ + th->len = i; + InterlockedExchange(&(th->now), INT_MAX / 3); // サブ・スレッドの計算を中断する + break; + } +#ifdef TIMER +loop_count2 += src_num * 2; +#endif + } + + } else { // 以前からの1ブロックずつ計算する方式 + while ((j = InterlockedIncrement(&(th->now))) < block_lost){ // j = ++th_now + // 倍率は逆行列から部分的にコピーする(2ブロックずつの場合はブロック数をマイナスにする) + i = gpu_multiply_blocks(src_num, factor + source_num * j, NULL, g_buf + (size_t)unit_size * j, unit_size); + if (i != 0){ + th->len = i; + InterlockedExchange(&(th->now), INT_MAX / 3); // サブ・スレッドの計算を中断する + break; + } + +#ifdef TIMER +loop_count2 += src_num; +#endif + } } #ifdef TIMER -time_encode2 += GetTickCount() - time_start2; +time_encode2 += clock() - time_start2; #endif // 最後にVRAMを解放する i = gpu_finish(); @@ -349,12 +392,13 @@ time_encode2 += GetTickCount() - time_start2; } #ifdef TIMER printf("gpu-thread :\n"); -if (time_encode2 > 0){ - i = (int)((__int64)loop_count2 * unit_size * 125 / ((__int64)time_encode2 * 131072)); +time_sec = (double)time_encode2 / CLOCKS_PER_SEC; +if (time_sec > 0){ + time_speed = ((double)loop_count2 * unit_size) / (time_sec * 1048576); } else { - i = 0; + time_speed = 0; } -printf(" 2nd decode %d.%03d sec, %d loop, %d MB/s\n", time_encode2 / 1000, time_encode2 % 1000, loop_count2, i); +printf(" 2nd decode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2, time_speed); #endif // 終了処理 @@ -430,7 +474,7 @@ int decode_method1( // ソース・ブロックが一個だけの場合 block_off = 0; while (block_off < block_size){ #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif // パリティ・ブロックを読み込む len = block_size - block_off; @@ -447,18 +491,18 @@ time_start = GetTickCount(); // パリティ・ブロックのチェックサムを計算する checksum16_altmap(buf, buf + io_size, io_size); #ifdef TIMER -time_read += GetTickCount() - time_start; +time_read += clock() - time_start; #endif #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif // 失われたソース・ブロックを復元する memset(work_buf, 0, unit_size); // factor で割ると元に戻る galois_align_multiply(buf, work_buf, unit_size, galois_divide(1, galois_power(2, id))); #ifdef TIMER -time_calc += GetTickCount() - time_start; +time_calc += clock() - time_start; #endif // 経過表示 @@ -472,7 +516,7 @@ time_calc += GetTickCount() - time_start; } #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif // 復元されたソース・ブロックのチェックサムを検証する checksum16_return(work_buf, hash, io_size); @@ -491,7 +535,7 @@ time_start = GetTickCount(); goto error_end; } #ifdef TIMER -time_write += GetTickCount() - time_start; +time_write += clock() - time_start; #endif block_off += io_size; @@ -499,9 +543,9 @@ time_write += GetTickCount() - time_start; print_progress_done(); // 末尾ブロックの断片化によっては 100% で完了するとは限らない #ifdef TIMER -printf("read %d.%03d sec\n", time_read / 1000, time_read % 1000); -printf("write %d.%03d sec\n", time_write / 1000, time_write % 1000); -printf("decode %d.%03d sec\n", time_calc / 1000, time_calc % 1000); +printf("read %.3f sec\n", (double)time_read / CLOCKS_PER_SEC); +printf("write %.3f sec\n", (double)time_write / CLOCKS_PER_SEC); +printf("decode %.3f sec\n", (double)time_calc / CLOCKS_PER_SEC); #endif error_end: @@ -623,7 +667,7 @@ int decode_method2( // ソース・データを全て読み込む場合 #ifdef TIMER read_count = 0; skip_count = 0; -time_start = GetTickCount(); +time_start = clock(); #endif last_file = -1; recv_now = 0; // 何番目の代替ブロックか @@ -760,7 +804,7 @@ skip_count++; hFile = NULL; } #ifdef TIMER -time_read += GetTickCount() - time_start; +time_read += clock() - time_start; #endif WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE); // サブ・スレッドの計算終了の合図を待つ @@ -845,7 +889,7 @@ skip_count++; } #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif // 復元されたブロックを書き込む work_buf = p_buf; @@ -916,7 +960,7 @@ write_count++; } } #ifdef TIMER -time_write += GetTickCount() - time_start; +time_write += clock() - time_start; #endif part_off += part_num; // 次の消失ブロック位置にする @@ -930,9 +974,9 @@ time_write += GetTickCount() - time_start; print_progress_done(); #ifdef TIMER -printf("read %d.%03d sec\n", time_read / 1000, time_read % 1000); +printf("read %.3f sec\n", (double)time_read / CLOCKS_PER_SEC); j = ((block_size + io_size - 1) / io_size) * block_lost; -printf("write %d.%03d sec, count = %d/%d\n", time_write / 1000, time_write % 1000, write_count, j); +printf("write %.3f sec, count = %d/%d\n", (double)time_write / CLOCKS_PER_SEC, write_count, j); if (prog_num != prog_base) printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base); #endif @@ -1063,7 +1107,7 @@ int decode_method3( // 復元するブロックを全て保持できる場合 #ifdef TIMER read_count = 0; -time_start = GetTickCount(); +time_start = clock(); #endif last_file = -1; for (i = 0; i < read_num; i++){ // スライスを一個ずつ読み込んでメモリー上に配置していく @@ -1173,7 +1217,7 @@ read_count++; hFile = NULL; } #ifdef TIMER -time_read += GetTickCount() - time_start; +time_read += clock() - time_start; #endif WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE); // サブ・スレッドの計算終了の合図を待つ @@ -1238,7 +1282,7 @@ time_read += GetTickCount() - time_start; } #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif // 復元されたブロックを書き込む work_buf = p_buf; @@ -1297,7 +1341,7 @@ time_start = GetTickCount(); } } #ifdef TIMER -time_write += GetTickCount() - time_start; +time_write += clock() - time_start; #endif // 最後の書き込みファイルを閉じる CloseHandle(hFile); @@ -1305,8 +1349,8 @@ time_write += GetTickCount() - time_start; print_progress_done(); #ifdef TIMER -printf("read %d.%03d sec\n", time_read / 1000, time_read % 1000); -printf("write %d.%03d sec\n", time_write / 1000, time_write % 1000); +printf("read %.3f sec\n", (double)time_read / CLOCKS_PER_SEC); +printf("write %.3f sec\n", (double)time_write / CLOCKS_PER_SEC); if (prog_num != prog_base) printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base); #endif @@ -1463,7 +1507,7 @@ int decode_method4( // 全てのブロックを断片的に保持する場合 (G #ifdef TIMER read_count = 0; skip_count = 0; -time_start = GetTickCount(); +time_start = clock(); #endif last_file = -1; recv_now = 0; // 何番目の代替ブロックか @@ -1600,7 +1644,7 @@ skip_count++; hFile = NULL; } #ifdef TIMER -time_read += GetTickCount() - time_start; +time_read += clock() - time_start; #endif memset(g_buf, 0, (size_t)unit_size * block_lost); // 待機中に GPU用の領域をゼロ埋めしておく @@ -1845,7 +1889,7 @@ skip_count++; prog_num += th->size * block_lost; #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif // 復元されたブロックを書き込む work_buf = p_buf; @@ -1918,7 +1962,7 @@ write_count++; } } #ifdef TIMER -time_write += GetTickCount() - time_start; +time_write += clock() - time_start; #endif block_off += io_size; @@ -1929,9 +1973,9 @@ time_write += GetTickCount() - time_start; print_progress_done(); #ifdef TIMER -printf("read %d.%03d sec\n", time_read / 1000, time_read % 1000); +printf("read %.3f sec\n", (double)time_read / CLOCKS_PER_SEC); j = ((block_size + io_size - 1) / io_size) * block_lost; -printf("write %d.%03d sec, count = %d/%d\n", time_write / 1000, time_write % 1000, write_count, j); +printf("write %.3f sec, count = %d/%d\n", (double)time_write / CLOCKS_PER_SEC, write_count, j); if (prog_num != prog_base) printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base); #endif @@ -2096,7 +2140,7 @@ int decode_method5( // 復元するブロックだけ保持する場合 (GPU対 #ifdef TIMER read_count = 0; -time_start = GetTickCount(); +time_start = clock(); #endif last_file = -1; for (i = 0; i < read_num; i++){ // スライスを一個ずつ読み込んでメモリー上に配置していく @@ -2206,7 +2250,7 @@ read_count++; hFile = NULL; } #ifdef TIMER -time_read += GetTickCount() - time_start; +time_read += clock() - time_start; #endif if (source_off == 0) @@ -2446,7 +2490,7 @@ time_read += GetTickCount() - time_start; } #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif // 復元されたブロックを書き込む work_buf = p_buf; @@ -2507,7 +2551,7 @@ time_start = GetTickCount(); } } #ifdef TIMER -time_write += GetTickCount() - time_start; +time_write += clock() - time_start; #endif // 最後の書き込みファイルを閉じる CloseHandle(hFile); @@ -2515,8 +2559,8 @@ time_write += GetTickCount() - time_start; print_progress_done(); #ifdef TIMER -printf("read %d.%03d sec\n", time_read / 1000, time_read % 1000); -printf("write %d.%03d sec\n", time_write / 1000, time_write % 1000); +printf("read %.3f sec\n", (double)time_read / CLOCKS_PER_SEC); +printf("write %.3f sec\n", (double)time_write / CLOCKS_PER_SEC); if (prog_num != prog_base) printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base); #endif diff --git a/source/par2j/rs_encode.c b/source/par2j/rs_encode.c index 969a4a0..63acf9c 100644 --- a/source/par2j/rs_encode.c +++ b/source/par2j/rs_encode.c @@ -1,5 +1,5 @@ // rs_encode.c -// Copyright : 2023-11-25 Yutaka Sawada +// Copyright : 2023-12-18 Yutaka Sawada // License : GPL #ifndef _UNICODE @@ -29,7 +29,9 @@ #ifdef TIMER -static unsigned int time_start, time_read = 0, time_write = 0, time_calc = 0; +#include +static double time_sec, time_speed; +static clock_t time_start, time_read = 0, time_write = 0, time_calc = 0; static unsigned int read_count, skip_count; #endif @@ -61,7 +63,7 @@ static DWORD WINAPI thread_encode2(LPVOID lpParameter) RS_TH *th; #ifdef TIMER unsigned int loop_count2a = 0, loop_count2b = 0; -unsigned int time_start2, time_encode2a = 0, time_encode2b = 0; +clock_t time_start2, time_encode2a = 0, time_encode2b = 0; #endif th = (RS_TH *)lpParameter; @@ -80,7 +82,7 @@ unsigned int time_start2, time_encode2a = 0, time_encode2b = 0; WaitForSingleObject(hRun, INFINITE); // 計算開始の合図を待つ while (th->now < INT_MAX / 2){ #ifdef TIMER -time_start2 = GetTickCount(); +time_start2 = clock(); #endif s_buf = th->buf; src_off = th->off; // ソース・ブロック番号 @@ -98,7 +100,7 @@ loop_count2a++; } #ifdef TIMER -time_encode2a += GetTickCount() - time_start2; +time_encode2a += clock() - time_start2; #endif } else { // パリティ・ブロックを部分的に保持する場合 // スレッドごとに作成するパリティ・ブロックの chunk を変える @@ -143,7 +145,7 @@ loop_count2b += src_num; #endif } #ifdef TIMER -time_encode2b += GetTickCount() - time_start2; +time_encode2b += clock() - time_start2; #endif } //_mm_sfence(); // メモリーへの書き込みを完了する @@ -153,19 +155,21 @@ time_encode2b += GetTickCount() - time_start2; #ifdef TIMER loop_count2b /= chunk_num; // chunk数で割ってブロック数にする printf("sub-thread : total loop = %d\n", loop_count2a + loop_count2b); -if (time_encode2a > 0){ - i = (int)((__int64)loop_count2a * unit_size * 125 / ((__int64)time_encode2a * 131072)); +time_sec = (double)time_encode2a / CLOCKS_PER_SEC; +if (time_sec > 0){ + time_speed = ((double)loop_count2a * unit_size) / (time_sec * 1048576); } else { - i = 0; + time_speed = 0; } if (loop_count2a > 0) - printf(" 1st encode %d.%03d sec, %d loop, %d MB/s\n", time_encode2a / 1000, time_encode2a % 1000, loop_count2a, i); -if (time_encode2b > 0){ - i = (int)((__int64)loop_count2b * unit_size * 125 / ((__int64)time_encode2b * 131072)); + printf(" 1st encode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2a, time_speed); +time_sec = (double)time_encode2b / CLOCKS_PER_SEC; +if (time_sec > 0){ + time_speed = ((double)loop_count2b * unit_size) / (time_sec * 1048576); } else { - i = 0; + time_speed = 0; } -printf(" 2nd encode %d.%03d sec, %d loop, %d MB/s\n", time_encode2b / 1000, time_encode2b % 1000, loop_count2b, i); +printf(" 2nd encode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2b, time_speed); #endif // 終了処理 @@ -185,7 +189,7 @@ static DWORD WINAPI thread_encode3(LPVOID lpParameter) RS_TH *th; #ifdef TIMER unsigned int loop_count2a = 0, loop_count2b = 0; -unsigned int time_start2, time_encode2a = 0, time_encode2b = 0; +clock_t time_start2, time_encode2a = 0, time_encode2b = 0; #endif th = (RS_TH *)lpParameter; @@ -204,7 +208,7 @@ unsigned int time_start2, time_encode2a = 0, time_encode2b = 0; WaitForSingleObject(hRun, INFINITE); // 計算開始の合図を待つ while (th->now < INT_MAX / 2){ #ifdef TIMER -time_start2 = GetTickCount(); +time_start2 = clock(); #endif s_buf = th->buf; src_off = th->off; // ソース・ブロック番号 @@ -221,7 +225,7 @@ loop_count2a++; #endif } #ifdef TIMER -time_encode2a += GetTickCount() - time_start2; +time_encode2a += clock() - time_start2; #endif } else { // 全てのパリティ・ブロックを保持する場合 // スレッドごとに作成するパリティ・ブロックの chunk を変える @@ -261,7 +265,7 @@ loop_count2b += src_num; #endif } #ifdef TIMER -time_encode2b += GetTickCount() - time_start2; +time_encode2b += clock() - time_start2; #endif } //_mm_sfence(); // メモリーへの書き込みを完了する @@ -271,19 +275,21 @@ time_encode2b += GetTickCount() - time_start2; #ifdef TIMER loop_count2b /= chunk_num; // chunk数で割ってブロック数にする printf("sub-thread : total loop = %d\n", loop_count2a + loop_count2b); -if (time_encode2a > 0){ - i = (int)((__int64)loop_count2a * unit_size * 125 / ((__int64)time_encode2a * 131072)); +time_sec = (double)time_encode2a / CLOCKS_PER_SEC; +if (time_sec > 0){ + time_speed = ((double)loop_count2a * unit_size) / (time_sec * 1048576); } else { - i = 0; + time_speed = 0; } if (loop_count2a > 0) - printf(" 1st encode %d.%03d sec, %d loop, %d MB/s\n", time_encode2a / 1000, time_encode2a % 1000, loop_count2a, i); -if (time_encode2b > 0){ - i = (int)((__int64)loop_count2b * unit_size * 125 / ((__int64)time_encode2b * 131072)); + printf(" 1st encode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2a, time_speed); +time_sec = (double)time_encode2b / CLOCKS_PER_SEC; +if (time_sec > 0){ + time_speed = ((double)loop_count2b * unit_size) / (time_sec * 1048576); } else { - i = 0; + time_speed = 0; } -printf(" 2nd encode %d.%03d sec, %d loop, %d MB/s\n", time_encode2b / 1000, time_encode2b % 1000, loop_count2b, i); +printf(" 2nd encode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2b, time_speed); #endif // 終了処理 @@ -303,7 +309,8 @@ static DWORD WINAPI thread_encode_gpu(LPVOID lpParameter) HANDLE hRun, hEnd; RS_TH *th; #ifdef TIMER -unsigned int time_start2, time_encode2 = 0, loop_count2 = 0; +unsigned int loop_count2 = 0; +clock_t time_start2, time_encode2 = 0; #endif th = (RS_TH *)lpParameter; @@ -320,7 +327,7 @@ unsigned int time_start2, time_encode2 = 0, loop_count2 = 0; WaitForSingleObject(hRun, INFINITE); // 計算開始の合図を待つ while (th->now < INT_MAX / 2){ #ifdef TIMER -time_start2 = GetTickCount(); +time_start2 = clock(); #endif // GPUはソース・ブロック読み込み中に呼ばれない s_buf = th->buf; @@ -335,24 +342,71 @@ time_start2 = GetTickCount(); } // 一つの GPUスレッドが全てのパリティ・ブロックを処理する - while ((j = InterlockedIncrement(&(th->now))) < parity_num){ // j = ++th_now - // factor は定数行列の乗数になる - for (i = 0; i < src_num; i++) - factor[i] = galois_power(constant[src_off + i], first_num + j); + if (OpenCL_method & 8){ // 2ブロックずつ計算する + // パリティ・ブロック数が奇数なら、最初の一個だけ別に計算する + if (parity_num & 1){ + InterlockedIncrement(&(th->now)); // 常に j = 0 となる - // VRAM上のソース・ブロックごとにパリティを追加していく - i = gpu_multiply_blocks(src_num, factor, g_buf + (size_t)unit_size * j, unit_size); - if (i != 0){ - th->len = i; - InterlockedExchange(&(th->now), INT_MAX / 3); // サブ・スレッドの計算を中断する - break; - } + // factor は定数行列の乗数になる + for (i = 0; i < src_num; i++) + factor[i] = galois_power(constant[src_off + i], first_num); + + // VRAM上のソース・ブロックごとにパリティを追加していく + i = gpu_multiply_blocks(src_num, factor, NULL, g_buf, unit_size); + if (i != 0){ + th->len = i; + InterlockedExchange(&(th->now), INT_MAX / 3); // サブ・スレッドの計算を中断する + break; + } #ifdef TIMER loop_count2 += src_num; #endif - } + } + + // 残りのブロックは二個ずつ計算する + while ((j = InterlockedAdd(&(th->now), 2)) < parity_num){ // th_now += 2, j = th_now + j--; // +2 してるから、最初のブロックは -1 する + + // factor は定数行列の乗数になる + for (i = 0; i < src_num; i++){ + int c = constant[src_off + i]; // 同じ定数だけど、何乗するかが異なる + factor[i] = galois_power(c, first_num + j); + factor[src_num + i] = galois_power(c, first_num + j + 1); + } + + // VRAM上のソース・ブロックごとにパリティを追加していく + i = gpu_multiply_blocks(src_num, factor, (void *)1, g_buf + (size_t)unit_size * j, unit_size * 2); + if (i != 0){ + th->len = i; + InterlockedExchange(&(th->now), INT_MAX / 3); // サブ・スレッドの計算を中断する + break; + } #ifdef TIMER -time_encode2 += GetTickCount() - time_start2; +loop_count2 += src_num * 2; +#endif + } + + } else { // 以前からの1ブロックずつ計算する方式 + while ((j = InterlockedIncrement(&(th->now))) < parity_num){ // j = ++th_now + // factor は定数行列の乗数になる + for (i = 0; i < src_num; i++) + factor[i] = galois_power(constant[src_off + i], first_num + j); + + // VRAM上のソース・ブロックごとにパリティを追加していく + i = gpu_multiply_blocks(src_num, factor, NULL, g_buf + (size_t)unit_size * j, unit_size); + if (i != 0){ + th->len = i; + InterlockedExchange(&(th->now), INT_MAX / 3); // サブ・スレッドの計算を中断する + break; + } +#ifdef TIMER +loop_count2 += src_num; +#endif + } + } + +#ifdef TIMER +time_encode2 += clock() - time_start2; #endif // 最後にVRAMを解放する i = gpu_finish(); @@ -365,12 +419,13 @@ time_encode2 += GetTickCount() - time_start2; } #ifdef TIMER printf("gpu-thread :\n"); -if (time_encode2 > 0){ - i = (int)((__int64)loop_count2 * unit_size * 125 / ((__int64)time_encode2 * 131072)); +time_sec = (double)time_encode2 / CLOCKS_PER_SEC; +if (time_sec > 0){ + time_speed = ((double)loop_count2 * unit_size) / (time_sec * 1048576); } else { - i = 0; + time_speed = 0; } -printf(" 2nd encode %d.%03d sec, %d loop, %d MB/s\n", time_encode2 / 1000, time_encode2 % 1000, loop_count2, i); +printf(" 2nd encode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2, time_speed); #endif // 終了処理 @@ -452,7 +507,7 @@ int encode_method1( // ソース・ブロックが一個だけの場合 block_off = 0; while (block_off < block_size){ #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif // ソース・ブロックを読み込む len = s_blk[0].size - block_off; @@ -469,7 +524,7 @@ time_start = GetTickCount(); s_blk[0].crc = crc_update(s_blk[0].crc, buf, len); // without pad checksum16_altmap(buf, buf + io_size, io_size); #ifdef TIMER -time_read += GetTickCount() - time_start; +time_read += clock() - time_start; #endif // リカバリ・ファイルに書き込むサイズ @@ -482,13 +537,13 @@ time_read += GetTickCount() - time_start; // パリティ・ブロックごとに for (i = 0; i < parity_num; i++){ #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif memset(work_buf, 0, unit_size); // factor は 2の乗数になる galois_align_multiply(buf, work_buf, unit_size, galois_power(2, first_num + i)); #ifdef TIMER -time_calc += GetTickCount() - time_start; +time_calc += clock() - time_start; #endif // 経過表示 @@ -502,7 +557,7 @@ time_calc += GetTickCount() - time_start; } #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif // パリティ・ブロックのチェックサムを検証する checksum16_return(work_buf, hash, io_size); @@ -535,7 +590,7 @@ time_start = GetTickCount(); goto error_end; } #ifdef TIMER -time_write += GetTickCount() - time_start; +time_write += clock() - time_start; #endif } @@ -565,7 +620,7 @@ time_write += GetTickCount() - time_start; } #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif // 最後に Recovery Slice packet のヘッダーを書き込む for (i = 0; i < parity_num; i++){ @@ -581,14 +636,14 @@ time_start = GetTickCount(); } } #ifdef TIMER -time_write += GetTickCount() - time_start; +time_write += clock() - time_start; #endif } #ifdef TIMER -printf("read %d.%03d sec\n", time_read / 1000, time_read % 1000); -printf("write %d.%03d sec\n", time_write / 1000, time_write % 1000); -printf("encode %d.%03d sec\n", time_calc / 1000, time_calc % 1000); +printf("read %.3f sec\n", (double)time_read / CLOCKS_PER_SEC); +printf("write %.3f sec\n", (double)time_write / CLOCKS_PER_SEC); +printf("encode %.3f sec\n", (double)time_calc / CLOCKS_PER_SEC); #endif error_end: @@ -729,7 +784,7 @@ int encode_method2( // ソース・データを全て読み込む場合 #ifdef TIMER read_count = 0; skip_count = 0; -time_start = GetTickCount(); +time_start = clock(); #endif last_file = -1; for (i = 0; i < source_num; i++){ @@ -830,7 +885,7 @@ skip_count++; CloseHandle(hFile); hFile = NULL; #ifdef TIMER -time_read += GetTickCount() - time_start; +time_read += clock() - time_start; #endif WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE); // サブ・スレッドの計算終了の合図を待つ @@ -930,7 +985,7 @@ skip_count++; } #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif // パリティ・ブロックを書き込む work_buf = p_buf; @@ -979,7 +1034,7 @@ time_start = GetTickCount(); } } #ifdef TIMER -time_write += GetTickCount() - time_start; +time_write += clock() - time_start; #endif part_off += part_num; // 次のパリティ位置にする @@ -1025,7 +1080,7 @@ time_write += GetTickCount() - time_start; } #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif // 最後に Recovery Slice packet のヘッダーを書き込む for (i = 0; i < parity_num; i++){ @@ -1041,13 +1096,13 @@ time_start = GetTickCount(); } } #ifdef TIMER -time_write += GetTickCount() - time_start; +time_write += clock() - time_start; #endif } #ifdef TIMER -printf("read %d.%03d sec\n", time_read / 1000, time_read % 1000); -printf("write %d.%03d sec\n", time_write / 1000, time_write % 1000); +printf("read %.3f sec\n", (double)time_read / CLOCKS_PER_SEC); +printf("write %.3f sec\n", (double)time_write / CLOCKS_PER_SEC); if (prog_num != prog_base) printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base); #endif @@ -1186,7 +1241,7 @@ int encode_method3( // パリティ・ブロックを全て保持して、一度 src_off = source_off - 1; // まだ計算して無い印 #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif for (i = 0; i < read_num; i++){ // スライスを一個ずつ読み込んでメモリー上に配置していく // ソース・ブロックを読み込む @@ -1318,7 +1373,7 @@ time_start = GetTickCount(); memcpy(common_buf + packet_off + 16, file_md_ctx.hash, 16); } #ifdef TIMER -time_read += GetTickCount() - time_start; +time_read += clock() - time_start; #endif WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE); // サブ・スレッドの計算終了の合図を待つ @@ -1393,19 +1448,19 @@ time_read += GetTickCount() - time_start; } #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif memcpy(common_buf + common_size, common_buf, common_size); // 後の半分に前半のをコピーする // 最後にパリティ・ブロックのチェックサムを検証して、リカバリ・ファイルに書き込む err = create_recovery_file_1pass(file_path, recovery_path, packet_limit, block_distri, packet_num, common_buf, common_size, footer_buf, footer_size, rcv_hFile, p_buf, NULL, unit_size); #ifdef TIMER -time_write = GetTickCount() - time_start; +time_write = clock() - time_start; #endif #ifdef TIMER -printf("read %d.%03d sec\n", time_read / 1000, time_read % 1000); -printf("write %d.%03d sec\n", time_write / 1000, time_write % 1000); +printf("read %.3f sec\n", (double)time_read / CLOCKS_PER_SEC); +printf("write %.3f sec\n", (double)time_write / CLOCKS_PER_SEC); if (prog_num != prog_base - prog_write * parity_num) printf(" prog_num = %I64d != %I64d\n", prog_num, prog_base - prog_write * parity_num); #endif @@ -1577,7 +1632,7 @@ int encode_method4( // 全てのブロックを断片的に保持する場合 (G #ifdef TIMER read_count = 0; skip_count = 0; -time_start = GetTickCount(); +time_start = clock(); #endif last_file = -1; for (i = 0; i < source_num; i++){ @@ -1678,7 +1733,7 @@ skip_count++; CloseHandle(hFile); hFile = NULL; #ifdef TIMER -time_read += GetTickCount() - time_start; +time_read += clock() - time_start; #endif memset(g_buf, 0, (size_t)unit_size * parity_num); // 待機中に GPU用の領域をゼロ埋めしておく @@ -1931,7 +1986,7 @@ skip_count++; prog_num += th->size * parity_num; #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif // パリティ・ブロックを書き込む work_buf = p_buf; @@ -1982,7 +2037,7 @@ time_start = GetTickCount(); } } #ifdef TIMER -time_write += GetTickCount() - time_start; +time_write += clock() - time_start; #endif block_off += io_size; @@ -2025,7 +2080,7 @@ time_write += GetTickCount() - time_start; } #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif // 最後に Recovery Slice packet のヘッダーを書き込む for (i = 0; i < parity_num; i++){ @@ -2041,13 +2096,13 @@ time_start = GetTickCount(); } } #ifdef TIMER -time_write += GetTickCount() - time_start; +time_write += clock() - time_start; #endif } #ifdef TIMER -printf("read %d.%03d sec\n", time_read / 1000, time_read % 1000); -printf("write %d.%03d sec\n", time_write / 1000, time_write % 1000); +printf("read %.3f sec\n", (double)time_read / CLOCKS_PER_SEC); +printf("write %.3f sec\n", (double)time_write / CLOCKS_PER_SEC); if (prog_num != prog_base) printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base); #endif @@ -2220,7 +2275,7 @@ int encode_method5( // ソース・ブロックの一部とパリティ・ブロ src_off = source_off - 1; // まだ計算して無い印 #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif for (i = 0; i < read_num; i++){ // スライスを一個ずつ読み込んでメモリー上に配置していく // ソース・ブロックを読み込む @@ -2351,7 +2406,7 @@ time_start = GetTickCount(); memcpy(common_buf + packet_off + 16, file_md_ctx.hash, 16); } #ifdef TIMER -time_read += GetTickCount() - time_start; +time_read += clock() - time_start; #endif if (source_off == 0) @@ -2589,19 +2644,19 @@ time_read += GetTickCount() - time_start; } #ifdef TIMER -time_start = GetTickCount(); +time_start = clock(); #endif memcpy(common_buf + common_size, common_buf, common_size); // 後の半分に前半のをコピーする // 最後にパリティ・ブロックのチェックサムを検証して、リカバリ・ファイルに書き込む err = create_recovery_file_1pass(file_path, recovery_path, packet_limit, block_distri, packet_num, common_buf, common_size, footer_buf, footer_size, rcv_hFile, p_buf, g_buf, unit_size); #ifdef TIMER -time_write = GetTickCount() - time_start; +time_write = clock() - time_start; #endif #ifdef TIMER -printf("read %d.%03d sec\n", time_read / 1000, time_read % 1000); -printf("write %d.%03d sec\n", time_write / 1000, time_write % 1000); +printf("read %.3f sec\n", (double)time_read / CLOCKS_PER_SEC); +printf("write %.3f sec\n", (double)time_write / CLOCKS_PER_SEC); if (prog_num != prog_base - prog_write * parity_num) printf(" prog_num = %I64d != %I64d\n", prog_num, prog_base - prog_write * parity_num); #endif diff --git a/source/par2j/source.cl b/source/par2j/source.cl index 11279d3..d5af15d 100644 --- a/source/par2j/source.cl +++ b/source/par2j/source.cl @@ -1,10 +1,14 @@ void calc_table(__local uint *mtab, int id, int factor) { - int i, sum = 0; + int i, sum, mask; - for (i = 0; i < 8; i++){ - sum = (id & (1 << i)) ? (sum ^ factor) : sum; - factor = (factor & 0x8000) ? ((factor << 1) ^ 0x1100B) : (factor << 1); + mask = (id & 1) ? 0xFFFF : 0; + sum = mask & factor; + for (i = 1; i < 8; i++){ + mask = (factor & 0x8000) ? 0x1100B : 0; + factor = (factor << 1) ^ mask; + mask = (id & (1 << i)) ? 0xFFFF : 0; + sum ^= mask & factor; } mtab[id] = sum; @@ -14,6 +18,32 @@ void calc_table(__local uint *mtab, int id, int factor) mtab[id + 256] = sum; } +void calc_table2(__local uint *mtab, int id, int factor, int factor2) +{ + int i, sum, sum2, mask; + + mask = (id & 1) ? 0xFFFF : 0; + sum = mask & factor; + sum2 = mask & factor2; + for (i = 1; i < 8; i++){ + mask = (factor & 0x8000) ? 0x1100B : 0; + factor = (factor << 1) ^ mask; + mask = (factor2 & 0x8000) ? 0x1100B : 0; + factor2 = (factor2 << 1) ^ mask; + mask = (id & (1 << i)) ? 0xFFFF : 0; + sum ^= mask & factor; + sum2 ^= mask & factor2; + } + mtab[id] = sum | (sum2 << 16); + + sum = (sum << 4) ^ (((sum << 16) >> 31) & 0x88058) ^ (((sum << 17) >> 31) & 0x4402C) ^ (((sum << 18) >> 31) & 0x22016) ^ (((sum << 19) >> 31) & 0x1100B); + sum = (sum << 4) ^ (((sum << 16) >> 31) & 0x88058) ^ (((sum << 17) >> 31) & 0x4402C) ^ (((sum << 18) >> 31) & 0x22016) ^ (((sum << 19) >> 31) & 0x1100B); + sum2 = (sum2 << 4) ^ (((sum2 << 16) >> 31) & 0x88058) ^ (((sum2 << 17) >> 31) & 0x4402C) ^ (((sum2 << 18) >> 31) & 0x22016) ^ (((sum2 << 19) >> 31) & 0x1100B); + sum2 = (sum2 << 4) ^ (((sum2 << 16) >> 31) & 0x88058) ^ (((sum2 << 17) >> 31) & 0x4402C) ^ (((sum2 << 18) >> 31) & 0x22016) ^ (((sum2 << 19) >> 31) & 0x1100B); + + mtab[id + 256] = sum | (sum2 << 16); +} + __kernel void method1( __global uint *src, __global uint *dst, @@ -31,6 +61,7 @@ __kernel void method1( dst[i] = 0; for (blk = 0; blk < blk_num; blk++){ + barrier(CLK_LOCAL_MEM_FENCE); calc_table(mtab, table_id, factors[blk]); barrier(CLK_LOCAL_MEM_FENCE); @@ -42,7 +73,6 @@ __kernel void method1( dst[i] ^= sum; } src += BLK_SIZE; - barrier(CLK_LOCAL_MEM_FENCE); } } @@ -65,6 +95,7 @@ __kernel void method2( } for (blk = 0; blk < blk_num; blk++){ + barrier(CLK_LOCAL_MEM_FENCE); calc_table(mtab, table_id, factors[blk]); barrier(CLK_LOCAL_MEM_FENCE); @@ -82,11 +113,10 @@ __kernel void method2( dst[pos + 4] ^= ((sum1 & 0xFF00FF00) >> 8) | (sum2 & 0xFF00FF00); } src += BLK_SIZE; - barrier(CLK_LOCAL_MEM_FENCE); } } -__kernel void method3( +__kernel void method4( __global uint4 *src, __global uint4 *dst, __global ushort *factors, @@ -106,6 +136,7 @@ __kernel void method3( } for (blk = 0; blk < blk_num; blk++){ + barrier(CLK_LOCAL_MEM_FENCE); calc_table(mtab, table_id, factors[blk]); barrier(CLK_LOCAL_MEM_FENCE); @@ -124,11 +155,140 @@ __kernel void method3( dst[i + 1] ^= as_uint4((uchar16)(r0.y, r0.w, r1.y, r1.w, r2.y, r2.w, r3.y, r3.w, r4.y, r4.w, r5.y, r5.w, r6.y, r6.w, r7.y, r7.w)); } src += BLK_SIZE / 4; - barrier(CLK_LOCAL_MEM_FENCE); } } -__kernel void method4( +__kernel void method9( + __global uint *src, + __global uint *dst, + __global ushort *factors, + int blk_num) +{ + __local uint mtab[512]; + int i, blk; + uint v, sum, sum2; + const int work_id = get_global_id(0); + const int work_size = get_global_size(0); + const int table_id = get_local_id(0); + + for (i = work_id; i < BLK_SIZE; i += work_size){ + dst[i] = 0; + dst[i + BLK_SIZE] = 0; + } + + for (blk = 0; blk < blk_num; blk++){ + barrier(CLK_LOCAL_MEM_FENCE); + calc_table2(mtab, table_id, factors[blk], factors[blk_num + blk]); + barrier(CLK_LOCAL_MEM_FENCE); + + for (i = work_id; i < BLK_SIZE; i += work_size){ + v = src[i]; + sum = mtab[(uchar)v] ^ mtab[256 + (uchar)(v >> 8)]; + sum2 = mtab[(uchar)(v >> 16)] ^ mtab[256 + (v >> 24)]; + dst[i] ^= (sum & 0xFFFF) | (sum2 << 16); + dst[i + BLK_SIZE] ^= (sum >> 16) | (sum2 & 0xFFFF0000); + } + src += BLK_SIZE; + } +} + +__kernel void method10( + __global uint *src, + __global uint *dst, + __global ushort *factors, + int blk_num) +{ + __local uint mtab[512]; + int i, blk, pos; + uint lo, hi, sum1, sum2, sum3, sum4; + const int work_id = get_global_id(0) * 2; + const int work_size = get_global_size(0) * 2; + const int table_id = get_local_id(0); + + for (i = work_id; i < BLK_SIZE; i += work_size){ + dst[i ] = 0; + dst[i + 1] = 0; + dst[i + BLK_SIZE ] = 0; + dst[i + BLK_SIZE + 1] = 0; + } + + for (blk = 0; blk < blk_num; blk++){ + barrier(CLK_LOCAL_MEM_FENCE); + calc_table2(mtab, table_id, factors[blk], factors[blk_num + blk]); + barrier(CLK_LOCAL_MEM_FENCE); + + for (i = work_id; i < BLK_SIZE; i += work_size){ + pos = (i & ~7) + ((i & 7) >> 1); + lo = src[pos ]; + hi = src[pos + 4]; + sum1 = mtab[(uchar)lo] ^ mtab[256 + (uchar)hi]; + sum2 = mtab[(uchar)(lo >> 8)] ^ mtab[256 + (uchar)(hi >> 8)]; + sum3 = mtab[(uchar)(lo >> 16)] ^ mtab[256 + (uchar)(hi >> 16)]; + sum4 = mtab[lo >> 24] ^ mtab[256 + (hi >> 24)]; + dst[pos ] ^= (sum1 & 0xFF) | ((sum2 & 0xFF) << 8) | ((sum3 & 0xFF) << 16) | (sum4 << 24); + dst[pos + 4] ^= ((sum1 >> 8) & 0xFF) | (sum2 & 0xFF00) | ((sum3 & 0xFF00) << 8) | ((sum4 & 0xFF00) << 16); + dst[pos + BLK_SIZE ] ^= ((sum1 >> 16) & 0xFF) | ((sum2 >> 8) & 0xFF00) | (sum3 & 0xFF0000) | ((sum4 & 0xFF0000) << 8); + dst[pos + BLK_SIZE + 4] ^= (sum1 >> 24) | ((sum2 >> 16) & 0xFF00) | ((sum3 >> 8) & 0xFF0000) | (sum4 & 0xFF000000); + } + src += BLK_SIZE; + } +} + +__kernel void method12( + __global uint4 *src, + __global uint4 *dst, + __global ushort *factors, + int blk_num) +{ + __local uint mtab[512]; + int i, blk; + uchar4 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, rA, rB, rC, rD, rE, rF; + uchar16 lo, hi; + const int work_id = get_global_id(0) * 2; + const int work_size = get_global_size(0) * 2; + const int table_id = get_local_id(0); + + for (i = work_id; i < BLK_SIZE / 4; i += work_size){ + dst[i ] = 0; + dst[i + 1] = 0; + dst[i + BLK_SIZE / 4 ] = 0; + dst[i + BLK_SIZE / 4 + 1] = 0; + } + + for (blk = 0; blk < blk_num; blk++){ + barrier(CLK_LOCAL_MEM_FENCE); + calc_table2(mtab, table_id, factors[blk], factors[blk_num + blk]); + barrier(CLK_LOCAL_MEM_FENCE); + + for (i = work_id; i < BLK_SIZE / 4; i += work_size){ + lo = as_uchar16(src[i ]); + hi = as_uchar16(src[i + 1]); + r0 = as_uchar4(mtab[lo.s0] ^ mtab[256 + hi.s0]); + r1 = as_uchar4(mtab[lo.s1] ^ mtab[256 + hi.s1]); + r2 = as_uchar4(mtab[lo.s2] ^ mtab[256 + hi.s2]); + r3 = as_uchar4(mtab[lo.s3] ^ mtab[256 + hi.s3]); + r4 = as_uchar4(mtab[lo.s4] ^ mtab[256 + hi.s4]); + r5 = as_uchar4(mtab[lo.s5] ^ mtab[256 + hi.s5]); + r6 = as_uchar4(mtab[lo.s6] ^ mtab[256 + hi.s6]); + r7 = as_uchar4(mtab[lo.s7] ^ mtab[256 + hi.s7]); + r8 = as_uchar4(mtab[lo.s8] ^ mtab[256 + hi.s8]); + r9 = as_uchar4(mtab[lo.s9] ^ mtab[256 + hi.s9]); + rA = as_uchar4(mtab[lo.sa] ^ mtab[256 + hi.sa]); + rB = as_uchar4(mtab[lo.sb] ^ mtab[256 + hi.sb]); + rC = as_uchar4(mtab[lo.sc] ^ mtab[256 + hi.sc]); + rD = as_uchar4(mtab[lo.sd] ^ mtab[256 + hi.sd]); + rE = as_uchar4(mtab[lo.se] ^ mtab[256 + hi.se]); + rF = as_uchar4(mtab[lo.sf] ^ mtab[256 + hi.sf]); + dst[i ] ^= as_uint4((uchar16)(r0.x, r1.x, r2.x, r3.x, r4.x, r5.x, r6.x, r7.x, r8.x, r9.x, rA.x, rB.x, rC.x, rD.x, rE.x, rF.x)); + dst[i + 1] ^= as_uint4((uchar16)(r0.y, r1.y, r2.y, r3.y, r4.y, r5.y, r6.y, r7.y, r8.y, r9.y, rA.y, rB.y, rC.y, rD.y, rE.y, rF.y)); + dst[i + BLK_SIZE / 4 ] ^= as_uint4((uchar16)(r0.z, r1.z, r2.z, r3.z, r4.z, r5.z, r6.z, r7.z, r8.z, r9.z, rA.z, rB.z, rC.z, rD.z, rE.z, rF.z)); + dst[i + BLK_SIZE / 4 + 1] ^= as_uint4((uchar16)(r0.w, r1.w, r2.w, r3.w, r4.w, r5.w, r6.w, r7.w, r8.w, r9.w, rA.w, rB.w, rC.w, rD.w, rE.w, rF.w)); + } + src += BLK_SIZE / 4; + } +} + +__kernel void method16( __global uint *src, __global uint *dst, __global ushort *factors, @@ -136,7 +296,7 @@ __kernel void method4( { __local int table[16]; __local uint cache[256]; - int i, j, blk, pos, sht, mask; + int i, j, blk, pos, mask, tmp; uint sum; const int work_id = get_global_id(0); const int work_size = get_global_size(0); @@ -146,11 +306,12 @@ __kernel void method4( for (blk = 0; blk < blk_num; blk++){ if (get_local_id(0) == 0){ - pos = factors[blk] << 16; - table[0] = pos; + tmp = factors[blk]; + table[0] = tmp; for (j = 1; j < 16; j++){ - pos = (pos << 1) ^ ((pos >> 31) & 0x100B0000); - table[j] = pos; + mask = (tmp & 0x8000) ? 0x1100B : 0; + tmp = (tmp << 1) ^ mask; + table[j] = tmp; } } barrier(CLK_LOCAL_MEM_FENCE); @@ -161,10 +322,11 @@ __kernel void method4( barrier(CLK_LOCAL_MEM_FENCE); sum = 0; - sht = (i & 60) >> 2; + tmp = (i & 60) >> 2; + tmp = 0x8000 >> tmp; pos &= ~60; for (j = 15; j >= 0; j--){ - mask = (table[j] << sht) >> 31; + mask = (table[j] & tmp) ? 0xFFFFFFFF : 0; sum ^= mask & cache[pos]; pos += 4; }