From 1552fb8ec8a5d1fe8e698552aa8dae4569eab523 Mon Sep 17 00:00:00 2001
From: Yutaka Sawada <60930312+Yutaka-Sawada@users.noreply.github.com>
Date: Tue, 26 Dec 2023 18:53:12 +0900
Subject: [PATCH] Add files via upload

---
 source/par2j/Command_par2j.txt |  24 +-
 source/par2j/create.c          |  47 ++--
 source/par2j/gf16.c            |   2 +-
 source/par2j/gf16.h            |   2 +-
 source/par2j/lib_opencl.c      | 418 ++++++++++++++++++++++++---------
 source/par2j/lib_opencl.h      |   7 +-
 source/par2j/list.c            |  35 +--
 source/par2j/md5_crc.c         | 161 +++++++------
 source/par2j/par2_cmd.c        |  10 +-
 source/par2j/reedsolomon.c     |  33 +--
 source/par2j/rs_decode.c       | 194 +++++++++------
 source/par2j/rs_encode.c       | 223 +++++++++++-------
 source/par2j/source.cl         | 194 +++++++++++++--
 13 files changed, 920 insertions(+), 430 deletions(-)

diff --git a/source/par2j/Command_par2j.txt b/source/par2j/Command_par2j.txt
index 22e761f..bb52812 100644
--- a/source/par2j/Command_par2j.txt
+++ b/source/par2j/Command_par2j.txt
@@ -1,4 +1,4 @@
-[ par2j.exe - version 1.3.3.1 or later ]
+[ par2j.exe - version 1.3.3.2 or later ]
 
 Type "par2j.exe" to see version, test integrity, and show usage below.
 
@@ -369,14 +369,22 @@ The format is "/lc#" (# is from 1 to 32 as the number of using threads).
   0: It uses the number of physical Cores.
 255: It tries to use more threads than number of physical Cores.
 
- You may set additional combinations;
-+1024 to disable CLMUL (and use slower SSSE3 code),
-+2048 to disable JIT (for SSE2),
-+4096 to disable SSSE3,
-+8192 to disable AVX2,
-+256 or +512 (slower device) to enable GPU acceleration.
+ You may set additional combinations for CPU feature;
++1024 to disable CLMUL (and use slower SSSE3 code)
++2048 to disable JIT (for SSE2)
++4096 to disable SSSE3
++8192 to disable AVX2
 
- for example,  /lc1 to use single Core, /lc45 to use half Cores and GPU
+ You may set additional combinations for GPU control;
++256 or +512 (slower device) to enable GPU acceleration
++65536 for classic method
++131072 for 16-byte memory access
++262144 for 4-byte memory access and calculate 2 blocks at once
++524288 for 16-byte memory access and calculate 2 blocks at once
++1048576 for CL_MEM_COPY_HOST_PTR or +2097152 for CL_MEM_USE_HOST_PTR
+(When you set exclusive bits, larger value will be used.)
+
+ for example,  /lc1 to use single Core, /lc508 to use half Cores and GPU
 
  /m :
  Set this, if you want to set memory usage.
diff --git a/source/par2j/create.c b/source/par2j/create.c
index 113239b..54ebd88 100644
--- a/source/par2j/create.c
+++ b/source/par2j/create.c
@@ -1,5 +1,5 @@
 ﻿// create.c
-// Copyright : 2023-10-22 Yutaka Sawada
+// Copyright : 2023-12-12 Yutaka Sawada
 // License : GPL
 
 #ifndef _UNICODE
@@ -26,6 +26,11 @@
 
 //#define TIMER // 実験用
 
+#ifdef TIMER
+#include <time.h>
+static double time_sec, time_speed;
+#endif
+
 // ソート時に項目を比較する
 static int sort_cmp(const void *elem1, const void *elem2)
 {
@@ -196,7 +201,7 @@ int set_common_packet(
 	__int64 prog_now = 0;
 
 #ifdef TIMER
-unsigned int time_start = GetTickCount();
+clock_t time_start = clock();
 #endif
 	print_progress_text(0, "Computing file hash");
 
@@ -305,14 +310,14 @@ unsigned int time_start = GetTickCount();
 	off += (64 + main_packet_size);
 
 #ifdef TIMER
-time_start = GetTickCount() - time_start;
-printf("hash %d.%03d sec", time_start / 1000, time_start % 1000);
-if (time_start > 0){
-	time_start = (int)((total_file_size * 125) / ((__int64)time_start * 131072));
-	printf(", %d MB/s\n", time_start);
+time_start = clock() - time_start;
+time_sec = (double)time_start / CLOCKS_PER_SEC;
+if (time_sec > 0){
+	time_speed = (double)total_file_size / (time_sec * 1048576);
 } else {
-	printf("\n");
+	time_speed = 0;
 }
+printf("hash %.3f sec, %.0f MB/s\n", time_sec, time_speed);
 #endif
 
 error_end:
@@ -341,7 +346,7 @@ int set_common_packet_multi(
 	FILE_HASH_TH th[MAX_MULTI_READ];
 
 #ifdef TIMER
-unsigned int time_start = GetTickCount();
+clock_t time_start = clock();
 #endif
 
 	memset(hSub, 0, sizeof(HANDLE) * MAX_MULTI_READ);
@@ -545,14 +550,14 @@ unsigned int time_start = GetTickCount();
 	}
 	print_progress_done();	// 改行して行の先頭に戻しておく
 #ifdef TIMER
-time_start = GetTickCount() - time_start;
-printf("hash %d.%03d sec", time_start / 1000, time_start % 1000);
-if (time_start > 0){
-	time_start = (int)((total_file_size * 125) / ((__int64)time_start * 131072));
-	printf(", %d MB/s\n", time_start);
+time_start = clock() - time_start;
+time_sec = (double)time_start / CLOCKS_PER_SEC;
+if (time_sec > 0){
+	time_speed = (double)total_file_size / (time_sec * 1048576);
 } else {
-	printf("\n");
+	time_speed = 0;
 }
+printf("hash %.3f sec, %.0f MB/s\n", time_sec, time_speed);
 #endif
 
 error_end:
@@ -700,7 +705,7 @@ int set_common_packet_hash(
 	__int64 prog_now = 0;
 
 #ifdef TIMER
-unsigned int time_start = GetTickCount();
+clock_t time_start = clock();
 #endif
 	print_progress_text(0, "Computing file hash");
 
@@ -740,8 +745,8 @@ unsigned int time_start = GetTickCount();
 	print_progress_done();	// 改行して行の先頭に戻しておく
 
 #ifdef TIMER
-time_start = GetTickCount() - time_start;
-printf("hash %d.%03d sec\n", time_start / 1000, time_start % 1000);
+time_start = clock() - time_start;
+printf("hash %.3f sec\n", (double)time_start / CLOCKS_PER_SEC);
 #endif
 	return 0;
 }
@@ -1065,7 +1070,7 @@ int create_recovery_file(
 #endif
 
 #ifdef TIMER
-unsigned int time_start = GetTickCount();
+clock_t time_start = clock();
 #endif
 	print_progress_text(0, "Constructing recovery file");
 	time_last = GetTickCount();
@@ -1258,8 +1263,8 @@ unsigned int time_start = GetTickCount();
 	print_progress_done();	// 改行して行の先頭に戻しておく
 
 #ifdef TIMER
-time_start = GetTickCount() - time_start;
-printf("write %d.%03d sec\n", time_start / 1000, time_start % 1000);
+time_start = clock() - time_start;
+printf("write %.3f sec\n", (double)time_start / CLOCKS_PER_SEC);
 #endif
 
 	return 0;
diff --git a/source/par2j/gf16.c b/source/par2j/gf16.c
index 754794d..36ad81a 100644
--- a/source/par2j/gf16.c
+++ b/source/par2j/gf16.c
@@ -2795,7 +2795,7 @@ void galois_align_xor(
 void galois_align16_multiply(
 	unsigned char *r1,	// Region to multiply (must be aligned by 16)
 	unsigned char *r2,	// Products go here
-	unsigned int len,	// Byte length (must be multiple of 32)
+	unsigned int len,	// Byte length (must be multiple of 16)
 	int factor)			// Number to multiply by
 {
 	if (factor <= 1){
diff --git a/source/par2j/gf16.h b/source/par2j/gf16.h
index 52e95b8..d797f7e 100644
--- a/source/par2j/gf16.h
+++ b/source/par2j/gf16.h
@@ -6,7 +6,7 @@ extern "C" {
 #endif
 
 
-extern unsigned short *galois_log_table;
+//extern unsigned short *galois_log_table;
 extern unsigned int cpu_flag;
 
 int galois_create_table(void);	// Returns 0 on success, -1 on failure
diff --git a/source/par2j/lib_opencl.c b/source/par2j/lib_opencl.c
index ea4411a..455cd17 100644
--- a/source/par2j/lib_opencl.c
+++ b/source/par2j/lib_opencl.c
@@ -1,5 +1,5 @@
 ﻿// lib_opencl.c
-// Copyright : 2023-11-27 Yutaka Sawada
+// Copyright : 2023-12-26 Yutaka Sawada
 // License : GPL
 
 #ifndef _WIN32_WINNT
@@ -84,7 +84,7 @@ cl_command_queue OpenCL_command = NULL;
 cl_kernel OpenCL_kernel = NULL;
 cl_mem OpenCL_src = NULL, OpenCL_dst = NULL, OpenCL_buf = NULL;
 size_t OpenCL_group_num;
-int OpenCL_method = 0;	// 正=速い機器を選ぶ, 負=遅い機器を選ぶ
+int OpenCL_method = 0;	// 標準では GPU を使わず、動作は自動選択される
 
 API_clCreateBuffer gfn_clCreateBuffer;
 API_clReleaseMemObject gfn_clReleaseMemObject;
@@ -100,7 +100,11 @@ API_clEnqueueNDRangeKernel gfn_clEnqueueNDRangeKernel;
 
 /*
 入力
-OpenCL_method : どのデバイスを選ぶか
+OpenCL_method : どのデバイスや関数を選ぶか
+    0x100 = 速い機器を選ぶ,          0x200 = 遅い機器を選ぶ
+  0x10000 = 1ブロックずつ計算する, 0x20000 = 2ブロックずつ計算しようとする
+  0x40000 = 4-byte memory access,  0x80000 = try 16-byte memory access
+ 0x100000 = CL_MEM_COPY_HOST_PTR, 0x200000 = CL_MEM_USE_HOST_PTR
 unit_size : ブロックの単位サイズ
 src_max : ソース・ブロック個数
 
@@ -111,11 +115,12 @@ OpenCL_method : 動作フラグいろいろ
 */
 
 // 0=成功, 1～エラー番号
-int init_OpenCL(int unit_size, int *src_max)
+int init_OpenCL(unsigned int unit_size, int *src_max)
 {
 	char buf[2048], *p_source;
 	int err = 0, i, j;
-	int gpu_power, count, gpu_flag;
+	int gpu_power, count;
+	int unified_memory;	// non zero = Integrated GPU
 	size_t data_size, alloc_max;
 	//FILE *fp;
 	HRSRC res;
@@ -136,9 +141,10 @@ int init_OpenCL(int unit_size, int *src_max)
 	API_clReleaseProgram fn_clReleaseProgram;
 	API_clCreateKernel fn_clCreateKernel;
 	API_clGetKernelWorkGroupInfo fn_clGetKernelWorkGroupInfo;
+	API_clReleaseKernel fn_clReleaseKernel;
 	cl_int ret;
 	cl_uint num_platforms = 0, num_devices = 0, num_groups, param_value;
-	cl_ulong param_value8, param_value4;
+	cl_ulong param_value8;
 	cl_platform_id platform_id[MAX_DEVICE], selected_platform;	// Intel, AMD, Nvidia などドライバーの提供元
 	cl_device_id device_id[MAX_DEVICE], selected_device;	// CPU や GPU など
 	cl_program program;
@@ -215,6 +221,9 @@ int init_OpenCL(int unit_size, int *src_max)
 	fn_clGetKernelWorkGroupInfo = (API_clGetKernelWorkGroupInfo)GetProcAddress(hLibOpenCL, "clGetKernelWorkGroupInfo");
 	if (fn_clGetKernelWorkGroupInfo == NULL)
 		return err;
+	fn_clReleaseKernel = (API_clReleaseKernel)GetProcAddress(hLibOpenCL, "clReleaseKernel");
+	if (fn_clReleaseKernel == NULL)
+		return err;
 	gfn_clFinish = (API_clFinish)GetProcAddress(hLibOpenCL, "clFinish");
 	if (gfn_clFinish == NULL)
 		return err;
@@ -226,12 +235,10 @@ int init_OpenCL(int unit_size, int *src_max)
 	ret = fn_clGetPlatformIDs(MAX_DEVICE, platform_id, &num_platforms);
 	if (ret != CL_SUCCESS)
 		return (ret << 8) | 10;
-	if (OpenCL_method >= 0){	// 選択する順序と初期値を変える
-		OpenCL_method = 1;
-		gpu_power = 0;
-	} else {
-		OpenCL_method = -1;
+	if (OpenCL_method & 0x200){	// 選択する順序と初期値を変える
 		gpu_power = INT_MIN;
+	} else {
+		gpu_power = 0;
 	}
 	alloc_max = 0;
 
@@ -268,20 +275,17 @@ int init_OpenCL(int unit_size, int *src_max)
 #endif
 
 			// 取得できなくてもエラーにしない
-			param_value = 0;	// CL_DEVICE_HOST_UNIFIED_MEMORY は OpenCL 2.0 以降で非推奨になった
 			ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_uint), &param_value, NULL);
+			if (ret == CL_SUCCESS){
+				if (param_value != 0){
 #ifdef DEBUG_OUTPUT
-			if (ret == CL_SUCCESS)
-				printf("HOST_UNIFIED_MEMORY = %d\n", param_value);
-#endif
-			if (param_value != 0)
-				param_value = 1;
-			param_value4 = 0;	// local memory が多い時だけ処理を変える
-			ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &param_value4, NULL);
-#ifdef DEBUG_OUTPUT
-			if (ret == CL_SUCCESS)
-				printf("LOCAL_MEM_SIZE = %I64d KB\n", param_value4 >> 10);
+					printf("HOST_UNIFIED_MEMORY = %d\n", param_value);
 #endif
+					param_value = 1;
+				}
+			} else {	// CL_DEVICE_HOST_UNIFIED_MEMORY は OpenCL 2.0 以降で非推奨になった
+				param_value = 0;
+			}
 
 			// 取得できない場合はエラーにする
 			ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &param_value8, NULL);
@@ -302,7 +306,8 @@ int init_OpenCL(int unit_size, int *src_max)
 #endif
 			// MAX_COMPUTE_UNITS * MAX_WORK_GROUP_SIZE で計算力を測る、外付けGPUなら値を倍にする
 			count = (2 - param_value) * (int)data_size * num_groups;
-			count *= OpenCL_method;	// 符号を変える
+			if (OpenCL_method & 0x200)	// Prefer slower device
+				count *= -1;	// 符号を変える
 			//printf("prev = %d, now = %d\n", gpu_power, count);
 			if ((count > gpu_power) && (data_size >= 256) &&	// 256以上ないとテーブルを作れない
 					(param_value8 / 8 > (cl_ulong)unit_size)){	// CL_DEVICE_MAX_MEM_ALLOC_SIZE に収まるか
@@ -311,9 +316,7 @@ int init_OpenCL(int unit_size, int *src_max)
 				selected_platform = platform_id[i];
 				OpenCL_group_num = num_groups;	// ワークグループ数は COMPUTE_UNITS 数にする
 				alloc_max = (size_t)param_value8;
-				gpu_flag = param_value;	// 0 = discrete GPU, 1 = integrated GPU
-				if (param_value4 >= 32768)
-					gpu_flag |= 2;	// local memory が 32KB 以上あるかどうか
+				unified_memory = param_value;	// 0 = discrete GPU, 1 = integrated GPU
 
 				// AMD や Intel の GPU ではメモリー領域が全体の 1/4 とは限らない
 				ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &param_value8, NULL);
@@ -355,49 +358,6 @@ int init_OpenCL(int unit_size, int *src_max)
 	if (ret != CL_SUCCESS)
 		return (ret << 8) | 12;
 
-	// 計算方式を選択する
-	if ((((cpu_flag & 0x101) == 1) || ((cpu_flag & 0x110) == 0x10)) && (sse_unit == 32)){
-		if (gpu_flag & 2){
-			OpenCL_method = 3;	// local memory が 32KB 以上あれば 16-byte ずつアクセスする
-		} else {
-			OpenCL_method = 2;	// SSSE3 & ALTMAP または AVX2 ならデータの並び替え対応版を使う
-		}
-	} else if (((cpu_flag & 128) != 0) && (sse_unit == 256)){
-		OpenCL_method = 4;	// JIT(SSE2) は bit ごとに上位から 16バイトずつ並ぶ
-		// ローカルのテーブルサイズが異なることに注意
-		// XOR 方式以外は 2KB (4バイト * 256項目 * 2個) 使う
-		// XOR (JIT) は 64バイト (4バイト * 16項目) 使う
-	} else {
-		OpenCL_method = 1;	// 並び替えられてないデータ用
-	}
-
-	// work group 数が必要以上に多い場合は減らす
-	if (OpenCL_method == 2){
-		// work item 一個が 8バイトずつ計算する、256個なら work group ごとに 2KB 担当する
-		data_size = unit_size / 2048;
-	} else if (OpenCL_method == 3){
-		// work item 一個が 32バイトずつ計算する、256個なら work group ごとに 8KB 担当する
-		data_size = unit_size / 8192;
-	} else {
-		// work item 一個が 4バイトずつ計算する、256個なら work group ごとに 1KB 担当する
-		data_size = unit_size / 1024;
-	}
-	if (OpenCL_group_num > data_size){
-		OpenCL_group_num = data_size;
-		printf("Number of work groups is reduced to %zd\n", OpenCL_group_num);
-	}
-
-	// データへのアクセス方法をデバイスによって変える
-	if (gpu_flag & 1){
-		OpenCL_method |= 8;	// Integrated GPU なら CL_MEM_USE_HOST_PTR を使う
-	} else {	// Discrete GPU なら NVIDIA のだけ flag を変える
-		ret = fn_clGetDeviceInfo(selected_device, CL_DEVICE_VERSION, sizeof(buf), buf, NULL);
-		if (ret == CL_SUCCESS){
-			if (strstr(buf, "CUDA") != NULL)
-				OpenCL_method |= 8;	// NVIDIA GPU なら CL_MEM_USE_HOST_PTR を使う
-		}
-	}
-
 	// 最大で何ブロック分のメモリー領域を保持できるのか（ここではまだ確保しない）
 	// 後で実際に確保する量はこれよりも少なくなる
 	count = (int)(alloc_max / unit_size);	// 確保できるメモリー量から逆算する
@@ -409,25 +369,6 @@ int init_OpenCL(int unit_size, int *src_max)
 	printf("src buf : %zd KB (%d blocks), possible\n", data_size >> 10, count);
 #endif
 
-	// 出力先は1ブロック分だけあればいい
-	// CL_MEM_ALLOC_HOST_PTRを使えばpinned memoryになるらしい
-	data_size = unit_size;
-	OpenCL_dst = gfn_clCreateBuffer(OpenCL_context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, data_size, NULL, &ret);
-	if (ret != CL_SUCCESS)
-		return (ret << 8) | 13;
-#ifdef DEBUG_OUTPUT
-	printf("dst buf : %zd KB (%zd Bytes), OK\n", data_size >> 10, data_size);
-#endif
-
-	// factor は最大個数分 (src_max個)
-	data_size = sizeof(unsigned short) * (*src_max);
-	OpenCL_buf = gfn_clCreateBuffer(OpenCL_context, CL_MEM_READ_ONLY, data_size, NULL, &ret);
-	if (ret != CL_SUCCESS)
-		return (ret << 8) | 14;
-#ifdef DEBUG_OUTPUT
-	printf("factor buf : %zd Bytes (%d factors), OK\n", data_size, (*src_max));
-#endif
-
 /*
 	// テキスト形式の OpenCL C ソース・コードを読み込む
 	err = 4;
@@ -528,18 +469,208 @@ int init_OpenCL(int unit_size, int *src_max)
 		return (ret << 8) | 21;
 	}
 
-	// カーネル関数を抽出する
-	wsprintfA(buf, "method%d", OpenCL_method & 7);
-	OpenCL_kernel = fn_clCreateKernel(program, buf, &ret);
-	if (ret != CL_SUCCESS)
-		return (ret << 8) | 22;
+	// 計算方式を選択する
+	if ((((cpu_flag & 0x101) == 1) || ((cpu_flag & 0x110) == 0x10)) && (sse_unit == 32)){
+		int select_method;	// SSSE3 & ALTMAP または AVX2 ならデータの並び替え対応版を使う
+		if (OpenCL_method & 0x80000){	// 16-byte and 2 blocks
+			select_method = 12;
+		} else if (OpenCL_method & 0x40000){	// 4-byte and 2 blocks
+			select_method = 10;
+		} else if (OpenCL_method & 0x20000){	// 16-byte
+			select_method = 4;
+		} else if (OpenCL_method & 0x10000){	// 4-byte
+			select_method = 2;
+		} else {	// kernel を作って詳細を確かめる
+			size_t item2, item4, item10, item12;
+			cl_kernel kernel2, kernel4, kernel10, kernel12;
+			item2 = item4 = item10 = item12 = 0;
+			// まずは一番重くて速い奴を調べる
+			wsprintfA(buf, "method%d", 12);
+			kernel12 = fn_clCreateKernel(program, buf, &ret);
+			if (ret == CL_SUCCESS){
+				ret = fn_clGetKernelWorkGroupInfo(kernel12, selected_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &item12, NULL);
+				if (ret == CL_SUCCESS){
 #ifdef DEBUG_OUTPUT
-	printf("CreateKernel : %s\n", buf);
+					printf("\nTesting %s\n", buf);
+					printf("PREFERRED_WORK_GROUP_SIZE_MULTIPLE = %zu\n", item12);
 #endif
+				}
+			}
+			if (item12 >= 32){	// 32以上あれば余裕で動くとみなす
+				select_method = 12;
+				OpenCL_kernel = kernel12;
+#ifdef DEBUG_OUTPUT
+				printf("\nSelected method%d\n", select_method);
+#endif
+			} else {	// 他の奴と比較する
+				wsprintfA(buf, "method%d", 2);
+				kernel2 = fn_clCreateKernel(program, buf, &ret);
+				if (ret == CL_SUCCESS){
+					ret = fn_clGetKernelWorkGroupInfo(kernel2, selected_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &item2, NULL);
+					if (ret == CL_SUCCESS){
+#ifdef DEBUG_OUTPUT
+						printf("\nTesting %s\n", buf);
+						printf("PREFERRED_WORK_GROUP_SIZE_MULTIPLE = %zu\n", item2);
+#endif
+					}
+				}
+				if (item12 >= item2){
+					select_method = 12;
+					OpenCL_kernel = kernel12;
+					ret = fn_clReleaseKernel(kernel2);
+#ifdef DEBUG_OUTPUT
+					if (ret != CL_SUCCESS)
+						printf("clReleaseKernel : Failed\n");
+					printf("\nSelected method%d\n", select_method);
+#endif
+				} else {
+					ret = fn_clReleaseKernel(kernel12);
+#ifdef DEBUG_OUTPUT
+					if (ret != CL_SUCCESS)
+						printf("clReleaseKernel : Failed\n");
+#endif
+					wsprintfA(buf, "method%d", 10);
+					kernel10 = fn_clCreateKernel(program, buf, &ret);
+					if (ret == CL_SUCCESS){
+						ret = fn_clGetKernelWorkGroupInfo(kernel10, selected_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &item10, NULL);
+						if (ret == CL_SUCCESS){
+#ifdef DEBUG_OUTPUT
+							printf("\nTesting %s\n", buf);
+							printf("PREFERRED_WORK_GROUP_SIZE_MULTIPLE = %zu\n", item10);
+#endif
+						}
+					}
+					if (item10 >= item2){
+						select_method = 10;
+						OpenCL_kernel = kernel10;
+						ret = fn_clReleaseKernel(kernel2);
+#ifdef DEBUG_OUTPUT
+						if (ret != CL_SUCCESS)
+							printf("clReleaseKernel : Failed\n");
+						printf("\nSelected method%d\n", select_method);
+#endif
+					} else {
+						wsprintfA(buf, "method%d", 4);
+						kernel4 = fn_clCreateKernel(program, buf, &ret);
+						if (ret == CL_SUCCESS){
+							ret = fn_clGetKernelWorkGroupInfo(kernel4, selected_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &item4, NULL);
+							if (ret == CL_SUCCESS){
+#ifdef DEBUG_OUTPUT
+								printf("\nTesting %s\n", buf);
+								printf("PREFERRED_WORK_GROUP_SIZE_MULTIPLE = %zu\n", item4);
+#endif
+							}
+						}
+						if (item4 >= item2){
+							select_method = 4;
+							OpenCL_kernel = kernel4;
+							ret = fn_clReleaseKernel(kernel2);
+#ifdef DEBUG_OUTPUT
+							if (ret != CL_SUCCESS)
+								printf("clReleaseKernel : Failed\n");
+							printf("\nSelected method%d\n", select_method);
+#endif
+						} else {
+							select_method = 2;
+							OpenCL_kernel = kernel2;
+							ret = fn_clReleaseKernel(kernel4);
+#ifdef DEBUG_OUTPUT
+							if (ret != CL_SUCCESS)
+								printf("clReleaseKernel : Failed\n");
+							printf("\nSelected method%d\n", select_method);
+#endif
+						}
+					}
+				}
+			}
+		}
+		OpenCL_method |= select_method;
+	} else if (((cpu_flag & 128) != 0) && (sse_unit == 256)){
+		OpenCL_method |= 16;	// JIT(SSE2) は bit ごとに上位から 16バイトずつ並ぶ
+		// ローカルのテーブルサイズが異なることに注意
+		// XOR 方式以外は 2KB (4バイト * 256項目 * 2個) 使う
+		// XOR (JIT) は 64バイト (4バイト * 16項目) 使う
+	} else {
+		int select_method;	// 並び替えられてないデータ用
+		if (OpenCL_method & 0x40000){	// 4-byte and 2 blocks
+			select_method = 9;
+		} else if (OpenCL_method & 0x10000){	// 4-byte
+			select_method = 1;
+		} else {	// kernel を作って詳細を確かめる
+			size_t item1, item9;
+			cl_kernel kernel1, kernel9;
+			item1 = item9 = 0;
+			// まずは一番重くて速い奴を調べる
+			wsprintfA(buf, "method%d", 9);
+			kernel9 = fn_clCreateKernel(program, buf, &ret);
+			if (ret == CL_SUCCESS){
+				ret = fn_clGetKernelWorkGroupInfo(kernel9, selected_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &item9, NULL);
+				if (ret == CL_SUCCESS){
+#ifdef DEBUG_OUTPUT
+					printf("\nTesting %s\n", buf);
+					printf("PREFERRED_WORK_GROUP_SIZE_MULTIPLE = %zu\n", item9);
+#endif
+				}
+			}
+			if (item9 >= 32){	// 32以上あれば余裕で動くとみなす
+				select_method = 9;
+				OpenCL_kernel = kernel9;
+#ifdef DEBUG_OUTPUT
+				printf("\nSelected method%d\n", select_method);
+#endif
+			} else {	// 他の奴と比較する
+				wsprintfA(buf, "method%d", 1);
+				kernel1 = fn_clCreateKernel(program, buf, &ret);
+				if (ret == CL_SUCCESS){
+					ret = fn_clGetKernelWorkGroupInfo(kernel1, selected_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &item1, NULL);
+					if (ret == CL_SUCCESS){
+#ifdef DEBUG_OUTPUT
+						printf("\nTesting %s\n", buf);
+						printf("PREFERRED_WORK_GROUP_SIZE_MULTIPLE = %zu\n", item1);
+#endif
+					}
+				}
+				if (item9 >= item1){
+					select_method = 9;
+					OpenCL_kernel = kernel9;
+					ret = fn_clReleaseKernel(kernel1);
+#ifdef DEBUG_OUTPUT
+					if (ret != CL_SUCCESS)
+						printf("clReleaseKernel : Failed\n");
+					printf("\nSelected method%d\n", select_method);
+#endif
+				} else {
+					select_method = 1;
+					OpenCL_kernel = kernel1;
+					ret = fn_clReleaseKernel(kernel9);
+#ifdef DEBUG_OUTPUT
+					if (ret != CL_SUCCESS)
+						printf("clReleaseKernel : Failed\n");
+					printf("\nSelected method%d\n", select_method);
+#endif
+				}
+			}
+		}
+		OpenCL_method |= select_method;
+	}
+
+	// カーネル関数を抽出する
+	if (OpenCL_kernel == NULL){
+		wsprintfA(buf, "method%d", OpenCL_method & 31);
+		OpenCL_kernel = fn_clCreateKernel(program, buf, &ret);
+		if (ret != CL_SUCCESS)
+			return (ret << 8) | 22;
+#ifdef DEBUG_OUTPUT
+		printf("CreateKernel : %s\n", buf);
+		ret = fn_clGetKernelWorkGroupInfo(OpenCL_kernel, selected_device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &data_size, NULL);
+		if (ret == CL_SUCCESS)
+			printf("PREFERRED_WORK_GROUP_SIZE_MULTIPLE = %zu\n", data_size);
+#endif
+	}
 
 	// カーネルが実行できる work item 数を調べる
-	ret = fn_clGetKernelWorkGroupInfo(OpenCL_kernel, NULL, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &data_size, NULL);
-	if ((ret == CL_SUCCESS) && (data_size < 256)){	// 最低でも 256以上は必要
+	ret = fn_clGetKernelWorkGroupInfo(OpenCL_kernel, selected_device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &data_size, NULL);
+	if ((ret == CL_SUCCESS) && (data_size < 256)){	// 最低でも 256 以上は必要
 #ifdef DEBUG_OUTPUT
 		printf("KERNEL_WORK_GROUP_SIZE = %zd\n", data_size);
 #endif
@@ -558,6 +689,60 @@ int init_OpenCL(int unit_size, int *src_max)
 		fn_clUnloadCompiler();
 	}
 
+	// work group 数が必要以上に多い場合は減らす
+	if (OpenCL_method & 4){
+		// work item 一個が 32バイトずつ計算する、256個なら work group ごとに 8KB 担当する
+		data_size = unit_size / 8192;
+	} else if (OpenCL_method & 2){
+		// work item 一個が 8バイトずつ計算する、256個なら work group ごとに 2KB 担当する
+		data_size = unit_size / 2048;
+	} else {
+		// work item 一個が 4バイトずつ計算する、256個なら work group ごとに 1KB 担当する
+		data_size = unit_size / 1024;
+	}
+	if (OpenCL_group_num > data_size){
+		OpenCL_group_num = data_size;
+		printf("Number of work groups is reduced to %zd\n", OpenCL_group_num);
+	}
+
+	// データへのアクセス方法をデバイスによって変える
+	if (OpenCL_method & 0x200000){
+		OpenCL_method |= 32;
+	} else if ((OpenCL_method & 0x100000) == 0){
+		if (unified_memory){
+			OpenCL_method |= 32;	// Integrated GPU なら CL_MEM_USE_HOST_PTR を使う
+		} else {	// Discrete GPU でも Nvidia のは動作を変える
+			ret = fn_clGetDeviceInfo(selected_device, CL_DEVICE_VERSION, sizeof(buf), buf, NULL);
+			if (ret == CL_SUCCESS){
+				if (strstr(buf, "CUDA") != NULL)
+					OpenCL_method |= 32;	// NVIDIA GPU なら CL_MEM_USE_HOST_PTR を使う
+			}
+		}
+	}
+
+	// 出力先は1ブロック分だけあればいい
+	// CL_MEM_ALLOC_HOST_PTRを使えばpinned memoryになるらしい
+	data_size = unit_size;
+	if (OpenCL_method & 8)
+		data_size *= 2;	// 2ブロックずつ計算できるように、2倍確保しておく
+	OpenCL_dst = gfn_clCreateBuffer(OpenCL_context, CL_MEM_ALLOC_HOST_PTR, data_size, NULL, &ret);
+	if (ret != CL_SUCCESS)
+		return (ret << 8) | 13;
+#ifdef DEBUG_OUTPUT
+	printf("dst buf : %zd KB (%zd Bytes), OK\n", data_size >> 10, data_size);
+#endif
+
+	// factor は最大個数分 (src_max個)
+	data_size = sizeof(unsigned short) * (*src_max);
+	if (OpenCL_method & 8)
+		data_size *= 2;	// 2ブロックずつ計算できるように、2倍確保しておく
+	OpenCL_buf = gfn_clCreateBuffer(OpenCL_context, CL_MEM_READ_ONLY, data_size, NULL, &ret);
+	if (ret != CL_SUCCESS)
+		return (ret << 8) | 14;
+#ifdef DEBUG_OUTPUT
+	printf("factor buf : %zd Bytes (%d factors), OK\n", data_size, (*src_max));
+#endif
+
 	// カーネル引数を指定する
 	ret = gfn_clSetKernelArg(OpenCL_kernel, 1, sizeof(cl_mem), &OpenCL_dst);
 	if (ret != CL_SUCCESS)
@@ -565,13 +750,12 @@ int init_OpenCL(int unit_size, int *src_max)
 	ret = gfn_clSetKernelArg(OpenCL_kernel, 2, sizeof(cl_mem), &OpenCL_buf);
 	if (ret != CL_SUCCESS)
 		return (ret << 8) | 102;
-	if (ret != CL_SUCCESS)
-		return (ret << 8) | 103;
 
 #ifdef DEBUG_OUTPUT
 	// ワークアイテム数
 	printf("\nMax number of work items = %zd (256 * %zd)\n", OpenCL_group_num * 256, OpenCL_group_num);
 #endif
+	OpenCL_method &= 0xFF;	// 最後に選択設定を消去する
 
 	return 0;
 }
@@ -683,7 +867,7 @@ void info_OpenCL(char *buf, int buf_size)
 // ソース・ブロックをデバイス側にコピーする
 int gpu_copy_blocks(
 	unsigned char *data,	// ブロックのバッファー (境界は 4096にすること)
-	int unit_size,			// 4096の倍数にすること
+	unsigned int unit_size,	// 4096の倍数にすること
 	int src_num)			// 何ブロックをコピーするのか
 {
 	size_t data_size;
@@ -692,7 +876,7 @@ int gpu_copy_blocks(
 
 	// Integrated GPU と Discrete GPU の違いに関係なく、使う分だけ毎回メモリー領域を確保する
 	data_size = (size_t)unit_size * src_num;
-	if (OpenCL_method & 8){	// AMD's APU や Integrated GPU なら ZeroCopy する
+	if (OpenCL_method & 32){	// AMD's APU や Integrated GPU なら ZeroCopy する
 		// 実際に比較してみると GeForce GPU でもメモリー消費量が少なくてコピーが速い
 		// NVIDIA GPU は CL_MEM_USE_HOST_PTR でも VRAM 上にキャッシュするので速いらしい
 		flags = CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR;
@@ -719,17 +903,31 @@ int gpu_copy_blocks(
 int gpu_multiply_blocks(
 	int src_num,			// Number of multiplying source blocks
 	unsigned short *mat,	// Matrix of numbers to multiply by
+	unsigned short *mat2,	// Set to calculate 2 blocks at once
 	unsigned char *buf,		// Products go here
-	int len)				// Byte length
+	unsigned int len)		// Byte length
 {
 	unsigned __int64 *vram, *src, *dst;
 	size_t global_size, local_size;
 	cl_int ret;
 
 	// 倍率の配列をデバイス側に書き込む
-	ret = gfn_clEnqueueWriteBuffer(OpenCL_command, OpenCL_buf, CL_FALSE, 0, sizeof(short) * src_num, mat, 0, NULL, NULL);
+	if (mat2 == NULL){	// 1ブロック分だけコピーする
+		ret = gfn_clEnqueueWriteBuffer(OpenCL_command, OpenCL_buf, CL_FALSE, 0, sizeof(short) * src_num, mat, 0, NULL, NULL);
+	} else {	// 2ブロックずつ計算する場合は、配列のサイズも２倍になる
+		if ((size_t)mat2 == 1){	// アドレスが 1 になることはあり得ないので、識別できる
+			ret = gfn_clEnqueueWriteBuffer(OpenCL_command, OpenCL_buf, CL_FALSE, 0, sizeof(short) * src_num * 2, mat, 0, NULL, NULL);
+		} else {	// 2回コピーする
+			size_t data_size = sizeof(short) * src_num;
+			ret = gfn_clEnqueueWriteBuffer(OpenCL_command, OpenCL_buf, CL_FALSE, 0, data_size, mat, 0, NULL, NULL);
+			if (ret != CL_SUCCESS)
+				return (ret << 8) | 10;
+			// もう一つの配列は違う場所からコピーする
+			ret = gfn_clEnqueueWriteBuffer(OpenCL_command, OpenCL_buf, CL_FALSE, data_size, data_size, mat2, 0, NULL, NULL);
+		}
+	}
 	if (ret != CL_SUCCESS)
-		return (ret << 8) | 10;
+		return (ret << 8) | 11;
 
 	// 引数を指定する
 	ret = gfn_clSetKernelArg(OpenCL_kernel, 3, sizeof(int), &src_num);
@@ -737,17 +935,17 @@ int gpu_multiply_blocks(
 		return (ret << 8) | 103;
 
 	// カーネル並列実行
-	local_size = 256;	// テーブルやキャッシュのため、work item 数は 256に固定する
+	local_size = 256;	// テーブルやキャッシュのため、work item 数は 256 に固定する
 	global_size = OpenCL_group_num * 256;
-	//printf("group num = %d, global size = %d, local size = 256 \n", OpenCL_group_num, global_size);
+	//printf("group num = %d, global size = %d, local size = %d \n", OpenCL_group_num, global_size, local_size);
 	ret = gfn_clEnqueueNDRangeKernel(OpenCL_command, OpenCL_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
 	if (ret != CL_SUCCESS)
-		return (ret << 8) | 11;
+		return (ret << 8) | 12;
 
 	// 出力内容をホスト側に反映させる
 	vram = gfn_clEnqueueMapBuffer(OpenCL_command, OpenCL_dst, CL_TRUE, CL_MAP_READ, 0, len, 0, NULL, NULL, &ret);
 	if (ret != CL_SUCCESS)
-		return (ret << 8) | 12;
+		return (ret << 8) | 13;
 
 	// 8バイトごとに XOR する (SSE2 で XOR しても速くならず)
 	src = vram;
@@ -762,7 +960,7 @@ int gpu_multiply_blocks(
 	// ホスト側でデータを変更しなくても、clEnqueueMapBufferと対で呼び出さないといけない
 	ret = gfn_clEnqueueUnmapMemObject(OpenCL_command, OpenCL_dst, vram, 0, NULL, NULL);
 	if (ret != CL_SUCCESS)
-		return (ret << 8) | 13;
+		return (ret << 8) | 14;
 
 	return 0;
 }
@@ -775,12 +973,12 @@ int gpu_finish(void)
 	// 全ての処理が終わるのを待つ
 	ret = gfn_clFinish(OpenCL_command);
 	if (ret != CL_SUCCESS)
-		return (ret << 8) | 20;
+		return (ret << 8) | 30;
 
 	if (OpenCL_src != NULL){	// 確保されてる場合は解除する
 		ret = gfn_clReleaseMemObject(OpenCL_src);
 		if (ret != CL_SUCCESS)
-			return (ret << 8) | 21;
+			return (ret << 8) | 31;
 		OpenCL_src = NULL;
 	}
 
diff --git a/source/par2j/lib_opencl.h b/source/par2j/lib_opencl.h
index ddde655..6da24b8 100644
--- a/source/par2j/lib_opencl.h
+++ b/source/par2j/lib_opencl.h
@@ -10,20 +10,21 @@ extern "C" {
 
 extern int OpenCL_method;
 
-int init_OpenCL(int unit_size, int *src_max);
+int init_OpenCL(unsigned int unit_size, int *src_max);
 int free_OpenCL(void);
 void info_OpenCL(char *buf, int buf_size);
 
 int gpu_copy_blocks(
 	unsigned char *data,
-	int unit_size,
+	unsigned int unit_size,
 	int src_num);
 
 int gpu_multiply_blocks(
 	int src_num,			// Number of multiplying source blocks
 	unsigned short *mat,	// Matrix of numbers to multiply by
+	unsigned short *mat2,	// Set to calculate 2 blocks at once
 	unsigned char *buf,		// Products go here
-	int len);				// Byte length
+	unsigned int len);		// Byte length
 
 int gpu_finish(void);
 
diff --git a/source/par2j/list.c b/source/par2j/list.c
index cf826c7..7551ecc 100644
--- a/source/par2j/list.c
+++ b/source/par2j/list.c
@@ -1,5 +1,5 @@
 ﻿// list.c
-// Copyright : 2023-10-15 Yutaka Sawada
+// Copyright : 2023-12-12 Yutaka Sawada
 // License : GPL
 
 #ifndef _UNICODE
@@ -26,6 +26,11 @@
 
 //#define TIMER // 実験用
 
+#ifdef TIMER
+#include <time.h>
+static double time_sec, time_speed;
+#endif
+
 // recovery set のファイルのハッシュ値を調べる (空のファイルは除く)
 // 0x00 = ファイルが存在して完全である
 // 0x01 = ファイルが存在しない
@@ -296,7 +301,7 @@ int check_file_complete(
 {
 	int i, rv;
 #ifdef TIMER
-unsigned int time_start = GetTickCount();
+clock_t time_start = clock();
 #endif
 
 	printf("\nVerifying Input File   :\n");
@@ -332,14 +337,14 @@ unsigned int time_start = GetTickCount();
 	}
 
 #ifdef TIMER
-time_start = GetTickCount() - time_start;
-printf("\n hash %d.%03d sec", time_start / 1000, time_start % 1000);
-if (time_start > 0){
-	time_start = (int)((total_file_size * 125) / ((__int64)time_start * 131072));
-	printf(", %d MB/s\n", time_start);
+time_start = clock() - time_start;
+time_sec = (double)time_start / CLOCKS_PER_SEC;
+if (time_sec > 0){
+	time_speed = (double)total_file_size / (time_sec * 1048576);
 } else {
-	printf("\n");
+	time_speed = 0;
 }
+printf("\n hash %.3f sec, %.0f MB/s\n", time_sec, time_speed);
 #endif
 	return 0;
 }
@@ -364,7 +369,7 @@ int check_file_complete_multi(
 	HANDLE hSub[MAX_READ_NUM];
 	FILE_CHECK_TH th[MAX_READ_NUM];
 #ifdef TIMER
-unsigned int time_start = GetTickCount();
+clock_t time_start = clock();
 #endif
 
 	memset(hSub, 0, sizeof(HANDLE) * MAX_READ_NUM);
@@ -630,14 +635,14 @@ unsigned int time_start = GetTickCount();
 	}
 
 #ifdef TIMER
-time_start = GetTickCount() - time_start;
-printf("\n hash %d.%03d sec", time_start / 1000, time_start % 1000);
-if (time_start > 0){
-	time_start = (int)((total_file_size * 125) / ((__int64)time_start * 131072));
-	printf(", %d MB/s\n", time_start);
+time_start = clock() - time_start;
+time_sec = (double)time_start / CLOCKS_PER_SEC;
+if (time_sec > 0){
+	time_speed = (double)total_file_size / (time_sec * 1048576);
 } else {
-	printf("\n");
+	time_speed = 0;
 }
+printf("\n hash %.3f sec, %.0f MB/s\n", time_sec, time_speed);
 #endif
 
 error_end:
diff --git a/source/par2j/md5_crc.c b/source/par2j/md5_crc.c
index ea7e729..f1940ee 100644
--- a/source/par2j/md5_crc.c
+++ b/source/par2j/md5_crc.c
@@ -1,5 +1,5 @@
 ﻿// md5_crc.c
-// Copyright : 2023-10-29 Yutaka Sawada
+// Copyright : 2023-12-12 Yutaka Sawada
 // License : GPL
 
 #ifndef _UNICODE
@@ -21,7 +21,6 @@
 #include "phmd5.h"
 #include "md5_crc.h"
 
-
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
 // バイト配列の MD5 ハッシュ値を求める
@@ -200,8 +199,10 @@ int file_md5_crc32_block(
 //#define TIMER // 実験用
 
 #ifdef TIMER
-static unsigned int time_start, time1_start;
-static unsigned int time_total = 0, time2_total = 0, time3_total = 0;
+#include <time.h>
+static double time_sec, time_speed;
+static clock_t time_start, time1_start;
+static clock_t time_total = 0, time2_total = 0, time3_total = 0;
 #endif
 
 #define MAX_BUF_SIZE	2097152	// ヒープ領域を使う場合の最大サイズ
@@ -224,7 +225,7 @@ int file_hash_crc(
 	HANDLE hFile;
 	OVERLAPPED ol;
 #ifdef TIMER
-time1_start = GetTickCount();
+time1_start = clock();
 #endif
 
 	// ソース・ファイルを開く
@@ -251,11 +252,11 @@ time1_start = GetTickCount();
 	if (file_left < IO_SIZE)
 		read_size = (unsigned int)file_left;
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 	off = ReadFile(hFile, buf1, read_size, NULL, &ol);
 #ifdef TIMER
-time2_total += GetTickCount() - time_start;
+time2_total += clock() - time_start;
 #endif
 	if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
 		print_win32_err();
@@ -281,11 +282,11 @@ time2_total += GetTickCount() - time_start;
 			ol.OffsetHigh = (unsigned int)(file_off >> 32);
 			file_off += IO_SIZE;
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 			off = ReadFile(hFile, buf, read_size, NULL, &ol);
 #ifdef TIMER
-time2_total += GetTickCount() - time_start;
+time2_total += clock() - time_start;
 #endif
 			if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
 				print_win32_err();
@@ -301,7 +302,7 @@ time2_total += GetTickCount() - time_start;
 		}
 
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 		off = 0;	// チェックサム計算
 		if (block_left > 0){	// 前回足りなかった分を追加する
@@ -338,7 +339,7 @@ time_start = GetTickCount();
 			}
 		}
 #ifdef TIMER
-time3_total += GetTickCount() - time_start;
+time3_total += clock() - time_start;
 #endif
 
 		// 経過表示
@@ -369,16 +370,17 @@ error_end:
 		CloseHandle(ol.hEvent);
 
 #ifdef TIMER
-time_total += GetTickCount() - time1_start;
+time_total += clock() - time1_start;
 if (*prog_now == total_file_size){
-	printf("\nread  %d.%03d sec\n", time2_total / 1000, time2_total % 1000);
-	printf("main  %d.%03d sec\n", time3_total / 1000, time3_total % 1000);
-	if (time_total > 0){
-		time_start = (int)((total_file_size * 125) / ((__int64)time_total * 131072));
+	printf("\nread  %.3f sec\n", (double)time2_total / CLOCKS_PER_SEC);
+	printf("main  %.3f sec\n", (double)time3_total / CLOCKS_PER_SEC);
+	time_sec = (double)time_total / CLOCKS_PER_SEC;
+	if (time_sec > 0){
+		time_speed = (double)total_file_size / (time_sec * 1048576);
 	} else {
-		time_start = 0;
+		time_speed = 0;
 	}
-	printf("total %d.%03d sec, %d MB/s\n", time_total / 1000, time_total % 1000, time_start);
+	printf("total %.3f sec, %.0f MB/s\n", time_sec, time_speed);
 }
 #endif
 	return err;
@@ -403,7 +405,7 @@ int file_hash_crc(
 	HANDLE hFile;
 	OVERLAPPED ol;
 #ifdef TIMER
-time1_start = GetTickCount();
+time1_start = clock();
 #endif
 
 	// ソース・ファイルを開く
@@ -442,11 +444,11 @@ error_retry_read:
 	if (file_left < IO_SIZE)
 		read_size = (unsigned int)file_left;
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 	off = ReadFile(hFile, buf1, read_size, NULL, &ol);
 #ifdef TIMER
-time2_total += GetTickCount() - time_start;
+time2_total += clock() - time_start;
 #endif
 	if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
 		print_win32_err();
@@ -536,11 +538,11 @@ error_retry_pause:
 			ol.OffsetHigh = (unsigned int)(file_off >> 32);
 			file_off += IO_SIZE;
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 			off = ReadFile(hFile, buf, read_size, NULL, &ol);
 #ifdef TIMER
-time2_total += GetTickCount() - time_start;
+time2_total += clock() - time_start;
 #endif
 			if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
 				print_win32_err();
@@ -557,7 +559,7 @@ time2_total += GetTickCount() - time_start;
 		}
 
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 		off = 0;	// チェックサム計算
 		if (block_left > 0){	// 前回足りなかった分を追加する
@@ -594,7 +596,7 @@ time_start = GetTickCount();
 			}
 		}
 #ifdef TIMER
-time3_total += GetTickCount() - time_start;
+time3_total += clock() - time_start;
 #endif
 
 		// 経過表示
@@ -625,16 +627,17 @@ error_end:
 		CloseHandle(ol.hEvent);
 
 #ifdef TIMER
-time_total += GetTickCount() - time1_start;
+time_total += clock() - time1_start;
 if (*prog_now == total_file_size){
-	printf("\nread  %d.%03d sec\n", time2_total / 1000, time2_total % 1000);
-	printf("main  %d.%03d sec\n", time3_total / 1000, time3_total % 1000);
-	if (time_total > 0){
-		time_start = (int)((total_file_size * 125) / ((__int64)time_total * 131072));
+	printf("\nread  %.3f sec\n", (double)time2_total / CLOCKS_PER_SEC);
+	printf("main  %.3f sec\n", (double)time3_total / CLOCKS_PER_SEC);
+	time_sec = (double)time_total / CLOCKS_PER_SEC;
+	if (time_sec > 0){
+		time_speed = (double)total_file_size / (time_sec * 1048576);
 	} else {
-		time_start = 0;
+		time_speed = 0;
 	}
-	printf("total %d.%03d sec, %d MB/s\n", time_total / 1000, time_total % 1000, time_start);
+	printf("total %.3f sec, %.0f MB/s\n", time_sec, time_speed);
 }
 #endif
 	return err;
@@ -660,7 +663,7 @@ int file_hash_crc(
 	HANDLE hFile;
 	OVERLAPPED ol;
 #ifdef TIMER
-time1_start = GetTickCount();
+time1_start = clock();
 #endif
 
 	// ソース・ファイルを開く
@@ -699,11 +702,11 @@ time1_start = GetTickCount();
 	if (file_left < io_size)
 		read_size = (unsigned int)file_left;
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 	off = ReadFile(hFile, buf1, read_size, NULL, &ol);
 #ifdef TIMER
-time2_total += GetTickCount() - time_start;
+time2_total += clock() - time_start;
 #endif
 	if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
 		print_win32_err();
@@ -729,11 +732,11 @@ time2_total += GetTickCount() - time_start;
 			ol.OffsetHigh = (unsigned int)(file_off >> 32);
 			file_off += io_size;
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 			off = ReadFile(hFile, buf, read_size, NULL, &ol);
 #ifdef TIMER
-time2_total += GetTickCount() - time_start;
+time2_total += clock() - time_start;
 #endif
 			if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
 				print_win32_err();
@@ -749,7 +752,7 @@ time2_total += GetTickCount() - time_start;
 		}
 
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 		off = 0;	// チェックサム計算
 		if (block_left > 0){	// 前回足りなかった分を追加する
@@ -786,7 +789,7 @@ time_start = GetTickCount();
 			}
 		}
 #ifdef TIMER
-time3_total += GetTickCount() - time_start;
+time3_total += clock() - time_start;
 #endif
 
 		// 経過表示
@@ -819,16 +822,17 @@ error_end:
 		_aligned_free(buf1);
 
 #ifdef TIMER
-time_total += GetTickCount() - time1_start;
+time_total += clock() - time1_start;
 if (*prog_now == total_file_size){
-	printf("\nread  %d.%03d sec\n", time2_total / 1000, time2_total % 1000);
-	printf("main  %d.%03d sec\n", time3_total / 1000, time3_total % 1000);
-	if (time_total > 0){
-		time_start = (int)((total_file_size * 125) / ((__int64)time_total * 131072));
+	printf("\nread  %.3f sec\n", (double)time2_total / CLOCKS_PER_SEC);
+	printf("main  %.3f sec\n", (double)time3_total / CLOCKS_PER_SEC);
+	time_sec = (double)time_total / CLOCKS_PER_SEC;
+	if (time_sec > 0){
+		time_speed = (double)total_file_size / (time_sec * 1048576);
 	} else {
-		time_start = 0;
+		time_speed = 0;
 	}
-	printf("total %d.%03d sec, %d MB/s\n", time_total / 1000, time_total % 1000, time_start);
+	printf("total %.3f sec, %.0f MB/s\n", time_sec, time_speed);
 }
 #endif
 	return err;
@@ -1038,7 +1042,7 @@ int file_hash_check(
 	PHMD5 hash_ctx, block_ctx;
 	OVERLAPPED ol;
 #ifdef TIMER
-time1_start = GetTickCount();
+time1_start = clock();
 #endif
 
 	prog_last = -1;	// 検証中のファイル名を毎回表示する
@@ -1062,11 +1066,11 @@ time1_start = GetTickCount();
 		file_left = file_size - 16384;	// 本来のファイル・サイズまでしか検査しない
 	}
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 	off = ReadFile(hFile, buf, len, NULL, &ol);
 #ifdef TIMER
-time2_total += GetTickCount() - time_start;
+time2_total += clock() - time_start;
 #endif
 	if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
 		print_win32_err();
@@ -1141,11 +1145,11 @@ time2_total += GetTickCount() - time_start;
 	if (file_left < IO_SIZE)
 		read_size = (unsigned int)file_left;
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 	off = ReadFile(hFile, buf1, read_size, NULL, &ol);
 #ifdef TIMER
-time2_total += GetTickCount() - time_start;
+time2_total += clock() - time_start;
 #endif
 	if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
 		print_win32_err();
@@ -1168,11 +1172,11 @@ time2_total += GetTickCount() - time_start;
 			ol.OffsetHigh = (unsigned int)(file_off >> 32);
 			file_off += IO_SIZE;
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 			off = ReadFile(hFile, buf, read_size, NULL, &ol);
 #ifdef TIMER
-time2_total += GetTickCount() - time_start;
+time2_total += clock() - time_start;
 #endif
 			if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
 				print_win32_err();
@@ -1187,7 +1191,7 @@ time2_total += GetTickCount() - time_start;
 		}
 
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 		if (s_blk != NULL){
 			off = 0;
@@ -1230,7 +1234,7 @@ time_start = GetTickCount();
 			Phmd5Process(&hash_ctx, buf, len);	// MD5 計算
 		}
 #ifdef TIMER
-time3_total += GetTickCount() - time_start;
+time3_total += clock() - time_start;
 #endif
 
 		// 経過表示
@@ -1267,15 +1271,16 @@ error_end:
 		CloseHandle(ol.hEvent);
 
 #ifdef TIMER
-time_total += GetTickCount() - time1_start;
-	printf("\nread  %d.%03d sec\n", time2_total / 1000, time2_total % 1000);
-	printf("main  %d.%03d sec\n", time3_total / 1000, time3_total % 1000);
-	if (time_total > 0){
-		time_start = (int)((file_size * 125) / ((__int64)time_total * 131072));
+time_total += clock() - time1_start;
+	printf("\nread  %.3f sec\n", (double)time2_total / CLOCKS_PER_SEC);
+	printf("main  %.3f sec\n", (double)time3_total / CLOCKS_PER_SEC);
+	time_sec = (double)time_total / CLOCKS_PER_SEC;
+	if (time_sec > 0){
+		time_speed = (double)file_size / (time_sec * 1048576);
 	} else {
-		time_start = 0;
+		time_speed = 0;
 	}
-	printf("total %d.%03d sec, %d MB/s\n", time_total / 1000, time_total % 1000, time_start);
+	printf("total %.3f sec, %.0f MB/s\n", time_sec, time_speed);
 #endif
 	return comp_num;
 }
@@ -1536,7 +1541,7 @@ int file_hash_direct(
 	HANDLE hFile;
 	OVERLAPPED ol;
 #ifdef TIMER
-time1_start = GetTickCount();
+time1_start = clock();
 #endif
 
 	prog_last = -1;	// 検証中のファイル名を毎回表示する
@@ -1592,11 +1597,11 @@ time1_start = GetTickCount();
 		file_left = file_size - 16384;	// 本来のファイル・サイズまでしか検査しない
 	}
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 	off = ReadFile(hFile, buf, read_size, NULL, &ol);
 #ifdef TIMER
-time2_total += GetTickCount() - time_start;
+time2_total += clock() - time_start;
 #endif
 	if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
 		comp_num = -1;
@@ -1679,11 +1684,11 @@ time2_total += GetTickCount() - time_start;
 		read_size = (read_size + 4095) & ~4095;	// 4KB の倍数にする
 	}
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 	off = ReadFile(hFile, buf1, read_size, NULL, &ol);
 #ifdef TIMER
-time2_total += GetTickCount() - time_start;
+time2_total += clock() - time_start;
 #endif
 	if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
 		print_win32_err();
@@ -1710,11 +1715,11 @@ time2_total += GetTickCount() - time_start;
 			ol.OffsetHigh = (unsigned int)(file_off >> 32);
 			file_off += IO_SIZE;
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 			off = ReadFile(hFile, buf, read_size, NULL, &ol);
 #ifdef TIMER
-time2_total += GetTickCount() - time_start;
+time2_total += clock() - time_start;
 #endif
 			if ((off == 0) && (GetLastError() != ERROR_IO_PENDING)){
 				print_win32_err();
@@ -1729,7 +1734,7 @@ time2_total += GetTickCount() - time_start;
 		}
 
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 		if (s_blk != NULL){
 			off = 0;
@@ -1771,7 +1776,7 @@ time_start = GetTickCount();
 			Phmd5Process(&hash_ctx, buf, len);	// MD5 計算
 		}
 #ifdef TIMER
-time3_total += GetTickCount() - time_start;
+time3_total += clock() - time_start;
 #endif
 
 		// 経過表示
@@ -1812,10 +1817,16 @@ error_end:
 		_aligned_free(buf1);
 
 #ifdef TIMER
-time_total += GetTickCount() - time1_start;
-	printf("\nread  %d.%03d sec\n", time2_total / 1000, time2_total % 1000);
-	printf("main  %d.%03d sec\n", time3_total / 1000, time3_total % 1000);
-	printf("total %d.%03d sec\n", time_total / 1000, time_total % 1000);
+time_total += clock() - time1_start;
+	printf("\nread  %.3f sec\n", (double)time2_total / CLOCKS_PER_SEC);
+	printf("main  %.3f sec\n", (double)time3_total / CLOCKS_PER_SEC);
+	time_sec = (double)time_total / CLOCKS_PER_SEC;
+	if (time_sec > 0){
+		time_speed = (double)file_size / (time_sec * 1048576);
+	} else {
+		time_speed = 0;
+	}
+	printf("total %.3f sec, %.0f MB/s\n", time_sec, time_speed);
 #endif
 	return comp_num;
 }
diff --git a/source/par2j/par2_cmd.c b/source/par2j/par2_cmd.c
index df433bc..37fdf07 100644
--- a/source/par2j/par2_cmd.c
+++ b/source/par2j/par2_cmd.c
@@ -1,5 +1,5 @@
 ﻿// par2_cmd.c
-// Copyright : 2023-10-15 Yutaka Sawada
+// Copyright : 2023-12-09 Yutaka Sawada
 // License : GPL
 
 #ifndef _UNICODE
@@ -1479,14 +1479,12 @@ ri= switch_set & 0x00040000
 			} else if (wcsncmp(tmp_p, L"lc", 2) == 0){
 				k = 0;
 				j = 2;
-				while ((j < 2 + 5) && (tmp_p[j] >= '0') && (tmp_p[j] <= '9')){
+				while ((j < 2 + 7) && (tmp_p[j] >= '0') && (tmp_p[j] <= '9')){
 					k = (k * 10) + (tmp_p[j] - '0');
 					j++;
 				}
-				if (k & 256){	// GPU を使う
-					OpenCL_method = 1;	// Faster GPU
-				} else if (k & 512){
-					OpenCL_method = -1;	// Slower GPU
+				if (k & 0x300){	// GPU を使う
+					OpenCL_method = k & 0x003F0300;
 				}
 				if (k & 1024)	// CLMUL と ALTMAP を使わない
 					cpu_flag = (cpu_flag & 0xFFFFFFF7) | 256;
diff --git a/source/par2j/reedsolomon.c b/source/par2j/reedsolomon.c
index 0a91b69..75b15be 100644
--- a/source/par2j/reedsolomon.c
+++ b/source/par2j/reedsolomon.c
@@ -1,5 +1,5 @@
 ﻿// reedsolomon.c
-// Copyright : 2023-10-26 Yutaka Sawada
+// Copyright : 2023-12-12 Yutaka Sawada
 // License : GPL
 
 #ifndef _UNICODE
@@ -27,6 +27,9 @@
 #include "rs_decode.h"
 #include "reedsolomon.h"
 
+#ifdef TIMER
+#include <time.h>
+#endif
 
 // GPU を使う最小データサイズ (MB 単位)
 // GPU の起動には時間がかかるので、データが小さすぎると逆に遅くなる
@@ -739,7 +742,7 @@ int rs_encode(
 	int err = 0;
 	unsigned int len;
 #ifdef TIMER
-unsigned int time_total = GetTickCount();
+clock_t time_total = clock();
 #endif
 
 	if (galois_create_table()){
@@ -755,7 +758,7 @@ unsigned int time_total = GetTickCount();
 	// パリティ計算用の行列演算の準備をする
 	len = sizeof(unsigned short) * source_num;
 	if (OpenCL_method != 0)
-		len *= 2;	// GPU の作業領域も確保しておく
+		len *= 3;	// GPU の作業領域も確保しておく
 	constant = malloc(len);
 	if (constant == NULL){
 		printf("malloc, %d\n", len);
@@ -799,8 +802,8 @@ unsigned int time_total = GetTickCount();
 		err = encode_method2(file_path, header_buf, rcv_hFile, files, s_blk, p_blk, constant);
 #ifdef TIMER
 	if (err != 1){
-		time_total = GetTickCount() - time_total;
-		printf("total  %d.%03d sec\n", time_total / 1000, time_total % 1000);
+		time_total = clock() - time_total;
+		printf("total  %.3f sec\n", (double)time_total / CLOCKS_PER_SEC);
 	}
 #endif
 
@@ -830,7 +833,7 @@ int rs_encode_1pass(
 	int err = 0;
 	unsigned int len;
 #ifdef TIMER
-unsigned int time_total = GetTickCount();
+clock_t time_total = clock();
 #endif
 
 	if (galois_create_table()){
@@ -841,7 +844,7 @@ unsigned int time_total = GetTickCount();
 	// パリティ計算用の行列演算の準備をする
 	len = sizeof(unsigned short) * source_num;
 	if (OpenCL_method != 0)
-		len *= 2;	// GPU の作業領域も確保しておく
+		len *= 3;	// GPU の作業領域も確保しておく
 	constant = malloc(len);
 	if (constant == NULL){
 		printf("malloc, %d\n", len);
@@ -888,8 +891,8 @@ unsigned int time_total = GetTickCount();
 	if (err < 0){
 		printf("switching to 2-pass processing, %d\n", err);
 	} else if (err != 1){
-		time_total = GetTickCount() - time_total;
-		printf("total  %d.%03d sec\n", time_total / 1000, time_total % 1000);
+		time_total = clock() - time_total;
+		printf("total  %.3f sec\n", (double)time_total / CLOCKS_PER_SEC);
 	}
 #endif
 
@@ -913,7 +916,7 @@ int rs_decode(
 	int err = 0, i, j, k;
 	unsigned int len;
 #ifdef TIMER
-unsigned int time_matrix = 0, time_total = GetTickCount();
+clock_t time_matrix = 0, time_total = clock();
 #endif
 
 	if (galois_create_table()){
@@ -948,7 +951,7 @@ unsigned int time_matrix = 0, time_total = GetTickCount();
 	id = mat + (block_lost * source_num);
 
 #ifdef TIMER
-time_matrix = GetTickCount();
+time_matrix = clock();
 #endif
 	// 復元用の行列を計算する
 	print_progress_text(0, "Computing matrix");
@@ -989,7 +992,7 @@ time_matrix = GetTickCount();
 	//for (i = 0; i < block_lost; i++)
 	//	printf("id[%d] = %d\n", i, id[i]);
 #ifdef TIMER
-time_matrix = GetTickCount() - time_matrix;
+time_matrix = clock() - time_matrix;
 #endif
 
 #ifdef TIMER
@@ -1032,9 +1035,9 @@ time_matrix = GetTickCount() - time_matrix;
 		err = decode_method2(file_path, block_lost, rcv_hFile, files, s_blk, p_blk, mat);
 #ifdef TIMER
 	if (err != 1){
-		time_total = GetTickCount() - time_total;
-		printf("total  %d.%03d sec\n", time_total / 1000, time_total % 1000);
-		printf("matrix %d.%03d sec\n", time_matrix / 1000, time_matrix % 1000);
+		time_total = clock() - time_total;
+		printf("total  %.3f sec\n", (double)time_total / CLOCKS_PER_SEC);
+		printf("matrix %.3f sec\n", (double)time_matrix / CLOCKS_PER_SEC);
 	}
 #endif
 
diff --git a/source/par2j/rs_decode.c b/source/par2j/rs_decode.c
index cf41a88..59dbc18 100644
--- a/source/par2j/rs_decode.c
+++ b/source/par2j/rs_decode.c
@@ -1,5 +1,5 @@
 ﻿// rs_decode.c
-// Copyright : 2023-11-27 Yutaka Sawada
+// Copyright : 2023-12-13 Yutaka Sawada
 // License : GPL
 
 #ifndef _UNICODE
@@ -28,7 +28,9 @@
 
 
 #ifdef TIMER
-static unsigned int time_start, time_read = 0, time_write = 0, time_calc = 0;
+#include <time.h>
+static double time_sec, time_speed;
+static clock_t time_start, time_read = 0, time_write = 0, time_calc = 0;
 static unsigned int read_count, write_count = 0, skip_count;
 #endif
 
@@ -60,7 +62,7 @@ static DWORD WINAPI thread_decode2(LPVOID lpParameter)
 	RS_TH *th;
 #ifdef TIMER
 unsigned int loop_count2a = 0, loop_count2b = 0;
-unsigned int time_start2, time_encode2a = 0, time_encode2b = 0;
+clock_t time_start2, time_encode2a = 0, time_encode2b = 0;
 #endif
 
 	th = (RS_TH *)lpParameter;
@@ -78,7 +80,7 @@ unsigned int time_start2, time_encode2a = 0, time_encode2b = 0;
 	WaitForSingleObject(hRun, INFINITE);	// 計算開始の合図を待つ
 	while (th->now < INT_MAX / 2){
 #ifdef TIMER
-time_start2 = GetTickCount();
+time_start2 = clock();
 #endif
 		s_buf = th->buf;
 		factor = th->mat;
@@ -95,7 +97,7 @@ loop_count2a++;
 #endif
 			}
 #ifdef TIMER
-time_encode2a += GetTickCount() - time_start2;
+time_encode2a += clock() - time_start2;
 #endif
 		} else {	// 消失ブロックを部分的に保持する場合
 			// スレッドごとに復元する消失ブロックの chunk を変える
@@ -136,7 +138,7 @@ loop_count2b += src_num;
 #endif
 			}
 #ifdef TIMER
-time_encode2b += GetTickCount() - time_start2;
+time_encode2b += clock() - time_start2;
 #endif
 		}
 		//_mm_sfence();	// メモリーへの書き込みを完了する
@@ -146,19 +148,21 @@ time_encode2b += GetTickCount() - time_start2;
 #ifdef TIMER
 loop_count2b /= chunk_num;	// chunk数で割ってブロック数にする
 printf("sub-thread : total loop = %d\n", loop_count2a + loop_count2b);
-if (time_encode2a > 0){
-	i = (int)((__int64)loop_count2a * unit_size * 125 / ((__int64)time_encode2a * 131072));
+time_sec = (double)time_encode2a / CLOCKS_PER_SEC;
+if (time_sec > 0){
+	time_speed = ((double)loop_count2a * unit_size) / (time_sec * 1048576);
 } else {
-	i = 0;
+	time_speed = 0;
 }
 if (loop_count2a > 0)
-	printf(" 1st decode %d.%03d sec, %d loop, %d MB/s\n", time_encode2a / 1000, time_encode2a % 1000, loop_count2a, i);
-if (time_encode2b > 0){
-	i = (int)((__int64)loop_count2b * unit_size * 125 / ((__int64)time_encode2b * 131072));
+	printf(" 1st decode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2a, time_speed);
+time_sec = (double)time_encode2b / CLOCKS_PER_SEC;
+if (time_sec > 0){
+	time_speed = ((double)loop_count2b * unit_size) / (time_sec * 1048576);
 } else {
-	i = 0;
+	time_speed = 0;
 }
-printf(" 2nd decode %d.%03d sec, %d loop, %d MB/s\n", time_encode2b / 1000, time_encode2b % 1000, loop_count2b, i);
+printf(" 2nd decode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2b, time_speed);
 #endif
 
 	// 終了処理
@@ -178,7 +182,7 @@ static DWORD WINAPI thread_decode3(LPVOID lpParameter)
 	RS_TH *th;
 #ifdef TIMER
 unsigned int loop_count2a = 0, loop_count2b = 0;
-unsigned int time_start2, time_encode2a = 0, time_encode2b = 0;
+clock_t time_start2, time_encode2a = 0, time_encode2b = 0;
 #endif
 
 	th = (RS_TH *)lpParameter;
@@ -197,7 +201,7 @@ unsigned int time_start2, time_encode2a = 0, time_encode2b = 0;
 	WaitForSingleObject(hRun, INFINITE);	// 計算開始の合図を待つ
 	while (th->now < INT_MAX / 2){
 #ifdef TIMER
-time_start2 = GetTickCount();
+time_start2 = clock();
 #endif
 		s_buf = th->buf;
 		factor = th->mat;
@@ -214,7 +218,7 @@ loop_count2a++;
 #endif
 			}
 #ifdef TIMER
-time_encode2a += GetTickCount() - time_start2;
+time_encode2a += clock() - time_start2;
 #endif
 		} else {	// 全ての消失ブロックを保持する場合
 			// スレッドごとに復元する消失ブロックの chunk を変える
@@ -250,7 +254,7 @@ loop_count2b += src_num;
 #endif
 			}
 #ifdef TIMER
-time_encode2b += GetTickCount() - time_start2;
+time_encode2b += clock() - time_start2;
 #endif
 		}
 		//_mm_sfence();	// メモリーへの書き込みを完了する
@@ -260,19 +264,21 @@ time_encode2b += GetTickCount() - time_start2;
 #ifdef TIMER
 loop_count2b /= chunk_num;	// chunk数で割ってブロック数にする
 printf("sub-thread : total loop = %d\n", loop_count2a + loop_count2b);
-if (time_encode2a > 0){
-	i = (int)((__int64)loop_count2a * unit_size * 125 / ((__int64)time_encode2a * 131072));
+time_sec = (double)time_encode2a / CLOCKS_PER_SEC;
+if (time_sec > 0){
+	time_speed = ((double)loop_count2a * unit_size) / (time_sec * 1048576);
 } else {
-	i = 0;
+	time_speed = 0;
 }
 if (loop_count2a > 0)
-	printf(" 1st decode %d.%03d sec, %d loop, %d MB/s\n", time_encode2a / 1000, time_encode2a % 1000, loop_count2a, i);
-if (time_encode2b > 0){
-	i = (int)((__int64)loop_count2b * unit_size * 125 / ((__int64)time_encode2b * 131072));
+	printf(" 1st decode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2a, time_speed);
+time_sec = (double)time_encode2b / CLOCKS_PER_SEC;
+if (time_sec > 0){
+	time_speed = ((double)loop_count2b * unit_size) / (time_sec * 1048576);
 } else {
-	i = 0;
+	time_speed = 0;
 }
-printf(" 2nd decode %d.%03d sec, %d loop, %d MB/s\n", time_encode2b / 1000, time_encode2b % 1000, loop_count2b, i);
+printf(" 2nd decode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2b, time_speed);
 #endif
 
 	// 終了処理
@@ -292,7 +298,8 @@ static DWORD WINAPI thread_decode_gpu(LPVOID lpParameter)
 	HANDLE hRun, hEnd;
 	RS_TH *th;
 #ifdef TIMER
-unsigned int time_start2, time_encode2 = 0, loop_count2 = 0;
+unsigned int loop_count2 = 0;
+clock_t time_start2, time_encode2 = 0;
 #endif
 
 	th = (RS_TH *)lpParameter;
@@ -307,7 +314,7 @@ unsigned int time_start2, time_encode2 = 0, loop_count2 = 0;
 	WaitForSingleObject(hRun, INFINITE);	// 計算開始の合図を待つ
 	while (th->now < INT_MAX / 2){
 #ifdef TIMER
-time_start2 = GetTickCount();
+time_start2 = clock();
 #endif
 		// GPUはソース・ブロック読み込み中に呼ばれない
 		s_buf = th->buf;
@@ -321,22 +328,58 @@ time_start2 = GetTickCount();
 			InterlockedExchange(&(th->now), INT_MAX / 3);	// サブ・スレッドの計算を中断する
 		}
 
-		// スレッドごとに復元する消失ブロックを変える
-		while ((j = InterlockedIncrement(&(th->now))) < block_lost){	// j = ++th_now
-			// 倍率は逆行列から部分的にコピーする
-			i = gpu_multiply_blocks(src_num, factor + source_num * j, g_buf + (size_t)unit_size * j, unit_size);
-			if (i != 0){
-				th->len = i;
-				InterlockedExchange(&(th->now), INT_MAX / 3);	// サブ・スレッドの計算を中断する
-				break;
-			}
+		// 一つの GPUスレッドが全ての消失ブロックを処理する
+		if (OpenCL_method & 8){	// ２ブロックずつ計算する
+			// 消失ブロック数が奇数なら、最初の一個だけ別に計算する
+			if (block_lost & 1){
+				InterlockedIncrement(&(th->now));	// 常に j = 0 となる
+
+				// 倍率は逆行列から部分的にコピーする
+				i = gpu_multiply_blocks(src_num, factor, NULL, g_buf, unit_size);
+				if (i != 0){
+					th->len = i;
+					InterlockedExchange(&(th->now), INT_MAX / 3);	// サブ・スレッドの計算を中断する
+					break;
+				}
 
 #ifdef TIMER
 loop_count2 += src_num;
 #endif
+			}
+
+			// 残りのブロックは二個ずつ計算する
+			while ((j = InterlockedAdd(&(th->now), 2)) < block_lost){	// th_now += 2, j = th_now
+				j--;	// +2 してるから、最初のブロックは -1 する
+
+				// 倍率は逆行列から部分的に２回コピーする
+				i = gpu_multiply_blocks(src_num, factor + source_num * j, factor + source_num * (j + 1), g_buf + (size_t)unit_size * j, unit_size * 2);
+				if (i != 0){
+					th->len = i;
+					InterlockedExchange(&(th->now), INT_MAX / 3);	// サブ・スレッドの計算を中断する
+					break;
+				}
+#ifdef TIMER
+loop_count2 += src_num * 2;
+#endif
+			}
+
+		} else {	// 以前からの１ブロックずつ計算する方式
+			while ((j = InterlockedIncrement(&(th->now))) < block_lost){	// j = ++th_now
+				// 倍率は逆行列から部分的にコピーする（２ブロックずつの場合はブロック数をマイナスにする）
+				i = gpu_multiply_blocks(src_num, factor + source_num * j, NULL, g_buf + (size_t)unit_size * j, unit_size);
+				if (i != 0){
+					th->len = i;
+					InterlockedExchange(&(th->now), INT_MAX / 3);	// サブ・スレッドの計算を中断する
+					break;
+				}
+
+#ifdef TIMER
+loop_count2 += src_num;
+#endif
+			}
 		}
 #ifdef TIMER
-time_encode2 += GetTickCount() - time_start2;
+time_encode2 += clock() - time_start2;
 #endif
 		// 最後にVRAMを解放する
 		i = gpu_finish();
@@ -349,12 +392,13 @@ time_encode2 += GetTickCount() - time_start2;
 	}
 #ifdef TIMER
 printf("gpu-thread :\n");
-if (time_encode2 > 0){
-	i = (int)((__int64)loop_count2 * unit_size * 125 / ((__int64)time_encode2 * 131072));
+time_sec = (double)time_encode2 / CLOCKS_PER_SEC;
+if (time_sec > 0){
+	time_speed = ((double)loop_count2 * unit_size) / (time_sec * 1048576);
 } else {
-	i = 0;
+	time_speed = 0;
 }
-printf(" 2nd decode %d.%03d sec, %d loop, %d MB/s\n", time_encode2 / 1000, time_encode2 % 1000, loop_count2, i);
+printf(" 2nd decode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2, time_speed);
 #endif
 
 	// 終了処理
@@ -430,7 +474,7 @@ int decode_method1(	// ソース・ブロックが一個だけの場合
 	block_off = 0;
 	while (block_off < block_size){
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 		// パリティ・ブロックを読み込む
 		len = block_size - block_off;
@@ -447,18 +491,18 @@ time_start = GetTickCount();
 		// パリティ・ブロックのチェックサムを計算する
 		checksum16_altmap(buf, buf + io_size, io_size);
 #ifdef TIMER
-time_read += GetTickCount() - time_start;
+time_read += clock() - time_start;
 #endif
 
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 		// 失われたソース・ブロックを復元する
 		memset(work_buf, 0, unit_size);
 		// factor で割ると元に戻る
 		galois_align_multiply(buf, work_buf, unit_size, galois_divide(1, galois_power(2, id)));
 #ifdef TIMER
-time_calc += GetTickCount() - time_start;
+time_calc += clock() - time_start;
 #endif
 
 		// 経過表示
@@ -472,7 +516,7 @@ time_calc += GetTickCount() - time_start;
 		}
 
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 		// 復元されたソース・ブロックのチェックサムを検証する
 		checksum16_return(work_buf, hash, io_size);
@@ -491,7 +535,7 @@ time_start = GetTickCount();
 			goto error_end;
 		}
 #ifdef TIMER
-time_write += GetTickCount() - time_start;
+time_write += clock() - time_start;
 #endif
 
 		block_off += io_size;
@@ -499,9 +543,9 @@ time_write += GetTickCount() - time_start;
 	print_progress_done();	// 末尾ブロックの断片化によっては 100% で完了するとは限らない
 
 #ifdef TIMER
-printf("read   %d.%03d sec\n", time_read / 1000, time_read % 1000);
-printf("write  %d.%03d sec\n", time_write / 1000, time_write % 1000);
-printf("decode %d.%03d sec\n", time_calc / 1000, time_calc % 1000);
+printf("read   %.3f sec\n", (double)time_read / CLOCKS_PER_SEC);
+printf("write  %.3f sec\n", (double)time_write / CLOCKS_PER_SEC);
+printf("decode %.3f sec\n", (double)time_calc / CLOCKS_PER_SEC);
 #endif
 
 error_end:
@@ -623,7 +667,7 @@ int decode_method2(	// ソース・データを全て読み込む場合
 #ifdef TIMER
 read_count = 0;
 skip_count = 0;
-time_start = GetTickCount();
+time_start = clock();
 #endif
 		last_file = -1;
 		recv_now = 0;	// 何番目の代替ブロックか
@@ -760,7 +804,7 @@ skip_count++;
 			hFile = NULL;
 		}
 #ifdef TIMER
-time_read += GetTickCount() - time_start;
+time_read += clock() - time_start;
 #endif
 
 		WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE);	// サブ・スレッドの計算終了の合図を待つ
@@ -845,7 +889,7 @@ skip_count++;
 			}
 
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 			// 復元されたブロックを書き込む
 			work_buf = p_buf;
@@ -916,7 +960,7 @@ write_count++;
 				}
 			}
 #ifdef TIMER
-time_write += GetTickCount() - time_start;
+time_write += clock() - time_start;
 #endif
 
 			part_off += part_num;	// 次の消失ブロック位置にする
@@ -930,9 +974,9 @@ time_write += GetTickCount() - time_start;
 	print_progress_done();
 
 #ifdef TIMER
-printf("read   %d.%03d sec\n", time_read / 1000, time_read % 1000);
+printf("read   %.3f sec\n", (double)time_read / CLOCKS_PER_SEC);
 j = ((block_size + io_size - 1) / io_size) * block_lost;
-printf("write  %d.%03d sec, count = %d/%d\n", time_write / 1000, time_write % 1000, write_count, j);
+printf("write  %.3f sec, count = %d/%d\n", (double)time_write / CLOCKS_PER_SEC, write_count, j);
 if (prog_num != prog_base)
 	printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base);
 #endif
@@ -1063,7 +1107,7 @@ int decode_method3(	// 復元するブロックを全て保持できる場合
 
 #ifdef TIMER
 read_count = 0;
-time_start = GetTickCount();
+time_start = clock();
 #endif
 		last_file = -1;
 		for (i = 0; i < read_num; i++){	// スライスを一個ずつ読み込んでメモリー上に配置していく
@@ -1173,7 +1217,7 @@ read_count++;
 			hFile = NULL;
 		}
 #ifdef TIMER
-time_read += GetTickCount() - time_start;
+time_read += clock() - time_start;
 #endif
 
 		WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE);	// サブ・スレッドの計算終了の合図を待つ
@@ -1238,7 +1282,7 @@ time_read += GetTickCount() - time_start;
 	}
 
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 	// 復元されたブロックを書き込む
 	work_buf = p_buf;
@@ -1297,7 +1341,7 @@ time_start = GetTickCount();
 		}
 	}
 #ifdef TIMER
-time_write += GetTickCount() - time_start;
+time_write += clock() - time_start;
 #endif
 	// 最後の書き込みファイルを閉じる
 	CloseHandle(hFile);
@@ -1305,8 +1349,8 @@ time_write += GetTickCount() - time_start;
 	print_progress_done();
 
 #ifdef TIMER
-printf("read   %d.%03d sec\n", time_read / 1000, time_read % 1000);
-printf("write  %d.%03d sec\n", time_write / 1000, time_write % 1000);
+printf("read   %.3f sec\n", (double)time_read / CLOCKS_PER_SEC);
+printf("write  %.3f sec\n", (double)time_write / CLOCKS_PER_SEC);
 if (prog_num != prog_base)
 	printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base);
 #endif
@@ -1463,7 +1507,7 @@ int decode_method4(	// 全てのブロックを断片的に保持する場合 (G
 #ifdef TIMER
 read_count = 0;
 skip_count = 0;
-time_start = GetTickCount();
+time_start = clock();
 #endif
 		last_file = -1;
 		recv_now = 0;	// 何番目の代替ブロックか
@@ -1600,7 +1644,7 @@ skip_count++;
 			hFile = NULL;
 		}
 #ifdef TIMER
-time_read += GetTickCount() - time_start;
+time_read += clock() - time_start;
 #endif
 
 		memset(g_buf, 0, (size_t)unit_size * block_lost);	// 待機中に GPU用の領域をゼロ埋めしておく
@@ -1845,7 +1889,7 @@ skip_count++;
 			prog_num += th->size * block_lost;
 
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 		// 復元されたブロックを書き込む
 		work_buf = p_buf;
@@ -1918,7 +1962,7 @@ write_count++;
 			}
 		}
 #ifdef TIMER
-time_write += GetTickCount() - time_start;
+time_write += clock() - time_start;
 #endif
 
 		block_off += io_size;
@@ -1929,9 +1973,9 @@ time_write += GetTickCount() - time_start;
 	print_progress_done();
 
 #ifdef TIMER
-printf("read   %d.%03d sec\n", time_read / 1000, time_read % 1000);
+printf("read   %.3f sec\n", (double)time_read / CLOCKS_PER_SEC);
 j = ((block_size + io_size - 1) / io_size) * block_lost;
-printf("write  %d.%03d sec, count = %d/%d\n", time_write / 1000, time_write % 1000, write_count, j);
+printf("write  %.3f sec, count = %d/%d\n", (double)time_write / CLOCKS_PER_SEC, write_count, j);
 if (prog_num != prog_base)
 	printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base);
 #endif
@@ -2096,7 +2140,7 @@ int decode_method5(	// 復元するブロックだけ保持する場合 (GPU対
 
 #ifdef TIMER
 read_count = 0;
-time_start = GetTickCount();
+time_start = clock();
 #endif
 		last_file = -1;
 		for (i = 0; i < read_num; i++){	// スライスを一個ずつ読み込んでメモリー上に配置していく
@@ -2206,7 +2250,7 @@ read_count++;
 			hFile = NULL;
 		}
 #ifdef TIMER
-time_read += GetTickCount() - time_start;
+time_read += clock() - time_start;
 #endif
 
 		if (source_off == 0)
@@ -2446,7 +2490,7 @@ time_read += GetTickCount() - time_start;
 	}
 
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 	// 復元されたブロックを書き込む
 	work_buf = p_buf;
@@ -2507,7 +2551,7 @@ time_start = GetTickCount();
 		}
 	}
 #ifdef TIMER
-time_write += GetTickCount() - time_start;
+time_write += clock() - time_start;
 #endif
 	// 最後の書き込みファイルを閉じる
 	CloseHandle(hFile);
@@ -2515,8 +2559,8 @@ time_write += GetTickCount() - time_start;
 	print_progress_done();
 
 #ifdef TIMER
-printf("read   %d.%03d sec\n", time_read / 1000, time_read % 1000);
-printf("write  %d.%03d sec\n", time_write / 1000, time_write % 1000);
+printf("read   %.3f sec\n", (double)time_read / CLOCKS_PER_SEC);
+printf("write  %.3f sec\n", (double)time_write / CLOCKS_PER_SEC);
 if (prog_num != prog_base)
 	printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base);
 #endif
diff --git a/source/par2j/rs_encode.c b/source/par2j/rs_encode.c
index 969a4a0..63acf9c 100644
--- a/source/par2j/rs_encode.c
+++ b/source/par2j/rs_encode.c
@@ -1,5 +1,5 @@
 ﻿// rs_encode.c
-// Copyright : 2023-11-25 Yutaka Sawada
+// Copyright : 2023-12-18 Yutaka Sawada
 // License : GPL
 
 #ifndef _UNICODE
@@ -29,7 +29,9 @@
 
 
 #ifdef TIMER
-static unsigned int time_start, time_read = 0, time_write = 0, time_calc = 0;
+#include <time.h>
+static double time_sec, time_speed;
+static clock_t time_start, time_read = 0, time_write = 0, time_calc = 0;
 static unsigned int read_count, skip_count;
 #endif
 
@@ -61,7 +63,7 @@ static DWORD WINAPI thread_encode2(LPVOID lpParameter)
 	RS_TH *th;
 #ifdef TIMER
 unsigned int loop_count2a = 0, loop_count2b = 0;
-unsigned int time_start2, time_encode2a = 0, time_encode2b = 0;
+clock_t time_start2, time_encode2a = 0, time_encode2b = 0;
 #endif
 
 	th = (RS_TH *)lpParameter;
@@ -80,7 +82,7 @@ unsigned int time_start2, time_encode2a = 0, time_encode2b = 0;
 	WaitForSingleObject(hRun, INFINITE);	// 計算開始の合図を待つ
 	while (th->now < INT_MAX / 2){
 #ifdef TIMER
-time_start2 = GetTickCount();
+time_start2 = clock();
 #endif
 		s_buf = th->buf;
 		src_off = th->off;	// ソース・ブロック番号
@@ -98,7 +100,7 @@ loop_count2a++;
 			}
 
 #ifdef TIMER
-time_encode2a += GetTickCount() - time_start2;
+time_encode2a += clock() - time_start2;
 #endif
 		} else {	// パリティ・ブロックを部分的に保持する場合
 			// スレッドごとに作成するパリティ・ブロックの chunk を変える
@@ -143,7 +145,7 @@ loop_count2b += src_num;
 #endif
 			}
 #ifdef TIMER
-time_encode2b += GetTickCount() - time_start2;
+time_encode2b += clock() - time_start2;
 #endif
 		}
 		//_mm_sfence();	// メモリーへの書き込みを完了する
@@ -153,19 +155,21 @@ time_encode2b += GetTickCount() - time_start2;
 #ifdef TIMER
 loop_count2b /= chunk_num;	// chunk数で割ってブロック数にする
 printf("sub-thread : total loop = %d\n", loop_count2a + loop_count2b);
-if (time_encode2a > 0){
-	i = (int)((__int64)loop_count2a * unit_size * 125 / ((__int64)time_encode2a * 131072));
+time_sec = (double)time_encode2a / CLOCKS_PER_SEC;
+if (time_sec > 0){
+	time_speed = ((double)loop_count2a * unit_size) / (time_sec * 1048576);
 } else {
-	i = 0;
+	time_speed = 0;
 }
 if (loop_count2a > 0)
-	printf(" 1st encode %d.%03d sec, %d loop, %d MB/s\n", time_encode2a / 1000, time_encode2a % 1000, loop_count2a, i);
-if (time_encode2b > 0){
-	i = (int)((__int64)loop_count2b * unit_size * 125 / ((__int64)time_encode2b * 131072));
+	printf(" 1st encode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2a, time_speed);
+time_sec = (double)time_encode2b / CLOCKS_PER_SEC;
+if (time_sec > 0){
+	time_speed = ((double)loop_count2b * unit_size) / (time_sec * 1048576);
 } else {
-	i = 0;
+	time_speed = 0;
 }
-printf(" 2nd encode %d.%03d sec, %d loop, %d MB/s\n", time_encode2b / 1000, time_encode2b % 1000, loop_count2b, i);
+printf(" 2nd encode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2b, time_speed);
 #endif
 
 	// 終了処理
@@ -185,7 +189,7 @@ static DWORD WINAPI thread_encode3(LPVOID lpParameter)
 	RS_TH *th;
 #ifdef TIMER
 unsigned int loop_count2a = 0, loop_count2b = 0;
-unsigned int time_start2, time_encode2a = 0, time_encode2b = 0;
+clock_t time_start2, time_encode2a = 0, time_encode2b = 0;
 #endif
 
 	th = (RS_TH *)lpParameter;
@@ -204,7 +208,7 @@ unsigned int time_start2, time_encode2a = 0, time_encode2b = 0;
 	WaitForSingleObject(hRun, INFINITE);	// 計算開始の合図を待つ
 	while (th->now < INT_MAX / 2){
 #ifdef TIMER
-time_start2 = GetTickCount();
+time_start2 = clock();
 #endif
 		s_buf = th->buf;
 		src_off = th->off;	// ソース・ブロック番号
@@ -221,7 +225,7 @@ loop_count2a++;
 #endif
 			}
 #ifdef TIMER
-time_encode2a += GetTickCount() - time_start2;
+time_encode2a += clock() - time_start2;
 #endif
 		} else {	// 全てのパリティ・ブロックを保持する場合
 			// スレッドごとに作成するパリティ・ブロックの chunk を変える
@@ -261,7 +265,7 @@ loop_count2b += src_num;
 #endif
 			}
 #ifdef TIMER
-time_encode2b += GetTickCount() - time_start2;
+time_encode2b += clock() - time_start2;
 #endif
 		}
 		//_mm_sfence();	// メモリーへの書き込みを完了する
@@ -271,19 +275,21 @@ time_encode2b += GetTickCount() - time_start2;
 #ifdef TIMER
 loop_count2b /= chunk_num;	// chunk数で割ってブロック数にする
 printf("sub-thread : total loop = %d\n", loop_count2a + loop_count2b);
-if (time_encode2a > 0){
-	i = (int)((__int64)loop_count2a * unit_size * 125 / ((__int64)time_encode2a * 131072));
+time_sec = (double)time_encode2a / CLOCKS_PER_SEC;
+if (time_sec > 0){
+	time_speed = ((double)loop_count2a * unit_size) / (time_sec * 1048576);
 } else {
-	i = 0;
+	time_speed = 0;
 }
 if (loop_count2a > 0)
-	printf(" 1st encode %d.%03d sec, %d loop, %d MB/s\n", time_encode2a / 1000, time_encode2a % 1000, loop_count2a, i);
-if (time_encode2b > 0){
-	i = (int)((__int64)loop_count2b * unit_size * 125 / ((__int64)time_encode2b * 131072));
+	printf(" 1st encode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2a, time_speed);
+time_sec = (double)time_encode2b / CLOCKS_PER_SEC;
+if (time_sec > 0){
+	time_speed = ((double)loop_count2b * unit_size) / (time_sec * 1048576);
 } else {
-	i = 0;
+	time_speed = 0;
 }
-printf(" 2nd encode %d.%03d sec, %d loop, %d MB/s\n", time_encode2b / 1000, time_encode2b % 1000, loop_count2b, i);
+printf(" 2nd encode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2b, time_speed);
 #endif
 
 	// 終了処理
@@ -303,7 +309,8 @@ static DWORD WINAPI thread_encode_gpu(LPVOID lpParameter)
 	HANDLE hRun, hEnd;
 	RS_TH *th;
 #ifdef TIMER
-unsigned int time_start2, time_encode2 = 0, loop_count2 = 0;
+unsigned int loop_count2 = 0;
+clock_t time_start2, time_encode2 = 0;
 #endif
 
 	th = (RS_TH *)lpParameter;
@@ -320,7 +327,7 @@ unsigned int time_start2, time_encode2 = 0, loop_count2 = 0;
 	WaitForSingleObject(hRun, INFINITE);	// 計算開始の合図を待つ
 	while (th->now < INT_MAX / 2){
 #ifdef TIMER
-time_start2 = GetTickCount();
+time_start2 = clock();
 #endif
 		// GPUはソース・ブロック読み込み中に呼ばれない
 		s_buf = th->buf;
@@ -335,24 +342,71 @@ time_start2 = GetTickCount();
 		}
 
 		// 一つの GPUスレッドが全てのパリティ・ブロックを処理する
-		while ((j = InterlockedIncrement(&(th->now))) < parity_num){	// j = ++th_now
-			// factor は定数行列の乗数になる
-			for (i = 0; i < src_num; i++)
-				factor[i] = galois_power(constant[src_off + i], first_num + j);
+		if (OpenCL_method & 8){	// ２ブロックずつ計算する
+			// パリティ・ブロック数が奇数なら、最初の一個だけ別に計算する
+			if (parity_num & 1){
+				InterlockedIncrement(&(th->now));	// 常に j = 0 となる
 
-			// VRAM上のソース・ブロックごとにパリティを追加していく
-			i = gpu_multiply_blocks(src_num, factor, g_buf + (size_t)unit_size * j, unit_size);
-			if (i != 0){
-				th->len = i;
-				InterlockedExchange(&(th->now), INT_MAX / 3);	// サブ・スレッドの計算を中断する
-				break;
-			}
+				// factor は定数行列の乗数になる
+				for (i = 0; i < src_num; i++)
+					factor[i] = galois_power(constant[src_off + i], first_num);
+
+				// VRAM上のソース・ブロックごとにパリティを追加していく
+				i = gpu_multiply_blocks(src_num, factor, NULL, g_buf, unit_size);
+				if (i != 0){
+					th->len = i;
+					InterlockedExchange(&(th->now), INT_MAX / 3);	// サブ・スレッドの計算を中断する
+					break;
+				}
 #ifdef TIMER
 loop_count2 += src_num;
 #endif
-		}
+			}
+
+			// 残りのブロックは二個ずつ計算する
+			while ((j = InterlockedAdd(&(th->now), 2)) < parity_num){	// th_now += 2, j = th_now
+				j--;	// +2 してるから、最初のブロックは -1 する
+
+				// factor は定数行列の乗数になる
+				for (i = 0; i < src_num; i++){
+					int c = constant[src_off + i];	// 同じ定数だけど、何乗するかが異なる
+					factor[i] = galois_power(c, first_num + j);
+					factor[src_num + i] = galois_power(c, first_num + j + 1);
+				}
+
+				// VRAM上のソース・ブロックごとにパリティを追加していく
+				i = gpu_multiply_blocks(src_num, factor, (void *)1, g_buf + (size_t)unit_size * j, unit_size * 2);
+				if (i != 0){
+					th->len = i;
+					InterlockedExchange(&(th->now), INT_MAX / 3);	// サブ・スレッドの計算を中断する
+					break;
+				}
 #ifdef TIMER
-time_encode2 += GetTickCount() - time_start2;
+loop_count2 += src_num * 2;
+#endif
+			}
+
+		} else {	// 以前からの１ブロックずつ計算する方式
+			while ((j = InterlockedIncrement(&(th->now))) < parity_num){	// j = ++th_now
+				// factor は定数行列の乗数になる
+				for (i = 0; i < src_num; i++)
+					factor[i] = galois_power(constant[src_off + i], first_num + j);
+
+				// VRAM上のソース・ブロックごとにパリティを追加していく
+				i = gpu_multiply_blocks(src_num, factor, NULL, g_buf + (size_t)unit_size * j, unit_size);
+				if (i != 0){
+					th->len = i;
+					InterlockedExchange(&(th->now), INT_MAX / 3);	// サブ・スレッドの計算を中断する
+					break;
+				}
+#ifdef TIMER
+loop_count2 += src_num;
+#endif
+			}
+		}
+
+#ifdef TIMER
+time_encode2 += clock() - time_start2;
 #endif
 		// 最後にVRAMを解放する
 		i = gpu_finish();
@@ -365,12 +419,13 @@ time_encode2 += GetTickCount() - time_start2;
 	}
 #ifdef TIMER
 printf("gpu-thread :\n");
-if (time_encode2 > 0){
-	i = (int)((__int64)loop_count2 * unit_size * 125 / ((__int64)time_encode2 * 131072));
+time_sec = (double)time_encode2 / CLOCKS_PER_SEC;
+if (time_sec > 0){
+	time_speed = ((double)loop_count2 * unit_size) / (time_sec * 1048576);
 } else {
-	i = 0;
+	time_speed = 0;
 }
-printf(" 2nd encode %d.%03d sec, %d loop, %d MB/s\n", time_encode2 / 1000, time_encode2 % 1000, loop_count2, i);
+printf(" 2nd encode %.3f sec, %d loop, %.0f MB/s\n", time_sec, loop_count2, time_speed);
 #endif
 
 	// 終了処理
@@ -452,7 +507,7 @@ int encode_method1(	// ソース・ブロックが一個だけの場合
 	block_off = 0;
 	while (block_off < block_size){
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 		// ソース・ブロックを読み込む
 		len = s_blk[0].size - block_off;
@@ -469,7 +524,7 @@ time_start = GetTickCount();
 		s_blk[0].crc = crc_update(s_blk[0].crc, buf, len);	// without pad
 		checksum16_altmap(buf, buf + io_size, io_size);
 #ifdef TIMER
-time_read += GetTickCount() - time_start;
+time_read += clock() - time_start;
 #endif
 
 		// リカバリ・ファイルに書き込むサイズ
@@ -482,13 +537,13 @@ time_read += GetTickCount() - time_start;
 		// パリティ・ブロックごとに
 		for (i = 0; i < parity_num; i++){
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 			memset(work_buf, 0, unit_size);
 			// factor は 2の乗数になる
 			galois_align_multiply(buf, work_buf, unit_size, galois_power(2, first_num + i));
 #ifdef TIMER
-time_calc += GetTickCount() - time_start;
+time_calc += clock() - time_start;
 #endif
 
 			// 経過表示
@@ -502,7 +557,7 @@ time_calc += GetTickCount() - time_start;
 			}
 
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 			// パリティ・ブロックのチェックサムを検証する
 			checksum16_return(work_buf, hash, io_size);
@@ -535,7 +590,7 @@ time_start = GetTickCount();
 				goto error_end;
 			}
 #ifdef TIMER
-time_write += GetTickCount() - time_start;
+time_write += clock() - time_start;
 #endif
 		}
 
@@ -565,7 +620,7 @@ time_write += GetTickCount() - time_start;
 		}
 
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 		// 最後に Recovery Slice packet のヘッダーを書き込む
 		for (i = 0; i < parity_num; i++){
@@ -581,14 +636,14 @@ time_start = GetTickCount();
 			}
 		}
 #ifdef TIMER
-time_write += GetTickCount() - time_start;
+time_write += clock() - time_start;
 #endif
 	}
 
 #ifdef TIMER
-printf("read   %d.%03d sec\n", time_read / 1000, time_read % 1000);
-printf("write  %d.%03d sec\n", time_write / 1000, time_write % 1000);
-printf("encode %d.%03d sec\n", time_calc / 1000, time_calc % 1000);
+printf("read   %.3f sec\n", (double)time_read / CLOCKS_PER_SEC);
+printf("write  %.3f sec\n", (double)time_write / CLOCKS_PER_SEC);
+printf("encode %.3f sec\n", (double)time_calc / CLOCKS_PER_SEC);
 #endif
 
 error_end:
@@ -729,7 +784,7 @@ int encode_method2(	// ソース・データを全て読み込む場合
 #ifdef TIMER
 read_count = 0;
 skip_count = 0;
-time_start = GetTickCount();
+time_start = clock();
 #endif
 		last_file = -1;
 		for (i = 0; i < source_num; i++){
@@ -830,7 +885,7 @@ skip_count++;
 		CloseHandle(hFile);
 		hFile = NULL;
 #ifdef TIMER
-time_read += GetTickCount() - time_start;
+time_read += clock() - time_start;
 #endif
 
 		WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE);	// サブ・スレッドの計算終了の合図を待つ
@@ -930,7 +985,7 @@ skip_count++;
 			}
 
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 			// パリティ・ブロックを書き込む
 			work_buf = p_buf;
@@ -979,7 +1034,7 @@ time_start = GetTickCount();
 				}
 			}
 #ifdef TIMER
-time_write += GetTickCount() - time_start;
+time_write += clock() - time_start;
 #endif
 
 			part_off += part_num;	// 次のパリティ位置にする
@@ -1025,7 +1080,7 @@ time_write += GetTickCount() - time_start;
 		}
 
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 		// 最後に Recovery Slice packet のヘッダーを書き込む
 		for (i = 0; i < parity_num; i++){
@@ -1041,13 +1096,13 @@ time_start = GetTickCount();
 			}
 		}
 #ifdef TIMER
-time_write += GetTickCount() - time_start;
+time_write += clock() - time_start;
 #endif
 	}
 
 #ifdef TIMER
-printf("read   %d.%03d sec\n", time_read / 1000, time_read % 1000);
-printf("write  %d.%03d sec\n", time_write / 1000, time_write % 1000);
+printf("read   %.3f sec\n", (double)time_read / CLOCKS_PER_SEC);
+printf("write  %.3f sec\n", (double)time_write / CLOCKS_PER_SEC);
 if (prog_num != prog_base)
 	printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base);
 #endif
@@ -1186,7 +1241,7 @@ int encode_method3(	// パリティ・ブロックを全て保持して、一度
 		src_off = source_off - 1;	// まだ計算して無い印
 
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 		for (i = 0; i < read_num; i++){	// スライスを一個ずつ読み込んでメモリー上に配置していく
 			// ソース・ブロックを読み込む
@@ -1318,7 +1373,7 @@ time_start = GetTickCount();
 			memcpy(common_buf + packet_off + 16, file_md_ctx.hash, 16);
 		}
 #ifdef TIMER
-time_read += GetTickCount() - time_start;
+time_read += clock() - time_start;
 #endif
 
 		WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE);	// サブ・スレッドの計算終了の合図を待つ
@@ -1393,19 +1448,19 @@ time_read += GetTickCount() - time_start;
 	}
 
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 	memcpy(common_buf + common_size, common_buf, common_size);	// 後の半分に前半のをコピーする
 	// 最後にパリティ・ブロックのチェックサムを検証して、リカバリ・ファイルに書き込む
 	err = create_recovery_file_1pass(file_path, recovery_path, packet_limit, block_distri,
 			packet_num, common_buf, common_size, footer_buf, footer_size, rcv_hFile, p_buf, NULL, unit_size);
 #ifdef TIMER
-time_write = GetTickCount() - time_start;
+time_write = clock() - time_start;
 #endif
 
 #ifdef TIMER
-printf("read   %d.%03d sec\n", time_read / 1000, time_read % 1000);
-printf("write  %d.%03d sec\n", time_write / 1000, time_write % 1000);
+printf("read   %.3f sec\n", (double)time_read / CLOCKS_PER_SEC);
+printf("write  %.3f sec\n", (double)time_write / CLOCKS_PER_SEC);
 if (prog_num != prog_base - prog_write * parity_num)
 	printf(" prog_num = %I64d != %I64d\n", prog_num, prog_base - prog_write * parity_num);
 #endif
@@ -1577,7 +1632,7 @@ int encode_method4(	// 全てのブロックを断片的に保持する場合 (G
 #ifdef TIMER
 read_count = 0;
 skip_count = 0;
-time_start = GetTickCount();
+time_start = clock();
 #endif
 		last_file = -1;
 		for (i = 0; i < source_num; i++){
@@ -1678,7 +1733,7 @@ skip_count++;
 		CloseHandle(hFile);
 		hFile = NULL;
 #ifdef TIMER
-time_read += GetTickCount() - time_start;
+time_read += clock() - time_start;
 #endif
 
 		memset(g_buf, 0, (size_t)unit_size * parity_num);	// 待機中に GPU用の領域をゼロ埋めしておく
@@ -1931,7 +1986,7 @@ skip_count++;
 			prog_num += th->size * parity_num;
 
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 		// パリティ・ブロックを書き込む
 		work_buf = p_buf;
@@ -1982,7 +2037,7 @@ time_start = GetTickCount();
 			}
 		}
 #ifdef TIMER
-time_write += GetTickCount() - time_start;
+time_write += clock() - time_start;
 #endif
 
 		block_off += io_size;
@@ -2025,7 +2080,7 @@ time_write += GetTickCount() - time_start;
 		}
 
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 		// 最後に Recovery Slice packet のヘッダーを書き込む
 		for (i = 0; i < parity_num; i++){
@@ -2041,13 +2096,13 @@ time_start = GetTickCount();
 			}
 		}
 #ifdef TIMER
-time_write += GetTickCount() - time_start;
+time_write += clock() - time_start;
 #endif
 	}
 
 #ifdef TIMER
-printf("read   %d.%03d sec\n", time_read / 1000, time_read % 1000);
-printf("write  %d.%03d sec\n", time_write / 1000, time_write % 1000);
+printf("read   %.3f sec\n", (double)time_read / CLOCKS_PER_SEC);
+printf("write  %.3f sec\n", (double)time_write / CLOCKS_PER_SEC);
 if (prog_num != prog_base)
 	printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base);
 #endif
@@ -2220,7 +2275,7 @@ int encode_method5(	// ソース・ブロックの一部とパリティ・ブロ
 		src_off = source_off - 1;	// まだ計算して無い印
 
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 		for (i = 0; i < read_num; i++){	// スライスを一個ずつ読み込んでメモリー上に配置していく
 			// ソース・ブロックを読み込む
@@ -2351,7 +2406,7 @@ time_start = GetTickCount();
 			memcpy(common_buf + packet_off + 16, file_md_ctx.hash, 16);
 		}
 #ifdef TIMER
-time_read += GetTickCount() - time_start;
+time_read += clock() - time_start;
 #endif
 
 		if (source_off == 0)
@@ -2589,19 +2644,19 @@ time_read += GetTickCount() - time_start;
 	}
 
 #ifdef TIMER
-time_start = GetTickCount();
+time_start = clock();
 #endif
 	memcpy(common_buf + common_size, common_buf, common_size);	// 後の半分に前半のをコピーする
 	// 最後にパリティ・ブロックのチェックサムを検証して、リカバリ・ファイルに書き込む
 	err = create_recovery_file_1pass(file_path, recovery_path, packet_limit, block_distri,
 			packet_num, common_buf, common_size, footer_buf, footer_size, rcv_hFile, p_buf, g_buf, unit_size);
 #ifdef TIMER
-time_write = GetTickCount() - time_start;
+time_write = clock() - time_start;
 #endif
 
 #ifdef TIMER
-printf("read   %d.%03d sec\n", time_read / 1000, time_read % 1000);
-printf("write  %d.%03d sec\n", time_write / 1000, time_write % 1000);
+printf("read   %.3f sec\n", (double)time_read / CLOCKS_PER_SEC);
+printf("write  %.3f sec\n", (double)time_write / CLOCKS_PER_SEC);
 if (prog_num != prog_base - prog_write * parity_num)
 	printf(" prog_num = %I64d != %I64d\n", prog_num, prog_base - prog_write * parity_num);
 #endif
diff --git a/source/par2j/source.cl b/source/par2j/source.cl
index 11279d3..d5af15d 100644
--- a/source/par2j/source.cl
+++ b/source/par2j/source.cl
@@ -1,10 +1,14 @@
 void calc_table(__local uint *mtab, int id, int factor)
 {
-	int i, sum = 0;
+	int i, sum, mask;
 
-	for (i = 0; i < 8; i++){
-		sum = (id & (1 << i)) ? (sum ^ factor) : sum;
-		factor = (factor & 0x8000) ? ((factor << 1) ^ 0x1100B) : (factor << 1);
+	mask = (id & 1) ? 0xFFFF : 0;
+	sum = mask & factor;
+	for (i = 1; i < 8; i++){
+		mask = (factor & 0x8000) ? 0x1100B : 0;
+		factor = (factor << 1) ^ mask;
+		mask = (id & (1 << i)) ? 0xFFFF : 0;
+		sum ^= mask & factor;
 	}
 	mtab[id] = sum;
 
@@ -14,6 +18,32 @@ void calc_table(__local uint *mtab, int id, int factor)
 	mtab[id + 256] = sum;
 }
 
+void calc_table2(__local uint *mtab, int id, int factor, int factor2)
+{
+	int i, sum, sum2, mask;
+
+	mask = (id & 1) ? 0xFFFF : 0;
+	sum = mask & factor;
+	sum2 = mask & factor2;
+	for (i = 1; i < 8; i++){
+		mask = (factor & 0x8000) ? 0x1100B : 0;
+		factor = (factor << 1) ^ mask;
+		mask = (factor2 & 0x8000) ? 0x1100B : 0;
+		factor2 = (factor2 << 1) ^ mask;
+		mask = (id & (1 << i)) ? 0xFFFF : 0;
+		sum ^= mask & factor;
+		sum2 ^= mask & factor2;
+	}
+	mtab[id] = sum | (sum2 << 16);
+
+	sum = (sum << 4) ^ (((sum << 16) >> 31) & 0x88058) ^ (((sum << 17) >> 31) & 0x4402C) ^ (((sum << 18) >> 31) & 0x22016) ^ (((sum << 19) >> 31) & 0x1100B);
+	sum = (sum << 4) ^ (((sum << 16) >> 31) & 0x88058) ^ (((sum << 17) >> 31) & 0x4402C) ^ (((sum << 18) >> 31) & 0x22016) ^ (((sum << 19) >> 31) & 0x1100B);
+	sum2 = (sum2 << 4) ^ (((sum2 << 16) >> 31) & 0x88058) ^ (((sum2 << 17) >> 31) & 0x4402C) ^ (((sum2 << 18) >> 31) & 0x22016) ^ (((sum2 << 19) >> 31) & 0x1100B);
+	sum2 = (sum2 << 4) ^ (((sum2 << 16) >> 31) & 0x88058) ^ (((sum2 << 17) >> 31) & 0x4402C) ^ (((sum2 << 18) >> 31) & 0x22016) ^ (((sum2 << 19) >> 31) & 0x1100B);
+
+	mtab[id + 256] = sum | (sum2 << 16);
+}
+
 __kernel void method1(
 	__global uint *src,
 	__global uint *dst,
@@ -31,6 +61,7 @@ __kernel void method1(
 		dst[i] = 0;
 
 	for (blk = 0; blk < blk_num; blk++){
+		barrier(CLK_LOCAL_MEM_FENCE);
 		calc_table(mtab, table_id, factors[blk]);
 		barrier(CLK_LOCAL_MEM_FENCE);
 
@@ -42,7 +73,6 @@ __kernel void method1(
 			dst[i] ^= sum;
 		}
 		src += BLK_SIZE;
-		barrier(CLK_LOCAL_MEM_FENCE);
 	}
 }
 
@@ -65,6 +95,7 @@ __kernel void method2(
 	}
 
 	for (blk = 0; blk < blk_num; blk++){
+		barrier(CLK_LOCAL_MEM_FENCE);
 		calc_table(mtab, table_id, factors[blk]);
 		barrier(CLK_LOCAL_MEM_FENCE);
 
@@ -82,11 +113,10 @@ __kernel void method2(
 			dst[pos + 4] ^= ((sum1 & 0xFF00FF00) >> 8) | (sum2 & 0xFF00FF00);
 		}
 		src += BLK_SIZE;
-		barrier(CLK_LOCAL_MEM_FENCE);
 	}
 }
 
-__kernel void method3(
+__kernel void method4(
 	__global uint4 *src,
 	__global uint4 *dst,
 	__global ushort *factors,
@@ -106,6 +136,7 @@ __kernel void method3(
 	}
 
 	for (blk = 0; blk < blk_num; blk++){
+		barrier(CLK_LOCAL_MEM_FENCE);
 		calc_table(mtab, table_id, factors[blk]);
 		barrier(CLK_LOCAL_MEM_FENCE);
 
@@ -124,11 +155,140 @@ __kernel void method3(
 			dst[i + 1] ^= as_uint4((uchar16)(r0.y, r0.w, r1.y, r1.w, r2.y, r2.w, r3.y, r3.w, r4.y, r4.w, r5.y, r5.w, r6.y, r6.w, r7.y, r7.w));
 		}
 		src += BLK_SIZE / 4;
-		barrier(CLK_LOCAL_MEM_FENCE);
 	}
 }
 
-__kernel void method4(
+__kernel void method9(
+	__global uint *src,
+	__global uint *dst,
+	__global ushort *factors,
+	int blk_num)
+{
+	__local uint mtab[512];
+	int i, blk;
+	uint v, sum, sum2;
+	const int work_id = get_global_id(0);
+	const int work_size = get_global_size(0);
+	const int table_id = get_local_id(0);
+
+	for (i = work_id; i < BLK_SIZE; i += work_size){
+		dst[i] = 0;
+		dst[i + BLK_SIZE] = 0;
+	}
+
+	for (blk = 0; blk < blk_num; blk++){
+		barrier(CLK_LOCAL_MEM_FENCE);
+		calc_table2(mtab, table_id, factors[blk], factors[blk_num + blk]);
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		for (i = work_id; i < BLK_SIZE; i += work_size){
+			v = src[i];
+			sum  = mtab[(uchar)v] ^ mtab[256 + (uchar)(v >> 8)];
+			sum2 = mtab[(uchar)(v >> 16)] ^ mtab[256 + (v >> 24)];
+			dst[i] ^= (sum & 0xFFFF) | (sum2 << 16);
+			dst[i + BLK_SIZE] ^= (sum >> 16) | (sum2 & 0xFFFF0000);
+		}
+		src += BLK_SIZE;
+	}
+}
+
+__kernel void method10(
+	__global uint *src,
+	__global uint *dst,
+	__global ushort *factors,
+	int blk_num)
+{
+	__local uint mtab[512];
+	int i, blk, pos;
+	uint lo, hi, sum1, sum2, sum3, sum4;
+	const int work_id = get_global_id(0) * 2;
+	const int work_size = get_global_size(0) * 2;
+	const int table_id = get_local_id(0);
+
+	for (i = work_id; i < BLK_SIZE; i += work_size){
+		dst[i    ] = 0;
+		dst[i + 1] = 0;
+		dst[i + BLK_SIZE    ] = 0;
+		dst[i + BLK_SIZE + 1] = 0;
+	}
+
+	for (blk = 0; blk < blk_num; blk++){
+		barrier(CLK_LOCAL_MEM_FENCE);
+		calc_table2(mtab, table_id, factors[blk], factors[blk_num + blk]);
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		for (i = work_id; i < BLK_SIZE; i += work_size){
+			pos = (i & ~7) + ((i & 7) >> 1);
+			lo = src[pos    ];
+			hi = src[pos + 4];
+			sum1 = mtab[(uchar)lo] ^ mtab[256 + (uchar)hi];
+			sum2 = mtab[(uchar)(lo >> 8)] ^ mtab[256 + (uchar)(hi >> 8)];
+			sum3 = mtab[(uchar)(lo >> 16)] ^ mtab[256 + (uchar)(hi >> 16)];
+			sum4 = mtab[lo >> 24] ^ mtab[256 + (hi >> 24)];
+			dst[pos    ] ^= (sum1 & 0xFF) | ((sum2 & 0xFF) << 8) | ((sum3 & 0xFF) << 16) | (sum4 << 24);
+			dst[pos + 4] ^= ((sum1 >> 8) & 0xFF) | (sum2 & 0xFF00) | ((sum3 & 0xFF00) << 8) | ((sum4 & 0xFF00) << 16);
+			dst[pos + BLK_SIZE    ] ^= ((sum1 >> 16) & 0xFF) | ((sum2 >> 8) & 0xFF00) | (sum3 & 0xFF0000) | ((sum4 & 0xFF0000) << 8);
+			dst[pos + BLK_SIZE + 4] ^= (sum1 >> 24) | ((sum2 >> 16) & 0xFF00) | ((sum3 >> 8) & 0xFF0000) | (sum4 & 0xFF000000);
+		}
+		src += BLK_SIZE;
+	}
+}
+
+__kernel void method12(
+	__global uint4 *src,
+	__global uint4 *dst,
+	__global ushort *factors,
+	int blk_num)
+{
+	__local uint mtab[512];
+	int i, blk;
+	uchar4 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, rA, rB, rC, rD, rE, rF;
+	uchar16 lo, hi;
+	const int work_id = get_global_id(0) * 2;
+	const int work_size = get_global_size(0) * 2;
+	const int table_id = get_local_id(0);
+
+	for (i = work_id; i < BLK_SIZE / 4; i += work_size){
+		dst[i    ] = 0;
+		dst[i + 1] = 0;
+		dst[i + BLK_SIZE / 4    ] = 0;
+		dst[i + BLK_SIZE / 4 + 1] = 0;
+	}
+
+	for (blk = 0; blk < blk_num; blk++){
+		barrier(CLK_LOCAL_MEM_FENCE);
+		calc_table2(mtab, table_id, factors[blk], factors[blk_num + blk]);
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		for (i = work_id; i < BLK_SIZE / 4; i += work_size){
+			lo = as_uchar16(src[i    ]);
+			hi = as_uchar16(src[i + 1]);
+			r0 = as_uchar4(mtab[lo.s0] ^ mtab[256 + hi.s0]);
+			r1 = as_uchar4(mtab[lo.s1] ^ mtab[256 + hi.s1]);
+			r2 = as_uchar4(mtab[lo.s2] ^ mtab[256 + hi.s2]);
+			r3 = as_uchar4(mtab[lo.s3] ^ mtab[256 + hi.s3]);
+			r4 = as_uchar4(mtab[lo.s4] ^ mtab[256 + hi.s4]);
+			r5 = as_uchar4(mtab[lo.s5] ^ mtab[256 + hi.s5]);
+			r6 = as_uchar4(mtab[lo.s6] ^ mtab[256 + hi.s6]);
+			r7 = as_uchar4(mtab[lo.s7] ^ mtab[256 + hi.s7]);
+			r8 = as_uchar4(mtab[lo.s8] ^ mtab[256 + hi.s8]);
+			r9 = as_uchar4(mtab[lo.s9] ^ mtab[256 + hi.s9]);
+			rA = as_uchar4(mtab[lo.sa] ^ mtab[256 + hi.sa]);
+			rB = as_uchar4(mtab[lo.sb] ^ mtab[256 + hi.sb]);
+			rC = as_uchar4(mtab[lo.sc] ^ mtab[256 + hi.sc]);
+			rD = as_uchar4(mtab[lo.sd] ^ mtab[256 + hi.sd]);
+			rE = as_uchar4(mtab[lo.se] ^ mtab[256 + hi.se]);
+			rF = as_uchar4(mtab[lo.sf] ^ mtab[256 + hi.sf]);
+			dst[i    ] ^= as_uint4((uchar16)(r0.x, r1.x, r2.x, r3.x, r4.x, r5.x, r6.x, r7.x, r8.x, r9.x, rA.x, rB.x, rC.x, rD.x, rE.x, rF.x));
+			dst[i + 1] ^= as_uint4((uchar16)(r0.y, r1.y, r2.y, r3.y, r4.y, r5.y, r6.y, r7.y, r8.y, r9.y, rA.y, rB.y, rC.y, rD.y, rE.y, rF.y));
+			dst[i + BLK_SIZE / 4    ] ^= as_uint4((uchar16)(r0.z, r1.z, r2.z, r3.z, r4.z, r5.z, r6.z, r7.z, r8.z, r9.z, rA.z, rB.z, rC.z, rD.z, rE.z, rF.z));
+			dst[i + BLK_SIZE / 4 + 1] ^= as_uint4((uchar16)(r0.w, r1.w, r2.w, r3.w, r4.w, r5.w, r6.w, r7.w, r8.w, r9.w, rA.w, rB.w, rC.w, rD.w, rE.w, rF.w));
+		}
+		src += BLK_SIZE / 4;
+	}
+}
+
+__kernel void method16(
 	__global uint *src,
 	__global uint *dst,
 	__global ushort *factors,
@@ -136,7 +296,7 @@ __kernel void method4(
 {
 	__local int table[16];
 	__local uint cache[256];
-	int i, j, blk, pos, sht, mask;
+	int i, j, blk, pos, mask, tmp;
 	uint sum;
 	const int work_id = get_global_id(0);
 	const int work_size = get_global_size(0);
@@ -146,11 +306,12 @@ __kernel void method4(
 
 	for (blk = 0; blk < blk_num; blk++){
 		if (get_local_id(0) == 0){
-			pos = factors[blk] << 16;
-			table[0] = pos;
+			tmp = factors[blk];
+			table[0] = tmp;
 			for (j = 1; j < 16; j++){
-				pos = (pos << 1) ^ ((pos >> 31) & 0x100B0000);
-				table[j] = pos;
+				mask = (tmp & 0x8000) ? 0x1100B : 0;
+				tmp = (tmp << 1) ^ mask;
+				table[j] = tmp;
 			}
 		}
 		barrier(CLK_LOCAL_MEM_FENCE);
@@ -161,10 +322,11 @@ __kernel void method4(
 			barrier(CLK_LOCAL_MEM_FENCE);
 
 			sum = 0;
-			sht = (i & 60) >> 2;
+			tmp = (i & 60) >> 2;
+			tmp = 0x8000 >> tmp;
 			pos &= ~60;
 			for (j = 15; j >= 0; j--){
-				mask = (table[j] << sht) >> 31;
+				mask = (table[j] & tmp) ? 0xFFFFFFFF : 0;
 				sum ^= mask & cache[pos];
 				pos += 4;
 			}