diff --git a/source/par2j/Command_par2j.txt b/source/par2j/Command_par2j.txt
index f80930d..1485ecc 100644
--- a/source/par2j/Command_par2j.txt
+++ b/source/par2j/Command_par2j.txt
@@ -1,4 +1,4 @@
-[ par2j.exe - version 1.3.2.8 or later ]
+[ par2j.exe - version 1.3.3.0 or later ]
 
 Type "par2j.exe" to see version, test integrity, and show usage below.
 
@@ -359,19 +359,22 @@ the protected archive file is made in the directory.
  /lc :
  Set this, if you want to set number of using threads for Multi-Core CPU,
 or want to disable extra feature. (SSE2 is always used.)
-The format is "/lc#", # is from 1 to 11 as the number of using threads,
-12 to use quarter number of physical Cores,
-13 to use half of physical Cores,
-14 to use 3/4 number of physical Cores,
-15 to use the number of physical Cores (disable Hyper Threading),
-or 15 to use one less number of physical Cores on CPU with 6 or more physical Cores.
-Without this option (or /lc0),
-it uses the number of physical Cores on CPU with 6 or more physical Cores,
-or one more threads on CPU with Hyper Threading and 5 or less physical Cores.
+The format is "/lc#" (# is from 1 to 32 as the number of using threads).
 
- You may set additional combinations; +16 to disable SSSE3,
-+128 to disable CLMUL, +256 to disable JIT, +512 to disable AVX2,
-+32 or +64 (slower device) to enable GPU acceleration.
+ It's possible to set by rate as following. (It's /lc0 by default.)
+251: It uses quarter number of physical Cores.
+252: It uses half of physical Cores.
+253: It uses 3/4 number of physical Cores.
+254: It uses one less threads than number of physical Cores.
+  0: It uses the number of physical Cores.
+255: It uses one more threads than number of physical Cores.
+
+ You may set additional combinations;
++1024 to disable CLMUL (and use old SSSE3 code), 
++2048 to disable JIT (for SSE2), 
++4096 to disable SSSE3,
++8192 to disable AVX2,
++256 or +512 (slower device) to enable GPU acceleration.
 
  for example,  /lc1 to use single Core, /lc45 to use half Cores and GPU
 
diff --git a/source/par2j/common2.c b/source/par2j/common2.c
index e133a89..f3697da 100644
--- a/source/par2j/common2.c
+++ b/source/par2j/common2.c
@@ -1,5 +1,5 @@
 ﻿// common2.c
-// Copyright : 2023-03-14 Yutaka Sawada
+// Copyright : 2023-09-23 Yutaka Sawada
 // License : GPL
 
 #ifndef _UNICODE
@@ -1849,8 +1849,9 @@ int sqrt32(int num)
 
 int cpu_num = 1;	// CPU/Core 個数が制限されてる場合は、上位に本来の数を置く
 // /arch:SSE2, +1=SSSE3, +2=SSE4.1, +4=SSE4.2, +8=CLMUL, +16=AVX2, +128=JIT(SSE2), +256=Old
+// 上位 16-bit = L2 cache サイズから計算した制限サイズ
 unsigned int cpu_flag = 0;
-unsigned int cpu_cache = 0;	// 上位 16-bit = L2 cache * 2, 下位 16-bit = L3 cache
+unsigned int cpu_cache = 0;	// 上位 16-bit = L3 cache の制限サイズ, 下位 16-bit = 同時処理数
 unsigned int memory_use = 0;	// メモリー使用量 0=auto, 1～7 -> 1/8 ～ 7/8
 
 static int count_bit(DWORD_PTR value)
@@ -1869,7 +1870,7 @@ static int count_bit(DWORD_PTR value)
 void check_cpu(void)
 {
 	int core_count = 0, use_count;
-	unsigned int CPUInfo[4];
+	unsigned int CPUInfo[4], limit_size = 0;
 	unsigned int returnLength, byteOffset;
 	DWORD_PTR ProcessAffinityMask, SystemAffinityMask; // 32-bit なら 4バイト、64-bit なら 8バイト整数
 	PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL, ptr;
@@ -2006,42 +2007,52 @@ void check_cpu(void)
 		//printf("Number of available physical processor cores: %d\n", core_count);
 		if (cache3_size > 0){
 			//printf("L3 cache: %d KB (%d way)\n", cache3_size >> 10 , cache3_way);
-			cache3_size /= cache3_way;	// set-associative のサイズにする
-			if (cache3_size < 131072)
-				cache3_size = 128 << 10;	// 128 KB 以上にする
+			cpu_cache = cache3_size / cache3_way;	// set-associative のサイズにする
+			if (cpu_cache < 131072)
+				cpu_cache = 128 << 10;	// 128 KB 以上にする
 		}
 		if (cache2_size > 0){
 			//printf("L2 cache: %d KB (%d way)\n", cache2_size >> 10, cache2_way);
-			cache2_size /= cache2_way;	// set-associative のサイズにする
-			if (cache2_size < 32768)
-				cache2_size = 32 << 10;	// 32 KB 以上にする
-			//printf("Limit size of Cache Blocking: %d KB\n", cache2_size >> 10);
-			cpu_cache = cache2_size | (cache3_size >> 17);
+			limit_size = cache2_size / cache2_way;	// set-associative のサイズにする
+			if (limit_size < 65536)
+				limit_size = 64 << 10;	// 64 KB 以上にする
+			// 同時処理数を決める
+			if (cache2_way >= 16){
+				returnLength = cache2_way / 2;	// L2 cache の分割数が多い場合は、その半分にする
+			} else {
+				returnLength = 0;
+			}
+			if (cache3_size > 0){	// L2 cache に対する L3 cache のサイズの倍率にする
+				byteOffset = cache3_size / cache2_size;
+				if (returnLength < byteOffset){
+					returnLength = byteOffset;
+					if (cache2_way >= cache3_way)	// L2 cache の分割数が L3 cache 以上なら 1.5倍にする
+						returnLength += returnLength / 2;
+				}
+			}
+			cpu_cache |= returnLength & 0x1FFFF;
 		}
 	}
 
-	if (cpu_cache == 0)	// キャッシュ・サイズが不明なら、128 KB にする
-		cpu_cache = 128 << 10;
+	if (limit_size == 0)	// キャッシュ・サイズが不明なら、128 KB にする
+		limit_size = 128 << 10;
+	//printf("Limit size of Cache Blocking: %d KB\n", limit_size >> 10);
+	// cpu_flag の上位 16-bit にキャッシュの制限サイズを置く
+	cpu_flag |= limit_size & 0xFFFF0000;	// 64 KB 未満は無視する
+
 	if (core_count == 0){	// 物理コア数が不明なら、論理コア数と同じにする
 		core_count = cpu_num;
 		use_count = cpu_num;
-	} else if (core_count < cpu_num){	// 物理コア数が共有されてるなら
-		if (core_count >= 6){			// 6 コア以上ならそれ以上増やさない
-			use_count = core_count;
-		} else {	// 2~5 コアなら 1個だけ増やす
-			use_count = core_count + 1;
-		}
+	} else if (core_count < cpu_num){	// 物理コアが共有されてるなら
+		use_count = core_count;	// 物理コア数と同じにする
 	} else {
-		use_count = core_count;
+		use_count = cpu_num;	// 論理コア数と同じにする
 	}
 	if (use_count > MAX_CPU)	// 利用するコア数が実装上の制限を越えないようにする
 		use_count = MAX_CPU;
 	//printf("Core count: logical, physical, use = %d, %d, %d\n", cpu_num, core_count, use_count);
 	// 上位に論理コア数と物理コア数、下位に利用するコア数を配置する
 	cpu_num = (cpu_num << 24) | (core_count << 16) | use_count;
-
-	// cpu_flag の上位 17-bit にキャッシュの制限サイズを置く
-	cpu_flag |= cpu_cache & 0xFFFF8000;	// 32 KB 未満は無視する
 }
 
 // OS が 32-bit か 64-bit かを調べる
diff --git a/source/par2j/common2.h b/source/par2j/common2.h
index 90b7a78..583c8db 100644
--- a/source/par2j/common2.h
+++ b/source/par2j/common2.h
@@ -6,11 +6,11 @@ extern "C" {
 #endif
 
 #ifndef _WIN64	// 32-bit 版なら
-#define MAX_CPU			8			// 32-bit 版は少なくしておく
+#define MAX_CPU			16			// 32-bit 版は少なくしておく
 #define MAX_MEM_SIZE	0x7F000000	// 確保するメモリー領域の最大値 2032MB
 #define MAX_MEM_SIZE32	0x50000000	// 32-bit OS で確保するメモリー領域の最大値 1280MB
 #else
-#define MAX_CPU			16			// 最大 CPU/Core 個数 (スレッド本数)
+#define MAX_CPU			32			// 最大 CPU/Core 個数 (スレッド本数)
 #endif
 
 #define MAX_LEN			1024		// ファイル名の最大文字数 (末尾のNULL文字も含む)
diff --git a/source/par2j/create.c b/source/par2j/create.c
index 5fb587c..bb24f7a 100644
--- a/source/par2j/create.c
+++ b/source/par2j/create.c
@@ -1,5 +1,5 @@
 ﻿// create.c
-// Copyright : 2022-02-16 Yutaka Sawada
+// Copyright : 2023-09-23 Yutaka Sawada
 // License : GPL
 
 #ifndef _UNICODE
diff --git a/source/par2j/gf16.c b/source/par2j/gf16.c
index b4b5bef..4f3e44c 100644
--- a/source/par2j/gf16.c
+++ b/source/par2j/gf16.c
@@ -80,6 +80,9 @@ void galois_align32_multiply(unsigned char *r1, unsigned char *r2, unsigned int
 void galois_align32avx_multiply(unsigned char *r1, unsigned char *r2, unsigned int len, int factor);
 void galois_align256_multiply(unsigned char *r1, unsigned char *r2, unsigned int len, int factor);
 
+void galois_align32_multiply2(unsigned char *src1, unsigned char *src2, unsigned char *dst, unsigned int len, int factor1, int factor2);
+void galois_align32avx_multiply2(unsigned char *src1, unsigned char *src2, unsigned char *dst, unsigned int len, int factor1, int factor2);
+
 void galois_altmap_none(unsigned char *data, unsigned int bsize);
 
 // AVX2 と SSSE3 の ALTMAP は 32バイト単位で行う
@@ -125,6 +128,7 @@ int galois_create_table(void)
 	// CPU によって使う関数を変更する
 	sse_unit = 16;	// 16, 32, 64, 128 のどれでもいい (32のSSSE3は少し速い、GPUが識別するのに注意)
 	galois_align_multiply = galois_align16_multiply;
+	galois_align_multiply2 = NULL;
 	galois_altmap_change = galois_altmap_none;
 	galois_altmap_return = galois_altmap_none;
 	checksum16_altmap = checksum16;
@@ -135,6 +139,7 @@ int galois_create_table(void)
 		//printf("\nUse AVX2 & ALTMAP\n");
 		sse_unit = 32;	// 32, 64, 128 のどれでもいい
 		galois_align_multiply = galois_align32avx_multiply;
+		galois_align_multiply2 = galois_align32avx_multiply2;
 		galois_altmap_change = galois_altmap32_change;
 		galois_altmap_return = galois_altmap32_return;
 		checksum16_altmap = checksum16_altmap32;
@@ -144,6 +149,7 @@ int galois_create_table(void)
 			//printf("\nUse SSSE3 & ALTMAP\n");
 			sse_unit = 32;	// 32, 64, 128 のどれでもいい
 			galois_align_multiply = galois_align32_multiply;
+			galois_align_multiply2 = galois_align32_multiply2;
 			galois_altmap_change = galois_altmap32_change;
 			galois_altmap_return = galois_altmap32_return;
 			checksum16_altmap = checksum16_altmap32;
@@ -154,6 +160,7 @@ int galois_create_table(void)
 			//printf("\nUse JIT(SSE2) & ALTMAP\n");
 			sse_unit = 256;
 			galois_align_multiply = galois_align256_multiply;
+			galois_align_multiply2 = NULL;
 			galois_altmap_change = galois_altmap256_change;
 			galois_altmap_return = galois_altmap256_return;
 			checksum16_altmap = checksum16_altmap256;
@@ -777,19 +784,21 @@ lp32:
 #else	// 64-bit 版ではインライン・アセンブラを使えない
 // (__m128i *) で逐次ポインターをキャスト変換するよりも、
 // 先に __m128i* で定義しておいた方が、連続した領域へのアクセス最適化がうまくいく？
+// ほとんど変わらない気がする（むしろ遅い？）・・・コンパイラ次第なのかも
 
 // tables for split four combined multiplication
-static void create_eight_table(unsigned char *mtab, int factor){
+static void create_eight_table(unsigned char *mtab, int factor)
+{
 	int count = 4;
 	__m128i *tbl;
-	__m128i xmm0, xmm1, xmm2, xmm3, xmm7;
+	__m128i xmm0, xmm1, xmm2, xmm3, mask;
 
 	tbl = (__m128i *)mtab;
 
 	// create mask for 8-bit
-	xmm7 = _mm_setzero_si128();
-	xmm7 = _mm_cmpeq_epi16(xmm7, xmm7);	// 0xFFFF *8
-	xmm7 = _mm_srli_epi16(xmm7, 8);		// 0x00FF *8
+	mask = _mm_setzero_si128();
+	mask = _mm_cmpeq_epi16(mask, mask);	// 0xFFFF *8
+	mask = _mm_srli_epi16(mask, 8);		// 0x00FF *8
 
 	while (1){
 		xmm0 = _mm_cvtsi32_si128(factor);			// [_][_][_][_][_][_][_][1]
@@ -817,8 +826,8 @@ static void create_eight_table(unsigned char *mtab, int factor){
 
 		xmm0 = _mm_load_si128(&xmm2);
 		xmm1 = _mm_load_si128(&xmm3);
-		xmm0 = _mm_and_si128(xmm0, xmm7);
-		xmm1 = _mm_and_si128(xmm1, xmm7);
+		xmm0 = _mm_and_si128(xmm0, mask);
+		xmm1 = _mm_and_si128(xmm1, mask);
 		xmm0 = _mm_packus_epi16(xmm0, xmm1);	// lower 8-bit * 16
 		xmm2 = _mm_srli_epi16(xmm2, 8);
 		xmm3 = _mm_srli_epi16(xmm3, 8);
@@ -911,13 +920,9 @@ static void gf16_ssse3_block16u(unsigned char *input, unsigned char *output, uns
 // Address (input) does not need be 16-byte aligned
 static void gf16_ssse3_block32u(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
 {
-	__m128i *src, *dst;
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
 	__m128i tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7;
 
-	src = (__m128i *)input;
-	dst = (__m128i *)output;
-
 	// copy tables to local
 	tbl0 = _mm_load_si128((__m128i *)table);
 	tbl1 = _mm_load_si128((__m128i *)table + 1);
@@ -936,8 +941,8 @@ static void gf16_ssse3_block32u(unsigned char *input, unsigned char *output, uns
 	xmm7 = _mm_packus_epi16(xmm7, xmm7);	// 0x0F *16
 
 	while (bsize != 0){
-		xmm1 = _mm_loadu_si128(src);	// read source 32-bytes
-		xmm3 = _mm_loadu_si128(src + 1);
+		xmm1 = _mm_loadu_si128((__m128i *)input);	// read source 32-bytes
+		xmm3 = _mm_loadu_si128((__m128i *)input + 1);
 		xmm0 = _mm_and_si128(xmm1, xmm6);	// erase higher byte
 		xmm2 = _mm_and_si128(xmm3, xmm6);
 		xmm1 = _mm_srli_epi16(xmm1, 8);		// move higher byte to lower
@@ -975,17 +980,17 @@ static void gf16_ssse3_block32u(unsigned char *input, unsigned char *output, uns
 		xmm4 = _mm_xor_si128(xmm4, xmm2);	// combine result
 		xmm5 = _mm_xor_si128(xmm5, xmm3);
 
-		xmm0 = _mm_load_si128(dst);	// read dest 32-bytes
-		xmm1 = _mm_load_si128(dst + 1);
+		xmm0 = _mm_load_si128((__m128i *)output);	// read dest 32-bytes
+		xmm1 = _mm_load_si128((__m128i *)output + 1);
 		xmm3 = _mm_unpacklo_epi8(xmm4, xmm5);	// interleave lower and higher bytes
 		xmm4 = _mm_unpackhi_epi8(xmm4, xmm5);
 		xmm0 = _mm_xor_si128(xmm0, xmm3);
 		xmm1 = _mm_xor_si128(xmm1, xmm4);
-		_mm_store_si128(dst, xmm0);	// write dest 32-bytes
-		_mm_store_si128(dst + 1, xmm1);
+		_mm_store_si128((__m128i *)output, xmm0);	// write dest 32-bytes
+		_mm_store_si128((__m128i *)output + 1, xmm1);
 
-		src += 2;
-		dst += 2;
+		input += 32;
+		output += 32;
 		bsize -= 32;
 	}
 }
@@ -993,13 +998,9 @@ static void gf16_ssse3_block32u(unsigned char *input, unsigned char *output, uns
 // xmm レジスタにテーブルを読み込む方が 64-bit 版で微妙に速い
 static void gf16_ssse3_block32_altmap(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
 {
-	__m128i *src, *dst;
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7;
 	__m128i tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7;
 
-	src = (__m128i *)input;
-	dst = (__m128i *)output;
-
 	// copy tables to local
 	tbl0 = _mm_load_si128((__m128i *)table);
 	tbl1 = _mm_load_si128((__m128i *)table + 1);
@@ -1017,8 +1018,8 @@ static void gf16_ssse3_block32_altmap(unsigned char *input, unsigned char *outpu
 	xmm7 = _mm_packus_epi16(xmm7, xmm7);	// 0x0F *16
 
 	while (bsize != 0){
-		xmm0 = _mm_load_si128(src);	// read source 32-bytes
-		xmm1 = _mm_load_si128(src + 1);
+		xmm0 = _mm_load_si128((__m128i *)input);	// read source 32-bytes
+		xmm1 = _mm_load_si128((__m128i *)input + 1);
 
 		xmm3 = _mm_load_si128(&xmm0);	// copy source
 		xmm0 = _mm_srli_epi16(xmm0, 4);	// prepare next 4-bit
@@ -1054,17 +1055,17 @@ static void gf16_ssse3_block32_altmap(unsigned char *input, unsigned char *outpu
 		xmm2 = _mm_shuffle_epi8(xmm2, xmm0);	// table look-up
 		xmm3 = _mm_shuffle_epi8(xmm3, xmm0);
 
-		xmm0 = _mm_load_si128(dst);	// read dest 32-bytes
-		xmm1 = _mm_load_si128(dst + 1);
+		xmm0 = _mm_load_si128((__m128i *)output);	// read dest 32-bytes
+		xmm1 = _mm_load_si128((__m128i *)output + 1);
 		xmm4 = _mm_xor_si128(xmm4, xmm2);	// combine result
 		xmm5 = _mm_xor_si128(xmm5, xmm3);
 		xmm4 = _mm_xor_si128(xmm4, xmm0);
 		xmm5 = _mm_xor_si128(xmm5, xmm1);
-		_mm_store_si128(dst, xmm4);	// write dest 32-bytes
-		_mm_store_si128(dst + 1, xmm5);
+		_mm_store_si128((__m128i *)output, xmm4);	// write dest 32-bytes
+		_mm_store_si128((__m128i *)output + 1, xmm5);
 
-		src += 2;
-		dst += 2;
+		input += 32;
+		output += 32;
 		bsize -= 32;
 	}
 }
@@ -1141,16 +1142,288 @@ static void gf16_ssse3_block32_altmap(unsigned char *input, unsigned char *outpu
 
 #endif
 
+// 逆行列計算用に掛け算だけする（XORで追加しない）
+static void gf16_ssse3_block16s(unsigned char *data, unsigned int bsize, unsigned char *table)
+{
+	__m128i dest, mask, xmm0, xmm1, xmm3, xmm4, xmm5, xmm6;
+	__m128i tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7;
+
+	// copy tables to local
+	tbl0 = _mm_load_si128((__m128i *)table);
+	tbl1 = _mm_load_si128((__m128i *)table + 1);
+	tbl2 = _mm_load_si128((__m128i *)table + 2);
+	tbl3 = _mm_load_si128((__m128i *)table + 3);
+	tbl4 = _mm_load_si128((__m128i *)table + 4);
+	tbl5 = _mm_load_si128((__m128i *)table + 5);
+	tbl6 = _mm_load_si128((__m128i *)table + 6);
+	tbl7 = _mm_load_si128((__m128i *)table + 7);
+
+	// create mask for 8 entries
+	mask = _mm_setzero_si128();
+	mask = _mm_cmpeq_epi16(mask, mask);	// 0xFFFF *8
+	mask = _mm_srli_epi16(mask, 12);	// 0x000F *8
+
+	while (bsize != 0){
+		xmm0 = _mm_load_si128((__m128i *)data);	// read source 16-bytes
+
+		xmm3 = _mm_load_si128(&tbl0);	// low table
+		xmm4 = _mm_load_si128(&tbl1);	// high table
+		xmm1 = _mm_load_si128(&xmm0);			// copy source
+		xmm0 = _mm_srli_epi16(xmm0, 4);			// prepare next 4-bit
+		xmm1 = _mm_and_si128(xmm1, mask);		// src & 0x000F
+		xmm3 = _mm_shuffle_epi8(xmm3, xmm1);	// table look-up
+		xmm1 = _mm_slli_epi16(xmm1, 8);			// shift 8-bit for higher table
+		xmm4 = _mm_shuffle_epi8(xmm4, xmm1);
+			xmm5 = _mm_load_si128(&tbl2);	// low table
+			xmm6 = _mm_load_si128(&tbl3);	// high table
+		dest = _mm_xor_si128(xmm3, xmm4);	// combine high and low
+
+			xmm1 = _mm_load_si128(&xmm0);			// copy source
+			xmm0 = _mm_srli_epi16(xmm0, 4);			// prepare next 4-bit
+			xmm1 = _mm_and_si128(xmm1, mask);		// src & 0x000F
+			xmm5 = _mm_shuffle_epi8(xmm5, xmm1);	// table look-up
+			xmm1 = _mm_slli_epi16(xmm1, 8);			// shift 8-bit for higher table
+			xmm6 = _mm_shuffle_epi8(xmm6, xmm1);
+		xmm3 = _mm_load_si128(&tbl4);	// low table
+		xmm4 = _mm_load_si128(&tbl5);	// high table
+			xmm5 = _mm_xor_si128(xmm5, xmm6);	// combine high and low
+			dest = _mm_xor_si128(dest, xmm5);
+
+		xmm1 = _mm_load_si128(&xmm0);			// copy source
+		xmm0 = _mm_srli_epi16(xmm0, 4);			// prepare next 4-bit
+		xmm1 = _mm_and_si128(xmm1, mask);		// src & 0x000F
+		xmm3 = _mm_shuffle_epi8(xmm3, xmm1);	// table look-up
+		xmm1 = _mm_slli_epi16(xmm1, 8);			// shift 8-bit for higher table
+		xmm4 = _mm_shuffle_epi8(xmm4, xmm1);
+			xmm5 = _mm_load_si128(&tbl6);	// low table
+			xmm6 = _mm_load_si128(&tbl7);	// high table
+		xmm3 = _mm_xor_si128(xmm3, xmm4);	// combine high and low
+		dest = _mm_xor_si128(dest, xmm3);
+
+			xmm5 = _mm_shuffle_epi8(xmm5, xmm0);	// table look-up
+			xmm0 = _mm_slli_epi16(xmm0, 8);			// shift 8-bit for higher table
+			xmm6 = _mm_shuffle_epi8(xmm6, xmm0);
+			xmm5 = _mm_xor_si128(xmm5, xmm6);	// combine high and low
+			dest = _mm_xor_si128(dest, xmm5);
+
+		_mm_store_si128((__m128i *)data, dest);
+
+		data += 16;
+		bsize -= 16;
+	}
+}
+
+// ２ブロック同時に計算することで、メモリーへのアクセス回数を減らす
+// 128バイトのテーブルを２個用意しておくこと
+// xmm レジスタの数が足りないので、テーブルを毎回ロードする
+static void gf16_ssse3_block32_altmap2(unsigned char *input1, unsigned char *input2, unsigned char *output, unsigned int bsize, unsigned char *table)
+{
+	__m128i *tbl;
+	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, mask;
+
+	tbl = (__m128i *)table;
+
+	// create mask for 16 entries
+	mask = _mm_setzero_si128();
+	mask = _mm_cmpeq_epi16(mask, mask);	// 0xFFFF *8
+	mask = _mm_srli_epi16(mask, 12);	// 0x000F *8
+	mask = _mm_packus_epi16(mask, mask);	// 0x0F *16
+
+	while (bsize != 0){
+		xmm0 = _mm_load_si128((__m128i *)input1);	// read source 32-bytes
+		xmm1 = _mm_load_si128((__m128i *)input1 + 1);
+
+		xmm6 = _mm_load_si128(&xmm0);	// copy source
+		xmm0 = _mm_srli_epi16(xmm0, 4);	// prepare next 4-bit
+		xmm6 = _mm_and_si128(xmm6, mask);	// src & 0x0F
+		xmm0 = _mm_and_si128(xmm0, mask);	// (src >> 4) & 0x0F
+
+		xmm4 = _mm_load_si128(tbl);	// load tables
+		xmm5 = _mm_load_si128(tbl + 1);
+		xmm4 = _mm_shuffle_epi8(xmm4, xmm6);	// table look-up
+		xmm5 = _mm_shuffle_epi8(xmm5, xmm6);
+
+		xmm2 = _mm_load_si128(tbl + 2);	// load tables
+		xmm3 = _mm_load_si128(tbl + 3);
+		xmm2 = _mm_shuffle_epi8(xmm2, xmm0);	// table look-up
+		xmm3 = _mm_shuffle_epi8(xmm3, xmm0);
+		xmm4 = _mm_xor_si128(xmm4, xmm2);	// combine result
+		xmm5 = _mm_xor_si128(xmm5, xmm3);
+
+		xmm0 = _mm_load_si128(&xmm1);	// copy source
+		xmm0 = _mm_srli_epi16(xmm0, 4);	// prepare next 4-bit
+		xmm1 = _mm_and_si128(xmm1, mask);	// src & 0x0F
+		xmm0 = _mm_and_si128(xmm0, mask);	// (src >> 4) & 0x0F
+
+		xmm2 = _mm_load_si128(tbl + 4);	// load tables
+		xmm3 = _mm_load_si128(tbl + 5);
+		xmm2 = _mm_shuffle_epi8(xmm2, xmm1);	// table look-up
+		xmm3 = _mm_shuffle_epi8(xmm3, xmm1);
+		xmm4 = _mm_xor_si128(xmm4, xmm2);	// combine result
+		xmm5 = _mm_xor_si128(xmm5, xmm3);
+
+		xmm2 = _mm_load_si128(tbl + 6);	// load tables
+		xmm3 = _mm_load_si128(tbl + 7);
+		xmm2 = _mm_shuffle_epi8(xmm2, xmm0);	// table look-up
+		xmm3 = _mm_shuffle_epi8(xmm3, xmm0);
+		xmm4 = _mm_xor_si128(xmm4, xmm2);	// combine result
+		xmm5 = _mm_xor_si128(xmm5, xmm3);
+
+			xmm0 = _mm_load_si128((__m128i *)input2);	// read source 32-bytes
+			xmm1 = _mm_load_si128((__m128i *)input2 + 1);
+
+			xmm6 = _mm_load_si128(&xmm0);	// copy source
+			xmm0 = _mm_srli_epi16(xmm0, 4);	// prepare next 4-bit
+			xmm6 = _mm_and_si128(xmm6, mask);	// src & 0x0F
+			xmm0 = _mm_and_si128(xmm0, mask);	// (src >> 4) & 0x0F
+
+			xmm2 = _mm_load_si128(tbl + 8);	// load tables
+			xmm3 = _mm_load_si128(tbl + 9);
+			xmm2 = _mm_shuffle_epi8(xmm2, xmm6);	// table look-up
+			xmm3 = _mm_shuffle_epi8(xmm3, xmm6);
+			xmm4 = _mm_xor_si128(xmm4, xmm2);	// combine result
+			xmm5 = _mm_xor_si128(xmm5, xmm3);
+
+			xmm2 = _mm_load_si128(tbl + 10);	// load tables
+			xmm3 = _mm_load_si128(tbl + 11);
+			xmm2 = _mm_shuffle_epi8(xmm2, xmm0);	// table look-up
+			xmm3 = _mm_shuffle_epi8(xmm3, xmm0);
+			xmm4 = _mm_xor_si128(xmm4, xmm2);	// combine result
+			xmm5 = _mm_xor_si128(xmm5, xmm3);
+
+			xmm0 = _mm_load_si128(&xmm1);	// copy source
+			xmm0 = _mm_srli_epi16(xmm0, 4);	// prepare next 4-bit
+			xmm1 = _mm_and_si128(xmm1, mask);	// src & 0x0F
+			xmm0 = _mm_and_si128(xmm0, mask);	// (src >> 4) & 0x0F
+
+			xmm2 = _mm_load_si128(tbl + 12);	// load tables
+			xmm3 = _mm_load_si128(tbl + 13);
+			xmm2 = _mm_shuffle_epi8(xmm2, xmm1);	// table look-up
+			xmm3 = _mm_shuffle_epi8(xmm3, xmm1);
+			xmm4 = _mm_xor_si128(xmm4, xmm2);	// combine result
+			xmm5 = _mm_xor_si128(xmm5, xmm3);
+
+			xmm2 = _mm_load_si128(tbl + 14);	// load tables
+			xmm3 = _mm_load_si128(tbl + 15);
+			xmm2 = _mm_shuffle_epi8(xmm2, xmm0);	// table look-up
+			xmm3 = _mm_shuffle_epi8(xmm3, xmm0);
+			xmm4 = _mm_xor_si128(xmm4, xmm2);	// combine result
+			xmm5 = _mm_xor_si128(xmm5, xmm3);
+
+		xmm0 = _mm_load_si128((__m128i *)output);	// read dest 32-bytes
+		xmm1 = _mm_load_si128((__m128i *)output + 1);
+		xmm0 = _mm_xor_si128(xmm0, xmm4);
+		xmm1 = _mm_xor_si128(xmm1, xmm5);
+		_mm_store_si128((__m128i *)output, xmm0);	// write dest 32-bytes
+		_mm_store_si128((__m128i *)output + 1, xmm1);
+
+		input1 += 32;
+		input2 += 32;
+		output += 32;
+		bsize -= 32;
+	}
+}
+
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 // AVX2 命令を使うには Windows 7 以降じゃないといけない
 
 // _mm256_permute2x128_si256 の control の意味は以下を参照
 // http://www.felixcloutier.com/x86/VPERM2I128.html
 
-// テーブルを並び替えて使えば、ループ内の並び替え回数を一回に減らせる
-static void gf16_avx2_block32(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
+// AVX2 を使って全体を２倍していくと、13% ぐらい速くなる
+// でも、テーブル作成が少し速くなっても、全体的な速度はほとんど変わらない・・・
+static void create_eight_table_avx2(unsigned char *mtab, int factor)
 {
-	__m256i tbl0, tbl1, tbl2, tbl3, mask, src0, src1, tmp0, tmp1, tmp2, tmp3;
+	int count;
+	__m128i xmm0, xmm1, xmm2, xmm3, mask8;
+	__m256i ymm0, ymm1, ymm2, ymm3, base, poly, mask16;
+
+	// create mask for 8-bit
+	mask8 = _mm_setzero_si128();
+	mask8 = _mm_cmpeq_epi16(mask8, mask8);	// 0xFFFF *8
+	mask8 = _mm_srli_epi16(mask8, 8);		// 0x00FF *8
+
+	xmm0 = _mm_cvtsi32_si128(factor);			// [_][_][_][_][_][_][_][1]
+	xmm1 = _mm_setzero_si128();
+	factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B);
+	xmm1 = _mm_insert_epi16(xmm1, factor, 1);	// [_][_][_][_][_][_][2][_]
+	xmm2 = _mm_setzero_si128();
+	factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B);
+	xmm2 = _mm_insert_epi16(xmm2, factor, 4);	// [_][_][_][4][_][_][_][_]
+	xmm1 = _mm_unpacklo_epi16(xmm1, xmm1);		// [_][_][_][_][2][2][_][_]
+	factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B);
+	xmm3 = _mm_cvtsi32_si128(factor);			// [_][_][_][_][_][_][_][8]
+
+	xmm0 = _mm_shufflelo_epi16(xmm0, _MM_SHUFFLE(0, 1, 0, 1));	// [_][_][_][_][1][_][1][_]
+	xmm3 = _mm_unpacklo_epi16(xmm3, xmm3);						// [_][_][_][_][_][_][8][8]
+	xmm0 = _mm_xor_si128(xmm0, xmm1);							// [_][_][_][_][3][2][1][_]
+	xmm2 = _mm_shufflehi_epi16(xmm2, _MM_SHUFFLE(0, 0, 0, 0));	// [4][4][4][4][_][_][_][_]
+	xmm0 = _mm_unpacklo_epi64(xmm0, xmm0);						// [3][2][1][_][3][2][1][_]
+	xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0, 0, 0, 0));	// [8][8][8][8][8][8][8][8]
+	xmm2 = _mm_xor_si128(xmm2, xmm0);							// [7][6][5][4][3][2][1][_]
+	xmm3 = _mm_xor_si128(xmm3, xmm2);							// [15][14][13][12][11][10][9][8]
+
+	// 途中で AVX2 命令を使っても遅くならないっぽい
+	poly = _mm256_set1_epi32(0x100B100B);	// PRIM_POLY = 0x1100B * 16
+	mask16 = _mm256_cmpeq_epi16(poly, poly);
+	mask16 = _mm256_srli_epi16(mask16, 8);	// 0x00FF *16
+	base = _mm256_setzero_si256();
+	base = _mm256_inserti128_si256(base, xmm2, 0);
+	base = _mm256_inserti128_si256(base, xmm3, 1);
+
+	// ymm レジスタに読み込んでる間にメモリーに書き込んだ方が速い
+	xmm0 = _mm_and_si128(xmm2, mask8);
+	xmm1 = _mm_and_si128(xmm3, mask8);
+	xmm0 = _mm_packus_epi16(xmm0, xmm1);	// lower 8-bit * 16
+	xmm2 = _mm_srli_epi16(xmm2, 8);
+	xmm3 = _mm_srli_epi16(xmm3, 8);
+	xmm2 = _mm_packus_epi16(xmm2, xmm3);	// higher 8-bit * 16
+	_mm_store_si128((__m128i *)mtab    , xmm0);
+	_mm_store_si128((__m128i *)mtab + 1, xmm2);
+
+	for (count = 1; count < 4; count++){
+		// 全体を２倍する
+		ymm0 = _mm256_slli_epi16(base, 1);
+		ymm1 = _mm256_srai_epi16(base, 15);
+		ymm1 = _mm256_and_si256(ymm1, poly);
+		base = _mm256_xor_si256(ymm1, ymm0);
+
+		// 全体を２倍する
+		ymm0 = _mm256_slli_epi16(base, 1);
+		ymm1 = _mm256_srai_epi16(base, 15);
+		ymm1 = _mm256_and_si256(ymm1, poly);
+		base = _mm256_xor_si256(ymm1, ymm0);
+
+		// 全体を２倍する
+		ymm0 = _mm256_slli_epi16(base, 1);
+		ymm1 = _mm256_srai_epi16(base, 15);
+		ymm1 = _mm256_and_si256(ymm1, poly);
+		base = _mm256_xor_si256(ymm1, ymm0);
+
+		// 全体を２倍する
+		ymm0 = _mm256_slli_epi16(base, 1);
+		ymm1 = _mm256_srai_epi16(base, 15);
+		ymm1 = _mm256_and_si256(ymm1, poly);
+		base = _mm256_xor_si256(ymm1, ymm0);
+
+		// 並び替えて保存する
+		ymm0 = _mm256_and_si256(base, mask16);	// lower 8-bit * 16
+		ymm1 = _mm256_srli_epi16(base, 8);		// higher 8-bit * 16
+		ymm2 = _mm256_permute2x128_si256(ymm0, ymm1, 0x20);
+		ymm3 = _mm256_permute2x128_si256(ymm0, ymm1, 0x31);
+		ymm0 = _mm256_packus_epi16(ymm2, ymm3);
+		_mm256_store_si256((__m256i *)mtab + count, ymm0);
+	}
+
+	// AVX-SSE 切り替えの回避
+	_mm256_zeroupper();
+}
+
+// 逆行列計算用に掛け算だけする（XORで追加しない）
+static void gf16_avx2_block32s(unsigned char *data, unsigned int bsize, unsigned char *table)
+{
+	__m256i tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7;
+	__m256i mask, dest, src0, src1, tmp0, tmp1, tmp2, tmp3;
 
 	// copy tables to local
 	tmp0 = _mm256_load_si256((__m256i *)table);		// tbl0[low0][high0] <- 0x0f[lo][lo]
@@ -1158,11 +1431,152 @@ static void gf16_avx2_block32(unsigned char *input, unsigned char *output, unsig
 	tmp2 = _mm256_load_si256((__m256i *)table + 2);	// tbl2[low2][high2] <- 0x0f[hi][hi]
 	tmp3 = _mm256_load_si256((__m256i *)table + 3);	// tbl3[low3][high3] <- 0xf0[hi][hi]
 
-	// re-arrange table order
-	tbl0 = _mm256_permute2x128_si256(tmp0, tmp2, 0x30);	// tblA[low0][high2] <- 0x0f[lo][hi]
-	tbl1 = _mm256_permute2x128_si256(tmp1, tmp3, 0x30);	// tblB[low1][high3] <- 0xf0[lo][hi]
-	tbl2 = _mm256_permute2x128_si256(tmp2, tmp0, 0x03);	// tblC[high0][low2] <- 0x0f[lo][hi]
-	tbl3 = _mm256_permute2x128_si256(tmp3, tmp1, 0x03);	// tblD[high1][low3] <- 0xf0[lo][hi]
+	// split to 8 tables
+	tbl0 = _mm256_permute2x128_si256(tmp0, tmp0, 0x00);	// tbl0[low0][low0]
+	tbl1 = _mm256_permute2x128_si256(tmp1, tmp1, 0x00);	// tbl1[low1][low1]
+	tbl2 = _mm256_permute2x128_si256(tmp2, tmp2, 0x00);	// tbl2[low2][low2]
+	tbl3 = _mm256_permute2x128_si256(tmp3, tmp3, 0x00);	// tbl3[low3][low3]
+	tbl4 = _mm256_permute2x128_si256(tmp0, tmp0, 0x11);	// tbl0[high0][high0]
+	tbl5 = _mm256_permute2x128_si256(tmp1, tmp1, 0x11);	// tbl1[high1][high1]
+	tbl6 = _mm256_permute2x128_si256(tmp2, tmp2, 0x11);	// tbl2[high2][high2]
+	tbl7 = _mm256_permute2x128_si256(tmp3, tmp3, 0x11);	// tbl3[high3][high3]
+
+	// create mask for 16 entries
+	mask = _mm256_cmpeq_epi16(tmp0, tmp0);	// 0xFFFF *16
+	mask = _mm256_srli_epi16(mask, 12);		// 0x000F *16
+
+	while (bsize != 0){
+		src0 = _mm256_load_si256((__m256i *)data);	// read source 32-bytes
+
+		src1 = _mm256_and_si256(src0, mask);	// src & 0x0F
+		tmp0 = _mm256_shuffle_epi8(tbl0, src1);	// table look-up
+		src1 = _mm256_slli_epi16(src1, 8);		// shift 8-bit for higher table
+		tmp1 = _mm256_shuffle_epi8(tbl4, src1);
+		src0 = _mm256_srli_epi16(src0, 4);		// prepare next 4-bit
+		dest = _mm256_xor_si256(tmp0, tmp1);	// combine high and low
+
+		src1 = _mm256_and_si256(src0, mask);	// src & 0x0F
+		tmp0 = _mm256_shuffle_epi8(tbl1, src1);	// table look-up
+		src1 = _mm256_slli_epi16(src1, 8);		// shift 8-bit for higher table
+		tmp1 = _mm256_shuffle_epi8(tbl5, src1);
+		dest = _mm256_xor_si256(dest, tmp0);	// combine high and low
+		src0 = _mm256_srli_epi16(src0, 4);		// prepare next 4-bit
+		dest = _mm256_xor_si256(dest, tmp1);
+
+		src1 = _mm256_and_si256(src0, mask);	// src & 0x0F
+		tmp0 = _mm256_shuffle_epi8(tbl2, src1);	// table look-up
+		src1 = _mm256_slli_epi16(src1, 8);		// shift 8-bit for higher table
+		tmp1 = _mm256_shuffle_epi8(tbl6, src1);
+		dest = _mm256_xor_si256(dest, tmp0);	// combine high and low
+		src0 = _mm256_srli_epi16(src0, 4);		// prepare next 4-bit
+		dest = _mm256_xor_si256(dest, tmp1);
+
+		src1 = _mm256_and_si256(src0, mask);	// src & 0x0F
+		tmp0 = _mm256_shuffle_epi8(tbl3, src1);	// table look-up
+		src1 = _mm256_slli_epi16(src1, 8);		// shift 8-bit for higher table
+		tmp1 = _mm256_shuffle_epi8(tbl7, src1);
+		dest = _mm256_xor_si256(dest, tmp0);	// combine high and low
+		dest = _mm256_xor_si256(dest, tmp1);
+
+		_mm256_store_si256((__m256i *)data, dest);	// write dest 32-bytes
+
+		data += 32;
+		bsize -= 32;
+	}
+
+	// AVX-SSE 切り替えの回避
+	_mm256_zeroupper();
+}
+
+// 逆行列計算用に ALTMAP されてないソースにも対応しておく
+// Address (input) does not need be 32-byte aligned
+static void gf16_avx2_block32u(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
+{
+	__m256i tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7;
+	__m256i mask, dest, src0, src1, tmp0, tmp1, tmp2, tmp3;
+
+	// copy tables to local
+	tmp0 = _mm256_load_si256((__m256i *)table);		// tbl0[low0][high0] <- 0x0f[lo][lo]
+	tmp1 = _mm256_load_si256((__m256i *)table + 1);	// tbl1[low1][high1] <- 0xf0[lo][lo]
+	tmp2 = _mm256_load_si256((__m256i *)table + 2);	// tbl2[low2][high2] <- 0x0f[hi][hi]
+	tmp3 = _mm256_load_si256((__m256i *)table + 3);	// tbl3[low3][high3] <- 0xf0[hi][hi]
+
+	// split to 8 tables
+	tbl0 = _mm256_permute2x128_si256(tmp0, tmp0, 0x00);	// tbl0[low0][low0]
+	tbl1 = _mm256_permute2x128_si256(tmp1, tmp1, 0x00);	// tbl1[low1][low1]
+	tbl2 = _mm256_permute2x128_si256(tmp2, tmp2, 0x00);	// tbl2[low2][low2]
+	tbl3 = _mm256_permute2x128_si256(tmp3, tmp3, 0x00);	// tbl3[low3][low3]
+	tbl4 = _mm256_permute2x128_si256(tmp0, tmp0, 0x11);	// tbl0[high0][high0]
+	tbl5 = _mm256_permute2x128_si256(tmp1, tmp1, 0x11);	// tbl1[high1][high1]
+	tbl6 = _mm256_permute2x128_si256(tmp2, tmp2, 0x11);	// tbl2[high2][high2]
+	tbl7 = _mm256_permute2x128_si256(tmp3, tmp3, 0x11);	// tbl3[high3][high3]
+
+	// create mask for 16 entries
+	mask = _mm256_cmpeq_epi16(tmp0, tmp0);	// 0xFFFF *16
+	mask = _mm256_srli_epi16(mask, 12);		// 0x000F *16
+
+	while (bsize != 0){
+		src0 = _mm256_loadu_si256((__m256i *)input);	// read source 32-bytes
+		dest = _mm256_load_si256((__m256i *)output);	// read dest 32-bytes
+
+		src1 = _mm256_and_si256(src0, mask);	// src & 0x0F
+		tmp0 = _mm256_shuffle_epi8(tbl0, src1);	// table look-up
+		src1 = _mm256_slli_epi16(src1, 8);		// shift 8-bit for higher table
+		tmp1 = _mm256_shuffle_epi8(tbl4, src1);
+		dest = _mm256_xor_si256(dest, tmp0);	// combine high and low
+		src0 = _mm256_srli_epi16(src0, 4);		// prepare next 4-bit
+		dest = _mm256_xor_si256(dest, tmp1);
+
+		src1 = _mm256_and_si256(src0, mask);	// src & 0x0F
+		tmp0 = _mm256_shuffle_epi8(tbl1, src1);	// table look-up
+		src1 = _mm256_slli_epi16(src1, 8);		// shift 8-bit for higher table
+		tmp1 = _mm256_shuffle_epi8(tbl5, src1);
+		dest = _mm256_xor_si256(dest, tmp0);	// combine high and low
+		src0 = _mm256_srli_epi16(src0, 4);		// prepare next 4-bit
+		dest = _mm256_xor_si256(dest, tmp1);
+
+		src1 = _mm256_and_si256(src0, mask);	// src & 0x0F
+		tmp0 = _mm256_shuffle_epi8(tbl2, src1);	// table look-up
+		src1 = _mm256_slli_epi16(src1, 8);		// shift 8-bit for higher table
+		tmp1 = _mm256_shuffle_epi8(tbl6, src1);
+		dest = _mm256_xor_si256(dest, tmp0);	// combine high and low
+		src0 = _mm256_srli_epi16(src0, 4);		// prepare next 4-bit
+		dest = _mm256_xor_si256(dest, tmp1);
+
+		src1 = _mm256_and_si256(src0, mask);	// src & 0x0F
+		tmp0 = _mm256_shuffle_epi8(tbl3, src1);	// table look-up
+		src1 = _mm256_slli_epi16(src1, 8);		// shift 8-bit for higher table
+		tmp1 = _mm256_shuffle_epi8(tbl7, src1);
+		dest = _mm256_xor_si256(dest, tmp0);	// combine high and low
+		dest = _mm256_xor_si256(dest, tmp1);
+
+		_mm256_store_si256((__m256i *)output, dest);	// write dest 32-bytes
+
+		input += 32;
+		output += 32;
+		bsize -= 32;
+	}
+
+	// AVX-SSE 切り替えの回避
+	_mm256_zeroupper();
+}
+
+// テーブルを並び替えて使えば、ループ内の並び替え回数を一回に減らせる
+static void gf16_avx2_block32(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
+{
+	__m256i tbl0, tbl1, tbl2, tbl3, mask, dest, src0, src1, tmp0, tmp1, tmp2, tmp3;
+
+	// copy tables to local
+	tmp0 = _mm256_load_si256((__m256i *)table);		// tbl0[low0][high0] <- 0x0f[lo][lo]
+	tmp1 = _mm256_load_si256((__m256i *)table + 1);	// tbl1[low1][high1] <- 0xf0[lo][lo]
+	tmp2 = _mm256_load_si256((__m256i *)table + 2);	// tbl2[low2][high2] <- 0x0f[hi][hi]
+	tmp3 = _mm256_load_si256((__m256i *)table + 3);	// tbl3[low3][high3] <- 0xf0[hi][hi]
+
+	// re-arrange table order (permute より blend の方が速いらしい)
+	tbl0 = _mm256_blend_epi32(tmp0, tmp2, 0xF0);		// tbl0[low0][high2] <- 0x0f[lo][hi]
+	tbl1 = _mm256_blend_epi32(tmp1, tmp3, 0xF0);		// tbl1[low1][high3] <- 0xf0[lo][hi]
+	tbl2 = _mm256_permute2x128_si256(tmp2, tmp0, 0x03);	// tbl2[high0][low2] <- 0x0f[lo][hi]
+	tbl3 = _mm256_permute2x128_si256(tmp3, tmp1, 0x03);	// tbl3[high1][low3] <- 0xf0[lo][hi]
 
 	// create mask for 32 entries
 	mask = _mm256_cmpeq_epi16(tmp0, tmp0);	// 0xFFFF *16
@@ -1184,10 +1598,10 @@ static void gf16_avx2_block32(unsigned char *input, unsigned char *output, unsig
 		tmp2 = _mm256_xor_si256(tmp2, tmp3);
 		tmp2 = _mm256_permute2x128_si256(tmp2, tmp2, 0x01);	// exchange low & high 128-bit
 
-		src1 = _mm256_load_si256((__m256i *)output);	// read dest 32-bytes
-		src1 = _mm256_xor_si256(src1, tmp0);
-		src1 = _mm256_xor_si256(src1, tmp2);
-		_mm256_store_si256((__m256i *)output, src1);	// write dest 32-bytes
+		dest = _mm256_load_si256((__m256i *)output);	// read dest 32-bytes
+		tmp0 = _mm256_xor_si256(tmp0, tmp2);
+		dest = _mm256_xor_si256(dest, tmp0);
+		_mm256_store_si256((__m256i *)output, dest);	// write dest 32-bytes
 
 		input += 32;
 		output += 32;
@@ -1300,6 +1714,83 @@ static void gf16_avx2_block32(unsigned char *input, unsigned char *output, unsig
 }
 */
 
+// ２ブロック同時に計算することで、メモリーへのアクセス回数を減らす
+// 128バイトのテーブルを２個用意しておくこと
+static void gf16_avx2_block32_2(unsigned char *input1, unsigned char *input2, unsigned char *output, unsigned int bsize, unsigned char *table)
+{
+	__m256i mask, src0, src1, tmp0, tmp1, tmp2, tmp3;
+	__m256i tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7;
+
+	// copy tables to local
+	tmp0 = _mm256_load_si256((__m256i *)table);		// tbl0[low0][high0] <- 0x0f[lo][lo]
+	tmp1 = _mm256_load_si256((__m256i *)table + 1);	// tbl1[low1][high1] <- 0xf0[lo][lo]
+	tmp2 = _mm256_load_si256((__m256i *)table + 2);	// tbl2[low2][high2] <- 0x0f[hi][hi]
+	tmp3 = _mm256_load_si256((__m256i *)table + 3);	// tbl3[low3][high3] <- 0xf0[hi][hi]
+
+	// re-arrange table order (permute より blend の方が速いらしい)
+	tbl0 = _mm256_blend_epi32(tmp0, tmp2, 0xF0);		// tbl0[low0][high2] <- 0x0f[lo][hi]
+	tbl1 = _mm256_blend_epi32(tmp1, tmp3, 0xF0);		// tbl1[low1][high3] <- 0xf0[lo][hi]
+	tbl2 = _mm256_permute2x128_si256(tmp2, tmp0, 0x03);	// tbl2[high0][low2] <- 0x0f[lo][hi]
+	tbl3 = _mm256_permute2x128_si256(tmp3, tmp1, 0x03);	// tbl3[high1][low3] <- 0xf0[lo][hi]
+
+	tmp0 = _mm256_load_si256((__m256i *)table + 4);
+	tmp1 = _mm256_load_si256((__m256i *)table + 5);
+	tmp2 = _mm256_load_si256((__m256i *)table + 6);
+	tmp3 = _mm256_load_si256((__m256i *)table + 7);
+	tbl4 = _mm256_blend_epi32(tmp0, tmp2, 0xF0);
+	tbl5 = _mm256_blend_epi32(tmp1, tmp3, 0xF0);
+	tbl6 = _mm256_permute2x128_si256(tmp2, tmp0, 0x03);
+	tbl7 = _mm256_permute2x128_si256(tmp3, tmp1, 0x03);
+
+	// create mask for 32 entries
+	mask = _mm256_cmpeq_epi16(tmp0, tmp0);	// 0xFFFF *16
+	mask = _mm256_srli_epi16(mask, 12);		// 0x000F *16
+	mask = _mm256_packus_epi16(mask, mask);	// 0x0F *32
+
+	while (bsize != 0){
+		src0 = _mm256_load_si256((__m256i *)input1);	// read source 32-bytes
+		src1 = _mm256_srli_epi16(src0, 4);	// prepare next 4-bit
+		src0 = _mm256_and_si256(src0, mask);	// src & 0x0F
+		src1 = _mm256_and_si256(src1, mask);	// (src >> 4) & 0x0F
+
+		tmp0 = _mm256_shuffle_epi8(tbl0, src0);	// table look-up
+		tmp1 = _mm256_shuffle_epi8(tbl1, src1);
+		tmp2 = _mm256_shuffle_epi8(tbl2, src0);
+		tmp3 = _mm256_shuffle_epi8(tbl3, src1);
+		tmp0 = _mm256_xor_si256(tmp0, tmp1);	// combine result
+		tmp2 = _mm256_xor_si256(tmp2, tmp3);
+
+			src0 = _mm256_load_si256((__m256i *)input2);	// read source 32-bytes
+			src1 = _mm256_srli_epi16(src0, 4);	// prepare next 4-bit
+			src0 = _mm256_and_si256(src0, mask);	// src & 0x0F
+			src1 = _mm256_and_si256(src1, mask);	// (src >> 4) & 0x0F
+
+			tmp1 = _mm256_shuffle_epi8(tbl4, src0);	// table look-up
+			tmp3 = _mm256_shuffle_epi8(tbl6, src0);
+			tmp0 = _mm256_xor_si256(tmp0, tmp1);	// combine result
+			tmp2 = _mm256_xor_si256(tmp2, tmp3);
+
+			tmp1 = _mm256_shuffle_epi8(tbl5, src1);	// table look-up
+			tmp3 = _mm256_shuffle_epi8(tbl7, src1);
+			tmp0 = _mm256_xor_si256(tmp0, tmp1);	// combine result
+			tmp2 = _mm256_xor_si256(tmp2, tmp3);
+
+		src0 = _mm256_load_si256((__m256i *)output);	// read dest 32-bytes
+		tmp2 = _mm256_permute2x128_si256(tmp2, tmp2, 0x01);	// exchange low & high 128-bit
+		src0 = _mm256_xor_si256(src0, tmp0);
+		src0 = _mm256_xor_si256(src0, tmp2);
+		_mm256_store_si256((__m256i *)output, src0);	// write dest 32-bytes
+
+		input1 += 32;
+		input2 += 32;
+		output += 32;
+		bsize -= 32;
+	}
+
+	// AVX-SSE 切り替えの回避
+	_mm256_zeroupper();
+}
+
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
 // データを並び替えることで、メモリーアクセスを高速化する
@@ -1953,9 +2444,57 @@ void galois_region_multiply(
 		return;
 	}
 
-	if (count >= 32){	// 64バイト以上なら掛け算用のテーブルを使った方が速い
+	if (count >= 64){	// 64バイト以上なら掛け算用のテーブルを使った方が速い
 #ifndef NO_SIMD
-		if (cpu_flag & 1){	// SSSE3 対応なら
+		if (cpu_flag & 16){	// AVX2 対応なら
+			__declspec( align(32) ) unsigned char small_table[128];
+			int s, d;
+
+			create_eight_table_avx2(small_table, factor);
+
+			// アドレスが 32の倍数で無い場合は 32バイト単位で計算する効率が落ちる
+			while ((ULONG_PTR)r2 & 0x1E){
+				// そこで最初の 1～15個(2～30バイト)だけ普通に計算する
+				s = r1[0];
+				d = r2[0];
+				d ^= small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8);
+				s = s >> 4;
+				d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8);
+				s = s >> 4;
+				d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8);
+				s = s >> 4;
+				d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8);
+				r2[0] = (unsigned short)d;
+				r1++;
+				r2++;
+				count--;
+			}
+
+			// 16個ずつ計算するので 16の倍数にする
+			gf16_avx2_block32u((unsigned char *)r1, (unsigned char *)r2,
+					(count & 0xFFFFFFF0) << 1, small_table);
+			r1 += count & 0xFFFFFFF0;
+			r2 += count & 0xFFFFFFF0;
+			count &= 15;
+
+			// 残りは 1個ずつ計算する
+			while (count != 0){
+				s = r1[0];
+				d = r2[0];
+				d ^= small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8);
+				s = s >> 4;
+				d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8);
+				s = s >> 4;
+				d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8);
+				s = s >> 4;
+				d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8);
+				r2[0] = (unsigned short)d;
+				r1++;
+				r2++;
+				count--;
+			}
+
+		} else if (cpu_flag & 1){	// SSSE3 対応なら
 			__declspec( align(16) ) unsigned char small_table[128];
 			int s, d;
 
@@ -2093,30 +2632,125 @@ void galois_region_divide(
 {
 	factor = galois_reciprocal(factor);	// factor = 1 / factor
 
-	if (count >= 32){
-		unsigned int mtab[256 * 2];
+	if (count >= 64){
+// 行列サイズが小さいのでテーブル作成に時間がかかって、全く速くならない・・・
+/*
+#ifndef NO_SIMD
+		if (cpu_flag & 16){	// AVX2 対応なら
+			__declspec( align(32) ) unsigned char small_table[128];
+			int s, d;
 
-		create_two_table(mtab, factor);	// 掛け算用のテーブルをその場で構成する
+			create_eight_table_avx2(small_table, factor);
 
-		// アドレスが 4の倍数で無い場合は 4バイト単位で計算する効率が落ちる
-		if (((ULONG_PTR)r1 & 2) != 0){
-			// そこで最初の 1個(2バイト)だけ普通に計算する
-			r1[0] = (unsigned short)(mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]]);
-			r1++;
-			count--;
+			// アドレスが 32の倍数で無い場合は 32バイト単位で計算する効率が落ちる
+			while ((ULONG_PTR)r1 & 0x1E){
+				// そこで最初の 1～15個(2～30バイト)だけ普通に計算する
+				s = r1[0];
+				d = small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8);
+				s = s >> 4;
+				d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8);
+				s = s >> 4;
+				d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8);
+				s = s >> 4;
+				d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8);
+				r1[0] = (unsigned short)d;
+				r1++;
+				count--;
+			}
+
+			// 16個ずつ計算するので 16の倍数にする
+			gf16_avx2_block32s((unsigned char *)r1, (count & 0xFFFFFFF0) << 1, small_table);
+			r1 += count & 0xFFFFFFF0;
+			count &= 15;
+
+			// 残りは 1個ずつ計算する
+			while (count != 0){
+				s = r1[0];
+				d = small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8);
+				s = s >> 4;
+				d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8);
+				s = s >> 4;
+				d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8);
+				s = s >> 4;
+				d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8);
+				r1[0] = (unsigned short)d;
+				r1++;
+				count--;
+			}
+
+		} else if (cpu_flag & 1){	// SSSE3 対応なら
+			__declspec( align(16) ) unsigned char small_table[128];
+			int s, d;
+
+			create_eight_table(small_table, factor);
+
+			// アドレスが 16の倍数で無い場合は 16バイト単位で計算する効率が落ちる
+			while ((ULONG_PTR)r1 & 0xE){
+				// そこで最初の 1～7個(2～14バイト)だけ普通に計算する
+				s = r1[0];
+				d = small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8);
+				s = s >> 4;
+				d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8);
+				s = s >> 4;
+				d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8);
+				s = s >> 4;
+				d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8);
+				r1[0] = (unsigned short)d;
+				r1++;
+				count--;
+			}
+
+			// 8個ずつ計算するので 8の倍数にする
+			gf16_ssse3_block16s((unsigned char *)r1, (count & 0xFFFFFFF8) << 1, small_table);
+			r1 += count & 0xFFFFFFF8;
+			count &= 7;
+
+			// 残りは 1個ずつ計算する
+			while (count != 0){
+				s = r1[0];
+				d = small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8);
+				s = s >> 4;
+				d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8);
+				s = s >> 4;
+				d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8);
+				s = s >> 4;
+				d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8);
+				r1[0] = (unsigned short)d;
+				r1++;
+				count--;
+			}
+
+		} else {	// Combined Multi Table support (2 tables of 256-entries)
+#endif
+*/
+			unsigned int mtab[256 * 2];
+
+			create_two_table(mtab, factor);	// 掛け算用のテーブルをその場で構成する
+
+			// アドレスが 4の倍数で無い場合は 4バイト単位で計算する効率が落ちる
+			if (((ULONG_PTR)r1 & 2) != 0){
+				// そこで最初の 1個(2バイト)だけ普通に計算する
+				r1[0] = (unsigned short)(mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]]);
+				r1++;
+				count--;
+			}
+
+			// バッファーを 32-bit整数として扱う
+			while (count >= 2){	// 2個(4バイト)ずつ計算する
+				// 先に計算しておいた 2個の参照テーブルを使う
+				((unsigned int *)r1)[0] = mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]] ^
+										((mtab[((unsigned char *)r1)[2]] ^ mtab[256 + ((unsigned char *)r1)[3]]) << 16);
+				r1 += 2;
+				count -= 2;
+			}
+			// 奇数なら最後に 1個余る
+			if (count == 1)
+				r1[0] = (unsigned short)(mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]]);
+/*
+#ifndef NO_SIMD
 		}
-
-		// バッファーを 32-bit整数として扱う
-		while (count >= 2){	// 2個(4バイト)ずつ計算する
-			// 先に計算しておいた 2個の参照テーブルを使う
-			((unsigned int *)r1)[0] = mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]] ^
-									((mtab[((unsigned char *)r1)[2]] ^ mtab[256 + ((unsigned char *)r1)[3]]) << 16);
-			r1 += 2;
-			count -= 2;
-		}
-		// 奇数なら最後に 1個余る
-		if (count == 1)
-			r1[0] = (unsigned short)(mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]]);
+#endif
+*/
 
 	} else {	// 小さいデータは普通に計算する
 		int log_y = galois_log_table[factor];
@@ -2271,6 +2905,42 @@ void galois_align32_multiply(
 	}
 }
 
+// 掛け算を２回行って、一度に更新する (SSSE3 & ALTMAP)
+void galois_align32_multiply2(
+	unsigned char *src1,	// Region to multiply (must be aligned by 16)
+	unsigned char *src2,
+	unsigned char *dst,		// Products go here
+	unsigned int len,		// Byte length (must be multiple of 32)
+	int factor1,			// Number to multiply by
+	int factor2)
+{
+	if ((factor1 == 1) && (factor2 == 1)){	// 両方の factor が 1の場合
+		__m128i xmm0, xmm1, xmm2;
+
+		while (len != 0){
+			xmm0 = _mm_load_si128((__m128i *)dst);
+			xmm1 = _mm_load_si128((__m128i *)src1);
+			xmm2 = _mm_load_si128((__m128i *)src2);
+			xmm0 = _mm_xor_si128(xmm0, xmm1);
+			xmm0 = _mm_xor_si128(xmm0, xmm2);
+			_mm_store_si128((__m128i *)dst, xmm0);
+			src1 += 16;
+			src2 += 16;
+			dst += 16;
+			len -= 16;
+		}
+
+	// 掛け算用のテーブルを常に作成する (32バイトだと少し遅くなる)
+	} else {
+		__declspec( align(16) ) unsigned char small_table[256];
+
+		create_eight_table(small_table, factor1);
+		create_eight_table(small_table + 128, factor2);
+
+		gf16_ssse3_block32_altmap2(src1, src2, dst, len, small_table);
+	}
+}
+
 // 256バイトごとに並び替えられたバッファー専用の JIT(SSE2) を使った掛け算
 void galois_align256_multiply(
 	unsigned char *r1,	// Region to multiply (must be aligned by 16)
@@ -2321,32 +2991,54 @@ void galois_align32avx_multiply(
 			}
 
 			_mm256_zeroupper();	// AVX-SSE 切り替えの回避
-
-/*
-			__m128i xmm0, xmm1;	// 16バイトごとに XOR する
-
-			while (len != 0){
-				xmm0 = _mm_load_si128((__m128i *)r1);
-				xmm1 = _mm_load_si128((__m128i *)r2);
-				xmm1 = _mm_xor_si128(xmm1, xmm0);
-				_mm_store_si128((__m128i *)r2, xmm1);
-				r1 += 16;
-				r2 += 16;
-				len -= 16;
-			}
-*/
 		}
 
 	// 掛け算用のテーブルを常に作成する (32バイトだと少し遅くなる)
 	} else {
 		__declspec( align(32) ) unsigned char small_table[128];
 
-		create_eight_table(small_table, factor);
+		create_eight_table_avx2(small_table, factor);
 
 		gf16_avx2_block32(r1, r2, len, small_table);
 	}
 }
 
+// 掛け算を２回行って、一度に更新する (AVX2 & ALTMAP)
+void galois_align32avx_multiply2(
+	unsigned char *src1,	// Region to multiply (must be aligned by 32)
+	unsigned char *src2,
+	unsigned char *dst,		// Products go here
+	unsigned int len,		// Byte length (must be multiple of 32)
+	int factor1,			// Number to multiply by
+	int factor2)
+{
+	if ((factor1 == 1) && (factor2 == 1)){	// 両方の factor が 1の場合
+		__m256i ymm0, ymm1, ymm2;
+		while (len != 0){
+			ymm0 = _mm256_load_si256((__m256i *)dst);
+			ymm1 = _mm256_load_si256((__m256i *)src1);
+			ymm2 = _mm256_load_si256((__m256i *)src2);
+			ymm0 = _mm256_xor_si256(ymm0, ymm1);
+			ymm0 = _mm256_xor_si256(ymm0, ymm2);
+			_mm256_store_si256((__m256i *)dst, ymm0);
+			src1 += 32;
+			src2 += 32;
+			dst += 32;
+			len -= 32;
+		}
+		_mm256_zeroupper();	// AVX-SSE 切り替えの回避
+
+	// 掛け算用のテーブルを常に作成する (32バイトだと少し遅くなる)
+	} else {
+		__declspec( align(32) ) unsigned char small_table[256];
+
+		create_eight_table_avx2(small_table, factor1);
+		create_eight_table_avx2(small_table + 128, factor2);
+
+		gf16_avx2_block32_2(src1, src2, dst, len, small_table);
+	}
+}
+
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 // チェックサムを計算する
 
diff --git a/source/par2j/gf16.h b/source/par2j/gf16.h
index acd4231..52e95b8 100644
--- a/source/par2j/gf16.h
+++ b/source/par2j/gf16.h
@@ -47,6 +47,15 @@ typedef void (* REGION_MULTIPLY) (
 	int factor);		// Number to multiply by
 REGION_MULTIPLY galois_align_multiply;
 
+typedef void (* REGION_MULTIPLY2) (
+	unsigned char *src1,	// Region to multiply
+	unsigned char *src2,
+	unsigned char *dst,		// Products go here
+	unsigned int len,		// Byte length
+	int factor1,			// Number to multiply by
+	int factor2);
+REGION_MULTIPLY2 galois_align_multiply2;
+
 // 領域並び替え用の関数定義
 typedef void (* REGION_ALTMAP) (unsigned char *data, unsigned int bsize);
 REGION_ALTMAP galois_altmap_change;
diff --git a/source/par2j/lib_opencl.c b/source/par2j/lib_opencl.c
index 804ddd0..419f4fe 100644
--- a/source/par2j/lib_opencl.c
+++ b/source/par2j/lib_opencl.c
@@ -1,5 +1,5 @@
 ﻿// lib_opencl.c
-// Copyright : 2023-06-01 Yutaka Sawada
+// Copyright : 2023-09-23 Yutaka Sawada
 // License : GPL
 
 #ifndef _WIN32_WINNT
@@ -72,11 +72,10 @@ typedef cl_int (CL_API_CALL *API_clEnqueueNDRangeKernel)(cl_command_queue, cl_ke
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 // グローバル変数
 
-extern unsigned int cpu_flag, cpu_cache;	// declared in common2.h
+extern unsigned int cpu_flag;	// declared in common2.h
 extern int cpu_num;
 
 #define MAX_DEVICE 3
-#define MAX_GROUP_NUM 64
 
 HMODULE hLibOpenCL = NULL;
 
@@ -103,18 +102,17 @@ API_clEnqueueNDRangeKernel gfn_clEnqueueNDRangeKernel;
 入力
 OpenCL_method : どのデバイスを選ぶか
 unit_size : ブロックの単位サイズ
+chunk_size: 分割された断片サイズ
 src_max : ソース・ブロック個数
-chunk_size = 0: 標準では分割しない
 
 出力
 return : エラー番号
 src_max : 最大で何ブロックまでソースを読み込めるか
-chunk_size : CPUスレッドの分割サイズ
 OpenCL_method : 動作フラグいろいろ
 */
 
 // 0=成功, 1～エラー番号
-int init_OpenCL(int unit_size, int *src_max, int *chunk_size)
+int init_OpenCL(int unit_size, int chunk_size, int *src_max)
 {
 	char buf[2048], *p_source;
 	int err = 0, i, j;
@@ -141,7 +139,7 @@ int init_OpenCL(int unit_size, int *src_max, int *chunk_size)
 	API_clGetKernelWorkGroupInfo fn_clGetKernelWorkGroupInfo;
 	cl_int ret;
 	cl_uint num_platforms = 0, num_devices = 0, num_groups, param_value;
-	cl_ulong param_value8, cache_size;
+	cl_ulong param_value8;
 	cl_platform_id platform_id[MAX_DEVICE], selected_platform;	// Intel, AMD, Nvidia などドライバーの提供元
 	cl_device_id device_id[MAX_DEVICE], selected_device;	// CPU や GPU など
 	cl_program program;
@@ -309,19 +307,14 @@ int init_OpenCL(int unit_size, int *src_max, int *chunk_size)
 			ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &data_size, NULL);
 			if (ret != CL_SUCCESS)
 				continue;
-			ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_uint), &param_value, NULL);
-			if (ret != CL_SUCCESS)
-				continue;
-			if (param_value != 0)
-				param_value = 1;
+			// CL_DEVICE_HOST_UNIFIED_MEMORY は OpenCL 2.0 以降で非推奨になったので、参照しない
 
 #ifdef DEBUG_OUTPUT
 			printf("MAX_COMPUTE_UNITS = %d\n", num_groups);
 			printf("MAX_WORK_GROUP_SIZE = %zd\n", data_size);
-			printf("HOST_UNIFIED_MEMORY = %d\n", param_value);
 #endif
-			// MAX_COMPUTE_UNITS * MAX_WORK_GROUP_SIZE で計算力を測る、外付けGPUなら値を倍にする
-			count = (2 - param_value) * (int)data_size * num_groups;
+			// MAX_COMPUTE_UNITS * MAX_WORK_GROUP_SIZE で計算力を測る
+			count = (int)data_size * num_groups;
 			count *= OpenCL_method;	// 符号を変える
 			//printf("prev = %d, now = %d\n", gpu_power, count);
 			if ((count > gpu_power) && (data_size >= 256) &&	// 256以上ないとテーブルを作れない
@@ -330,8 +323,6 @@ int init_OpenCL(int unit_size, int *src_max, int *chunk_size)
 				selected_device = device_id[j];	// 使うデバイスの ID
 				selected_platform = platform_id[i];
 				OpenCL_group_num = num_groups;	// ワークグループ数は COMPUTE_UNITS 数にする
-				if (OpenCL_group_num > MAX_GROUP_NUM)	// 制限を付けてローカルメモリーの消費を抑える
-					OpenCL_group_num = MAX_GROUP_NUM;
 				alloc_max = (size_t)param_value8;
 
 				// AMD Radeon ではメモリー領域が全体の 1/4 とは限らない
@@ -345,26 +336,6 @@ int init_OpenCL(int unit_size, int *src_max, int *chunk_size)
 					if ((cl_ulong)alloc_max > param_value8)
 						alloc_max = (size_t)param_value8;
 				}
-
-				cache_size = 0;
-				ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(cl_uint), &num_groups, NULL);
-				if (ret == CL_SUCCESS){
-#ifdef DEBUG_OUTPUT
-					printf("GLOBAL_MEM_CACHE_TYPE = %d\n", num_groups);
-#endif
-					if (num_groups & 3){	// CL_READ_ONLY_CACHE or CL_READ_WRITE_CACHE
-						ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(cl_ulong), &cache_size, NULL);
-						if (ret == CL_SUCCESS){
-#ifdef DEBUG_OUTPUT
-							printf("GLOBAL_MEM_CACHE_SIZE = %I64d KB\n", cache_size >> 10);
-#endif
-							if (param_value != 0){	// 内蔵 GPU なら CPU との共有キャッシュを活用する
-								if (cache_size >= 1048576)	// サイズが小さい場合は分割しない
-									cache_size |= 0x40000000;
-							}
-						}
-					}
-				}
 			}
 		}
 	}
@@ -395,67 +366,28 @@ int init_OpenCL(int unit_size, int *src_max, int *chunk_size)
 		return (ret << 8) | 12;
 
 	// 計算方式を選択する
-	gpu_power = unit_size;	// unit_size は MEM_UNIT の倍数になってる
 	if ((((cpu_flag & 0x101) == 1) || ((cpu_flag & 16) != 0)) && (sse_unit == 32)){
 		OpenCL_method = 2;	// SSSE3 & ALTMAP または AVX2 ならデータの並び替え対応版を使う
-		if (cache_size & 0x40000000){	// 内蔵 GPU でキャッシュを利用できるなら、CPUスレッドと同じにする
-			j = cpu_cache & 0x7FFF8000;	// CPUのキャッシュ上限サイズ
-			count = (int)(cache_size & 0x3FFFFFFF) / 4;	// ただし、認識できるサイズの 1/4 までにする
-			if ((j == 0) || (j > count))
-				j = count;
-			count = 1;
-			while (gpu_power > j){	// 制限サイズより大きいなら
-				// 分割数を増やして chunk のサイズを試算してみる
-				count++;
-				gpu_power = (unit_size + count - 1) / count;
-				gpu_power = (gpu_power + (MEM_UNIT - 1)) & ~(MEM_UNIT - 1);	// MEM_UNITの倍数にする
-			}
-			if (count > 1){
-				*chunk_size = gpu_power;
-				OpenCL_method = 3;
-#ifdef DEBUG_OUTPUT
-				printf("gpu cache: limit size = %d, chunk size = %d, split = %d\n", j, gpu_power, count);
-#endif
-			}
-/*
-		// 32バイト単位のメモリーアクセスならキャッシュする必要なし？計算速度が半減する・・・
-		} else if ((cache_size & 0x3FFFFFFF) > OpenCL_group_num * 4096){	// 2KB の倍はいるかも？
-#ifdef DEBUG_OUTPUT
-			printf("gpu: cache size = %d, read size = %d\n", cache_size & 0x3FFFFFFF, OpenCL_group_num * 2048);
-#endif
-			OpenCL_method = 1;
-*/
-		}
-
 	} else if (((cpu_flag & 128) != 0) && (sse_unit == 256)){
 		OpenCL_method = 4;	// JIT(SSE2) は bit ごとに上位から 16バイトずつ並ぶ
 		// ローカルのテーブルサイズが異なることに注意
 		// XOR 方式以外は 2KB (4バイト * 256項目 * 2個) 使う
 		// XOR (JIT) は 64バイト (4バイト * 16項目) 使う
-#ifdef DEBUG_OUTPUT
-//		printf("4 KB cache (16-bytes * 256 work items), use if\n");
-#endif
 	} else {
-		OpenCL_method = 1;	// MMX用のコードは遅いので、キャッシュ最適化する必要が無い
+		OpenCL_method = 1;	// 並び替えられてないデータ用
 	}
 
 	// work group 数が必要以上に多い場合は減らす
-/*
-	if (OpenCL_method == 4){
-		// work item 一個が 16バイトずつ計算する、256個なら work group ごとに 4KB 担当する
-		data_size = unit_size / 4096;
-	} else 
-*/
-	if (OpenCL_method & 2){
+	if (OpenCL_method == 2){
 		// work item 一個が 8バイトずつ計算する、256個なら work group ごとに 2KB 担当する
-		data_size = unit_size / 2048;
+		data_size = chunk_size / 2048;
 	} else {
 		// work item 一個が 4バイトずつ計算する、256個なら work group ごとに 1KB 担当する
-		data_size = unit_size / 1024;
+		data_size = chunk_size / 1024;
 	}
 	if (OpenCL_group_num > data_size){
 		OpenCL_group_num = data_size;
-		printf("Number of work groups is reduced to %d\n", (int)OpenCL_group_num);
+		printf("Number of work groups is reduced to %zd\n", OpenCL_group_num);
 	}
 
 	// 最大で何ブロック分のメモリー領域を保持できるのか（ここではまだ確保しない）
@@ -469,9 +401,9 @@ int init_OpenCL(int unit_size, int *src_max, int *chunk_size)
 	printf("src buf : %zd KB (%d blocks), possible\n", data_size >> 10, count);
 #endif
 
-	// 出力先は1ブロック分だけあればいい
+	// 出力先はchunk 1個分だけあればいい
 	// CL_MEM_ALLOC_HOST_PTRを使えばpinned memoryになるらしい
-	data_size = unit_size;
+	data_size = (chunk_size + 63) & ~63;	//  cache line sizes (64 bytes) の倍数にする
 	OpenCL_dst = gfn_clCreateBuffer(OpenCL_context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, data_size, NULL, &ret);
 	if (ret != CL_SUCCESS)
 		return (ret << 8) | 13;
@@ -574,7 +506,7 @@ int init_OpenCL(int unit_size, int *src_max, int *chunk_size)
 	FreeResource(glob);	// not required ?
 
 	// 定数を指定する
-	wsprintfA(buf, "-D BLK_SIZE=%d -D CHK_SIZE=%d", unit_size / 4, gpu_power / 4);
+	wsprintfA(buf, "-cl-fast-relaxed-math -D BLK_SIZE=%d", unit_size / 4);
 
 	// 使用する OpenCL デバイス用にコンパイルする
 	ret = fn_clBuildProgram(program, 1, &selected_device, buf, NULL, NULL);
@@ -768,11 +700,12 @@ int gpu_copy_blocks(
 }
 
 // ソース・ブロックを掛け算する
-int gpu_multiply_blocks(
+int gpu_multiply_chunks(
 	int src_num,			// Number of multiplying source blocks
 	unsigned short *mat,	// Matrix of numbers to multiply by
 	unsigned char *buf,		// Products go here
-	int len)				// Byte length
+	int offset,				// Offset in each block
+	int length)				// Byte length
 {
 	unsigned __int64 *vram, *src, *dst;
 	size_t global_size, local_size;
@@ -787,6 +720,14 @@ int gpu_multiply_blocks(
 	ret = gfn_clSetKernelArg(OpenCL_kernel, 3, sizeof(int), &src_num);
 	if (ret != CL_SUCCESS)
 		return (ret << 8) | 103;
+	offset /= 4;	// 4バイト整数単位にする
+	ret = gfn_clSetKernelArg(OpenCL_kernel, 4, sizeof(int), &offset);
+	if (ret != CL_SUCCESS)
+		return (ret << 8) | 104;
+	length /= 4;	// 4バイト整数単位にする
+	ret = gfn_clSetKernelArg(OpenCL_kernel, 5, sizeof(int), &length);
+	if (ret != CL_SUCCESS)
+		return (ret << 8) | 105;
 
 	// カーネル並列実行
 	local_size = 256;	// テーブルやキャッシュのため、work item 数は 256に固定する
@@ -797,18 +738,18 @@ int gpu_multiply_blocks(
 		return (ret << 8) | 11;
 
 	// 出力内容をホスト側に反映させる
-	vram = gfn_clEnqueueMapBuffer(OpenCL_command, OpenCL_dst, CL_TRUE, CL_MAP_READ, 0, len, 0, NULL, NULL, &ret);
+	vram = gfn_clEnqueueMapBuffer(OpenCL_command, OpenCL_dst, CL_TRUE, CL_MAP_READ, 0, length * 4, 0, NULL, NULL, &ret);
 	if (ret != CL_SUCCESS)
 		return (ret << 8) | 12;
 
 	// 8バイトごとに XOR する (SSE2 で XOR しても速くならず)
 	src = vram;
 	dst = (unsigned __int64 *)buf;
-	while (len > 0){
+	while (length > 0){
 		*dst ^= *src;
 		dst++;
 		src++;
-		len -= 8;
+		length -= 2;
 	}
 
 	// ホスト側でデータを変更しなくても、clEnqueueMapBufferと対で呼び出さないといけない
diff --git a/source/par2j/lib_opencl.h b/source/par2j/lib_opencl.h
index dd121c7..8748245 100644
--- a/source/par2j/lib_opencl.h
+++ b/source/par2j/lib_opencl.h
@@ -10,20 +10,21 @@ extern "C" {
 
 extern int OpenCL_method;
 
-int init_OpenCL(int unit_size, int *src_max, int *chunk_size);
+int init_OpenCL(int unit_size, int chunk_size, int *src_max);
 int free_OpenCL(void);
 void info_OpenCL(char *buf, int buf_size);
 
 int gpu_copy_blocks(
 	unsigned char *data,
 	int unit_size,
-	int src_end);
+	int src_num);
 
-int gpu_multiply_blocks(
+int gpu_multiply_chunks(
 	int src_num,			// Number of multiplying source blocks
 	unsigned short *mat,	// Matrix of numbers to multiply by
 	unsigned char *buf,		// Products go here
-	int len);				// Byte length
+	int offset,				// Offset in each block
+	int length);			// Byte length
 
 int gpu_finish(void);
 
diff --git a/source/par2j/md5_crc.c b/source/par2j/md5_crc.c
index 78d3f8e..0ebf368 100644
--- a/source/par2j/md5_crc.c
+++ b/source/par2j/md5_crc.c
@@ -1,5 +1,5 @@
 ﻿// md5_crc.c
-// Copyright : 2022-10-01 Yutaka Sawada
+// Copyright : 2023-08-28 Yutaka Sawada
 // License : GPL
 
 #ifndef _UNICODE
@@ -672,7 +672,7 @@ time1_start = GetTickCount();
 
 	// バッファー・サイズが大きいのでヒープ領域を使う
 	for (io_size = IO_SIZE; io_size < 1048576; io_size += IO_SIZE){	// 1 MB までにする
-		if ((io_size + IO_SIZE > (cpu_cache << 17)) || ((__int64)(io_size + IO_SIZE) * 4 > file_left))
+		if ((io_size + IO_SIZE > (cpu_cache & 0xFFFE0000)) || ((__int64)(io_size + IO_SIZE) * 4 > file_left))
 			break;
 	}
 	buf1 = _aligned_malloc(io_size * 2, 64);
@@ -867,7 +867,7 @@ DWORD WINAPI file_hash_crc2(LPVOID lpParameter)
 	// バッファー・サイズが大きいのでヒープ領域を使う
 	prog_tick = 1;
 	for (io_size = IO_SIZE; io_size < 1048576; io_size += IO_SIZE){	// IO_SIZE の倍数で 1 MB までにする
-		if ((io_size + IO_SIZE > (cpu_cache << 17)) || ((__int64)(io_size + IO_SIZE) * 4 > file_left))
+		if ((io_size + IO_SIZE > (cpu_cache & 0xFFFE0000)) || ((__int64)(io_size + IO_SIZE) * 4 > file_left))
 			break;
 		prog_tick++;
 	}
@@ -1304,7 +1304,7 @@ DWORD WINAPI file_hash_background(LPVOID lpParameter)
 
 	// バッファー・サイズが大きいのでヒープ領域を使う
 	for (io_size = IO_SIZE; io_size < 1048576; io_size += IO_SIZE){	// IO_SIZE の倍数で 1 MB までにする
-		if ((io_size + IO_SIZE > (cpu_cache << 17)) || ((__int64)(io_size + IO_SIZE) * 4 > file_size))
+		if ((io_size + IO_SIZE > (cpu_cache & 0xFFFE0000)) || ((__int64)(io_size + IO_SIZE) * 4 > file_size))
 			break;
 	}
 	//printf("\n io_size = %d\n", io_size);
diff --git a/source/par2j/par2.c b/source/par2j/par2.c
index 3e084ad..6dc44cb 100644
--- a/source/par2j/par2.c
+++ b/source/par2j/par2.c
@@ -1,5 +1,5 @@
 ﻿// par2.c
-// Copyright : 2023-03-15 Yutaka Sawada
+// Copyright : 2023-09-21 Yutaka Sawada
 // License : GPL
 
 #ifndef _UNICODE
@@ -112,12 +112,12 @@ int par2_create(
 		err = -12;
 	} else {
 		// メモリーを確保できるか試す
-		err = read_block_num(parity_num, cpu_num - 1, 0, 256);
+		err = read_block_num(parity_num, 0, 256);
 		if (err == 0)
 			err = -13;
 	}
 #ifdef TIMER
-	printf("read_block_num = %d\n", read_block_num(parity_num, cpu_num - 1, 0, 256));
+	printf("read_block_num = %d\n", read_block_num(parity_num, 0, 256));
 #endif
 	if (err > 0){	// 1-pass方式が可能
 #ifdef TIMER
diff --git a/source/par2j/par2_cmd.c b/source/par2j/par2_cmd.c
index c3149de..553c4a5 100644
--- a/source/par2j/par2_cmd.c
+++ b/source/par2j/par2_cmd.c
@@ -1,5 +1,5 @@
 ﻿// par2_cmd.c
-// Copyright : 2023-03-18 Yutaka Sawada
+// Copyright : 2023-09-18 Yutaka Sawada
 // License : GPL
 
 #ifndef _UNICODE
@@ -86,7 +86,7 @@ static void print_environment(void)
 
 	printf("CPU thread\t: %d / %d\n", cpu_num & 0xFFFF, cpu_num >> 24);
 	cpu_num &= 0xFFFF;	// 利用するコア数だけにしておく
-	printf("CPU cache limit : %d KB, %d KB\n", (cpu_cache & 0x7FFF8000) >> 10, (cpu_cache & 0x00007FFF) << 7);
+	printf("CPU cache limit : %d KB, %d KB\n", (cpu_flag & 0xFFFF0000) >> 10, (cpu_cache & 0xFFFE0000) >> 10);
 #ifndef _WIN64	// 32-bit 版は MMX, SSE2, SSSE3 のどれかを表示する
 	printf("CPU extra\t:");
 	if (cpu_flag & 1){
@@ -1481,39 +1481,39 @@ ri= switch_set & 0x00040000
 					k = (k * 10) + (tmp_p[j] - '0');
 					j++;
 				}
-				if (k & 32){	// GPU を使う
+				if (k & 256){	// GPU を使う
 					OpenCL_method = 1;	// Faster GPU
-				} else if (k & 64){
+				} else if (k & 512){
 					OpenCL_method = -1;	// Slower GPU
 				}
-				if (k & 16)	// SSSE3 を使わない
-					cpu_flag &= 0xFFFFFFFE;
-				if (k & 128)	// CLMUL を使わない、SSSE3 の古いエンコーダーを使う
+				if (k & 1024)	// CLMUL を使わない、SSSE3 の古いエンコーダーを使う
 					cpu_flag = (cpu_flag & 0xFFFFFFF7) | 0x100;
-				if (k & 256)	// JIT(SSE2) を使わない
+				if (k & 2048)	// JIT(SSE2) を使わない
 					cpu_flag &= 0xFFFFFF7F;
-				if (k & 512)	// AVX2 を使わない
+				if (k & 4096)	// SSSE3 を使わない
+					cpu_flag &= 0xFFFFFFFE;
+				if (k & 8192)	// AVX2 を使わない
 					cpu_flag &= 0xFFFFFFEF;
-				if (k & 15){	// 使用するコア数を変更する
-					k &= 15;	// 1～15 の範囲
+				if (k & 255){	// 使用するコア数を変更する
+					k &= 255;	// 1～255 の範囲
 					// printf("\n lc# = %d , logical = %d, physical = %d \n", k, cpu_num >> 24, (cpu_num & 0x00FF0000) >> 16);
-					if (k == 12){	// 物理コア数の 1/4 にする
+					if (k == 251){	// 物理コア数の 1/4 にする
 						k = ((cpu_num & 0x00FF0000) >> 16) / 4;
-					} else if (k == 13){	// 物理コア数の半分にする
+					} else if (k == 252){	// 物理コア数の半分にする
 						k = ((cpu_num & 0x00FF0000) >> 16) / 2;
-					} else if (k == 14){	// 物理コア数の 3/4 にする
+					} else if (k == 253){	// 物理コア数の 3/4 にする
 						k = (((cpu_num & 0x00FF0000) >> 16) * 3) / 4;
-					} else if (k == 15){	// 物理コア数にする
-						k = (cpu_num & 0x00FF0000) >> 16;
-						if (k >= 6)
-							k--;	// 物理コア数が 6以上なら、1個減らす
-					} else if (k > (cpu_num >> 24)){
-						k = cpu_num >> 24;	// 論理コア数を超えないようにする
+					} else if (k == 254){	// 物理コア数より減らす
+						k = ((cpu_num & 0x00FF0000) >> 16) - 1;
+					} else if (k == 255){	// 物理コア数より増やす
+						k = ((cpu_num & 0x00FF0000) >> 16) + 1;
 					}
 					if (k > MAX_CPU){
 						k = MAX_CPU;
 					} else if (k < 1){
 						k = 1;
+					} else if (k > (cpu_num >> 24)){
+						k = cpu_num >> 24;	// 論理コア数を超えないようにする
 					}
 					cpu_num = (cpu_num & 0xFFFF0000) | k;	// 指定されたコア数を下位に配置する
 				}
diff --git a/source/par2j/reedsolomon.c b/source/par2j/reedsolomon.c
index 003ff06..45282d1 100644
--- a/source/par2j/reedsolomon.c
+++ b/source/par2j/reedsolomon.c
@@ -1,5 +1,5 @@
 ﻿// reedsolomon.c
-// Copyright : 2023-05-29 Yutaka Sawada
+// Copyright : 2023-09-23 Yutaka Sawada
 // License : GPL
 
 #ifndef _UNICODE
@@ -38,6 +38,13 @@
 #define GPU_SOURCE_COUNT_LIMIT 256
 #define GPU_PARITY_COUNT_LIMIT 32
 
+/*
+#define GPU_DATA_LIMIT 1
+#define GPU_BLOCK_SIZE_LIMIT 32768
+#define GPU_SOURCE_COUNT_LIMIT 16
+#define GPU_PARITY_COUNT_LIMIT 4
+*/
+
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
 // chunk がキャッシュに収まるようにすれば速くなる！ (Cache Blocking という最適化手法)
@@ -46,7 +53,7 @@ int try_cache_blocking(int unit_size)
 	int limit_size, chunk_count, chunk_size, cache_line_diff;
 
 	// CPUキャッシュをどのくらいまで使うか
-	limit_size = cpu_flag & 0x7FFF8000;	// 最低でも 32KB になる
+	limit_size = cpu_flag & 0x7FFF0000;	// 最低でも 64KB になる
 	if (limit_size == 0)	// キャッシュ・サイズを取得できなかった場合は最適化しない
 		return unit_size;
 
@@ -160,7 +167,6 @@ unsigned int get_io_size(
 // 何ブロックまとめてファイルから読み込むかを空きメモリー量から計算する
 int read_block_num(
 	int keep_num,			// 保持するパリティ・ブロック数
-	int add_num,			// 余裕を見るブロック数
 	size_t trial_alloc,		// 確保できるか確認するのか
 	int alloc_unit)			// メモリー単位の境界 (sse_unit か MEM_UNIT)
 {
@@ -177,7 +183,7 @@ int read_block_num(
 
 	if (trial_alloc){
 		__int64 possible_size;
-		possible_size = (__int64)unit_size * (source_num + keep_num + add_num);
+		possible_size = (__int64)unit_size * (source_num + keep_num);
 #ifndef _WIN64	// 32-bit 版なら
 		if (possible_size > MAX_MEM_SIZE)	// 確保する最大サイズを 2GB までにする
 			possible_size = MAX_MEM_SIZE;
@@ -191,13 +197,13 @@ int read_block_num(
 	}
 	mem_size = get_mem_size(trial_alloc) / unit_size;	// 何個分確保できるか
 
-	if (mem_size >= (size_t)(source_num + keep_num + add_num)){	// 最大個数より多い
+	if (mem_size >= (size_t)(source_num + keep_num)){	// 最大個数より多い
 		buf_num = source_num;
-	} else if ((int)mem_size < read_min + keep_num + add_num){	// 少なすぎる
+	} else if ((int)mem_size < read_min + keep_num){	// 少なすぎる
 		buf_num = 0;	// メモリー不足の印
 	} else {	// ソース・ブロック個数を等分割する
 		int split_num;
-		buf_num = (int)mem_size - (keep_num + add_num);
+		buf_num = (int)mem_size - keep_num;
 		split_num = (source_num + buf_num - 1) / buf_num;	// 何回に別けて読み込むか
 		buf_num = (source_num + split_num - 1) / split_num;
 	}
@@ -263,7 +269,7 @@ static int invert_matrix_st(unsigned short *mat,
 
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 // マルチ・プロセッサー対応
-
+/*
 typedef struct {	// RS threading control struct
 	unsigned short *mat;	// 行列
 	int cols;	// 横行の長さ
@@ -308,8 +314,57 @@ static DWORD WINAPI thread_func(LPVOID lpParameter)
 	CloseHandle(th->end);
 	return 0;
 }
+*/
+typedef struct {	// Maxtrix Inversion threading control struct
+	unsigned short *mat;	// 行列
+	int cols;	// 横行の長さ
+	volatile int start;	// 掛ける行の先頭位置
+	volatile int pivot;	// 倍率となる値の位置
+	volatile int skip;	// とばす行
+	volatile int now;	// 消去する行
+	HANDLE run;
+	HANDLE end;
+} INV_TH;
+
+// サブ・スレッド
+static DWORD WINAPI thread_func(LPVOID lpParameter)
+{
+	unsigned short *mat;
+	int j, cols, row_start2, factor;
+	HANDLE hRun, hEnd;
+	INV_TH *th;
+
+	th = (INV_TH *)lpParameter;
+	mat = th->mat;
+	cols = th->cols;
+	hRun = th->run;
+	hEnd = th->end;
+	SetEvent(hEnd);	// 設定完了を通知する
+
+	WaitForSingleObject(hRun, INFINITE);	// 計算開始の合図を待つ
+	while (th->skip >= 0){
+		while ((j = InterlockedDecrement(&(th->now))) >= 0){	// j = --th_now
+			if (j == th->skip)
+				continue;
+			row_start2 = cols * j;	// その行の開始位置
+			factor = mat[row_start2 + th->pivot];	// j 行の pivot 列の値
+			mat[row_start2 + th->pivot] = 0;	// これが行列を一個で済ます手
+			// 先の計算により、i 行の pivot 列の値は必ず 1なので、この factor が倍率になる
+			galois_region_multiply(mat + th->start, mat + row_start2, cols, factor);
+		}
+		//_mm_sfence();	// メモリーへの書き込みを完了する
+		SetEvent(hEnd);	// 計算終了を通知する
+		WaitForSingleObject(hRun, INFINITE);	// 計算開始の合図を待つ
+	}
+
+	// 終了処理
+	CloseHandle(hRun);
+	CloseHandle(hEnd);
+	return 0;
+}
 
 // マルチ・スレッドで逆行列を計算する (利用するパリティ・ブロックの所だけ)
+/*
 static int invert_matrix_mt(unsigned short *mat,
 	int rows,				// 横行の数、行列の縦サイズ、失われたソース・ブロックの数 = 利用するパリティ・ブロック数
 	int cols,				// 縦列の数、行列の横サイズ、本来のソース・ブロック数
@@ -411,6 +466,130 @@ static int invert_matrix_mt(unsigned short *mat,
 	CloseHandle(th->h);
 	return 0;
 }
+*/
+
+static int invert_matrix_mt(unsigned short *mat,
+	int rows,				// 横行の数、行列の縦サイズ、失われたソース・ブロックの数 = 利用するパリティ・ブロック数
+	int cols,				// 縦列の数、行列の横サイズ、本来のソース・ブロック数
+	source_ctx_r *s_blk)	// 各ソース・ブロックの情報
+{
+	int err = 0, j, row_start2, factor, sub_num;
+	unsigned int time_last = GetTickCount();
+	HANDLE hSub[MAX_CPU / 2], hRun[MAX_CPU / 2], hEnd[MAX_CPU / 2];
+	INV_TH th[1];
+
+	memset(hSub, 0, sizeof(HANDLE) * (MAX_CPU / 2));
+	memset(th, 0, sizeof(INV_TH));
+
+	// サブ・スレッドの数は平方根（切り上げ）にする
+	sub_num = 1;
+	j = 2;
+	while (j < cpu_num){	// 1~2=1, 3~4=2, 5~8=3, 9~16=4, 17~32=5
+		sub_num++;
+		j *= 2;
+	}
+	if (sub_num > rows - 2)
+		sub_num = rows - 2;	// 多過ぎても意味ないので制限する
+#ifdef TIMER
+	// 使うスレッド数は、メイン・スレッドの分も含めるので 1個増える
+	printf("\nMaxtrix Inversion with %d threads\n", sub_num + 1);
+#endif
+
+	// サブ・スレッドを起動する
+	th->mat = mat;
+	th->cols = cols;
+	for (j = 0; j < sub_num; j++){	// サブ・スレッドごとに
+		// イベントを作成する
+		hRun[j] = CreateEvent(NULL, FALSE, FALSE, NULL);	// 両方とも Auto Reset にする
+		if (hRun[j] == NULL){
+			print_win32_err();
+			printf("error, inv-thread\n");
+			err = 1;
+			goto error_end;
+		}
+		hEnd[j] = CreateEvent(NULL, FALSE, FALSE, NULL);
+		if (hEnd[j] == NULL){
+			print_win32_err();
+			CloseHandle(hRun[j]);
+			printf("error, inv-thread\n");
+			err = 1;
+			goto error_end;
+		}
+		// サブ・スレッドを起動する
+		th->run = hRun[j];
+		th->end = hEnd[j];
+		//_mm_sfence();	// メモリーへの書き込みを完了してからスレッドを起動する
+		hSub[j] = (HANDLE)_beginthreadex(NULL, STACK_SIZE, thread_func, (LPVOID)th, 0, NULL);
+		if (hSub[j] == NULL){
+			print_win32_err();
+			CloseHandle(hRun[j]);
+			CloseHandle(hEnd[j]);
+			printf("error, inv-thread\n");
+			err = 1;
+			goto error_end;
+		}
+		WaitForSingleObject(hEnd[j], INFINITE);	// 設定終了の合図を待つ (リセットする)
+	}
+
+	// Gaussian Elimination with 1 matrix
+	th->pivot = 0;
+	th->start = 0;	// その行の開始位置
+	for (th->skip = 0; th->skip < rows; th->skip++){
+		// 経過表示
+		if (GetTickCount() - time_last >= UPDATE_TIME){
+			if (print_progress((th->skip * 1000) / rows)){
+				err = 2;
+				goto error_end;
+			}
+			time_last = GetTickCount();
+		}
+
+		// その行 (パリティ・ブロック) がどのソース・ブロックの代用か
+		while ((th->pivot < cols) && (s_blk[th->pivot].exist != 0))
+			th->pivot++;
+
+		// Divide the row by element i,pivot
+		factor = mat[th->start + th->pivot];
+		if (factor > 1){
+			mat[th->start + th->pivot] = 1;	// これが行列を一個で済ます手
+			galois_region_divide(mat + th->start, cols, factor);
+		} else if (factor == 0){	// factor = 0 だと、その行列の逆行列を計算できない
+			err = (0x00010000 | th->pivot);	// どのソース・ブロックで問題が発生したのかを返す
+			goto error_end;
+		}
+
+		// 別の行の同じ pivot 列が 0以外なら、その値を 0にするために、
+		// i 行を何倍かしたものを XOR する
+		th->now = rows;	// 初期値 + 1
+		//_mm_sfence();	// メモリーへの書き込みを完了してからスレッドを再開する
+		for (j = 0; j < sub_num; j++)
+			SetEvent(hRun[j]);	// サブ・スレッドに計算を開始させる
+		while ((j = InterlockedDecrement(&(th->now))) >= 0){	// j = --th_now
+			if (j == th->skip)	// 同じ行はとばす
+				continue;
+			row_start2 = cols * j;	// その行の開始位置
+			factor = mat[row_start2 + th->pivot];	// j 行の pivot 列の値
+			mat[row_start2 + th->pivot] = 0;	// これが行列を一個で済ます手
+			// 先の計算により、i 行の pivot 列の値は必ず 1なので、この factor が倍率になる
+			galois_region_multiply(mat + th->start, mat + row_start2, cols, factor);
+		}
+
+		WaitForMultipleObjects(sub_num, hEnd, TRUE, INFINITE);	// サブ・スレッドの計算終了の合図を待つ
+		th->start += cols;
+		th->pivot++;
+	}
+
+error_end:
+	InterlockedExchange(&(th->skip), -1);		// 終了指示
+	for (j = 0; j < sub_num; j++){
+		if (hSub[j]){	// サブ・スレッドを終了させる
+			SetEvent(hRun[j]);
+			WaitForSingleObject(hSub[j], INFINITE);
+			CloseHandle(hSub[j]);
+		}
+	}
+	return err;
+}
 
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 /*
@@ -539,11 +718,9 @@ unsigned int time_total = GetTickCount();
 	}
 
 	// パリティ計算用の行列演算の準備をする
-	if (parity_num > source_num){
-		len = sizeof(unsigned short) * (source_num + parity_num);
-	} else {
-		len = sizeof(unsigned short) * source_num * 2;
-	}
+	len = sizeof(unsigned short) * source_num;
+	if (OpenCL_method != 0)
+		len *= 2;	// GPU の作業領域も確保しておく
 	constant = malloc(len);
 	if (constant == NULL){
 		printf("malloc, %d\n", len);
@@ -551,7 +728,11 @@ unsigned int time_total = GetTickCount();
 		goto error_end;
 	}
 #ifdef TIMER
-	printf("\nmatrix size = %d.%d KB\n", len >> 10, (len >> 10) % 10);
+	if (len & 0xFFFFF000){
+		printf("\nmatrix size = %u KB\n", len >> 10);
+	} else {
+		printf("\nmatrix size = %u Bytes\n", len);
+	}
 #endif
 	// パリティ検査行列の基になる定数
 	make_encode_constant(constant);
@@ -623,11 +804,9 @@ unsigned int time_total = GetTickCount();
 	}
 
 	// パリティ計算用の行列演算の準備をする
-	if (parity_num > source_num){
-		len = sizeof(unsigned short) * (source_num + parity_num);
-	} else {
-		len = sizeof(unsigned short) * source_num * 2;
-	}
+	len = sizeof(unsigned short) * source_num;
+	if (OpenCL_method != 0)
+		len *= 2;	// GPU の作業領域も確保しておく
 	constant = malloc(len);
 	if (constant == NULL){
 		printf("malloc, %d\n", len);
@@ -635,7 +814,11 @@ unsigned int time_total = GetTickCount();
 		goto error_end;
 	}
 #ifdef TIMER
-	printf("\nmatrix size = %d.%d KB\n", len >> 10, (len >> 10) % 10);
+	if (len & 0xFFFFF000){
+		printf("\nmatrix size = %u KB\n", len >> 10);
+	} else {
+		printf("\nmatrix size = %u Bytes\n", len);
+	}
 #endif
 	// パリティ検査行列の基になる定数
 	make_encode_constant(constant);
@@ -719,9 +902,11 @@ unsigned int time_matrix = 0, time_total = GetTickCount();
 	}
 #ifdef TIMER
 	if (len & 0xFFF00000){
-		printf("\nmatrix size = %d.%d MB\n", len >> 20, (len >> 20) % 10);
+		printf("\nmatrix size = %u MB\n", len >> 20);
+	} else if (len & 0x000FF000){
+		printf("\nmatrix size = %u KB\n", len >> 10);
 	} else {
-		printf("\nmatrix size = %d.%d KB\n", len >> 10, (len >> 10) % 10);
+		printf("\nmatrix size = %u Bytes\n", len);
 	}
 #endif
 	// 何番目の消失ソース・ブロックがどのパリティで代替されるか
@@ -783,7 +968,7 @@ time_matrix = GetTickCount() - time_matrix;
 		if (memory_use & 16){
 			err = -4;	// SSD なら Read all 方式でブロックが断片化しても速い
 		} else 
-		if (read_block_num(block_lost, 2, 0, MEM_UNIT) != 0){
+		if (read_block_num(block_lost, 0, MEM_UNIT) != 0){
 			err = -5;	// HDD でメモリーが足りてるなら Read some 方式を使う
 		} else {
 			err = -4;	// メモリー不足なら Read all 方式でブロックを断片化させる
@@ -793,7 +978,7 @@ time_matrix = GetTickCount() - time_matrix;
 		if (memory_use & 16){
 			err = -2;	// SSD なら Read all 方式でブロックが断片化しても速い
 		} else 
-		if (read_block_num(block_lost, cpu_num - 1, 0, sse_unit) != 0){
+		if (read_block_num(block_lost, 0, sse_unit) != 0){
 			err = -3;	// HDD でメモリーが足りてるなら Read some 方式を使う
 		} else {
 			err = -2;	// メモリー不足なら Read all 方式でブロックを断片化させる
diff --git a/source/par2j/reedsolomon.h b/source/par2j/reedsolomon.h
index 4c30a92..5f62cfb 100644
--- a/source/par2j/reedsolomon.h
+++ b/source/par2j/reedsolomon.h
@@ -10,7 +10,6 @@ extern "C" {
 
 // Read all source & Keep some parity 方式
 // 部分的なエンコードを行う最低ブロック数
-#define PART_MAX_RATE	1	// ソース・ブロック数の 1/2  = 50%
 #define PART_MIN_RATE	5	// ソース・ブロック数の 1/32 = 3.1%
 
 // Read some source & Keep all parity 方式
@@ -33,7 +32,6 @@ unsigned int get_io_size(
 // 何ブロックまとめてファイルから読み込むかを空きメモリー量から計算する
 int read_block_num(
 	int keep_num,			// 保持するパリティ・ブロック数
-	int add_num,			// 余裕を見るブロック数
 	size_t trial_alloc,		// 確保できるか確認するのか
 	int alloc_unit);		// メモリー単位の境界 (sse_unit か MEM_UNIT)
 
diff --git a/source/par2j/res_par2j.rc b/source/par2j/res_par2j.rc
index 4cfdeec..c1008f4 100644
--- a/source/par2j/res_par2j.rc
+++ b/source/par2j/res_par2j.rc
@@ -1,8 +1,8 @@
 1 RT_STRING ".\\source.cl"
 
 1 VERSIONINFO
-FILEVERSION 1,3,2,8
-PRODUCTVERSION 1,3,2,0
+FILEVERSION 1,3,3,0
+PRODUCTVERSION 1,3,3,0
 FILEOS 0x40004
 FILETYPE 0x1
 {
@@ -13,8 +13,8 @@ BLOCK "StringFileInfo"
 		VALUE "FileDescription", "PAR2 client"
 		VALUE "LegalCopyright", "Copyright (C) 2023 Yutaka Sawada"
 		VALUE "ProductName", "par2j"
-		VALUE "FileVersion", "1.3.2.8"
-		VALUE "ProductVersion", "1.3.2.0"
+		VALUE "FileVersion", "1.3.3.0"
+		VALUE "ProductVersion", "1.3.3.0"
 	}
 }
 
diff --git a/source/par2j/rs_decode.c b/source/par2j/rs_decode.c
index e9b0606..29b63b7 100644
--- a/source/par2j/rs_decode.c
+++ b/source/par2j/rs_decode.c
@@ -1,5 +1,5 @@
 ﻿// rs_decode.c
-// Copyright : 2022-10-08 Yutaka Sawada
+// Copyright : 2023-09-21 Yutaka Sawada
 // License : GPL
 
 #ifndef _UNICODE
@@ -39,6 +39,7 @@ typedef struct {	// RS threading control struct
 	unsigned short * volatile mat;	// 行列
 	unsigned char * volatile buf;
 	volatile unsigned int size;		// バイト数
+	volatile unsigned int len;
 	volatile int count;
 	volatile int off;
 	volatile int now;
@@ -51,8 +52,9 @@ static DWORD WINAPI thread_decode2(LPVOID lpParameter)
 {
 	unsigned char *s_buf, *p_buf, *work_buf;
 	unsigned short *factor, *factor2;
-	int i, j, src_start, src_num, max_num;
-	int chunk_num, part_start, part_num, cover_num;
+	int i, j, max_num, chunk_num;
+	int part_off, part_num, part_now;
+	int src_off, src_num;
 	unsigned int unit_size, len, off, chunk_size;
 	HANDLE hRun, hEnd;
 	RS_TH *th;
@@ -64,7 +66,7 @@ unsigned int time_start2, time_encode2a = 0, time_encode2b = 0;
 	th = (RS_TH *)lpParameter;
 	p_buf = th->buf;
 	unit_size = th->size;
-	chunk_size = th->off;
+	chunk_size = th->len;
 	part_num = th->count;
 	hRun = th->run;
 	hEnd = th->end;
@@ -80,21 +82,14 @@ time_start2 = GetTickCount();
 #endif
 		s_buf = th->buf;
 		factor = th->mat;
-		len = chunk_size;
+		src_off = th->off;	// ソース・ブロック番号
 
 		if (th->size == 0){	// ソース・ブロック読み込み中
-			src_start = th->off;	// ソース・ブロック番号
-			max_num = chunk_num * part_num;
 			// パリティ・ブロックごとに掛け算して追加していく
-			while ((j = InterlockedIncrement(&(th->now))) < max_num){	// j = ++th_now
-				off = j / part_num;	// chunk の番号
-				j = j % part_num;	// lost block の番号
-				off *= chunk_size;
-				if (off + len > unit_size)
-					len = unit_size - off;	// 最後の chunk だけサイズが異なるかも
-				if (src_start == 0)	// 最初のブロックを計算する際に
-					memset(p_buf + ((size_t)unit_size * j + off), 0, len);	// ブロックを 0で埋める
-				galois_align_multiply(s_buf + off, p_buf + ((size_t)unit_size * j + off), len, factor[source_num * j]);
+			while ((j = InterlockedIncrement(&(th->now))) < part_num){	// j = ++th_now
+				if (src_off == 0)	// 最初のブロックを計算する際に
+					memset(p_buf + (size_t)unit_size * j, 0, unit_size);	// ブロックを 0で埋める
+				galois_align_multiply(s_buf, p_buf + (size_t)unit_size * j, unit_size, factor[source_num * j]);
 #ifdef TIMER
 loop_count2a++;
 #endif
@@ -104,24 +99,38 @@ time_encode2a += GetTickCount() - time_start2;
 #endif
 		} else {	// 消失ブロックを部分的に保持する場合
 			// スレッドごとに復元する消失ブロックの chunk を変える
-			src_num = source_num - th->off;
-			cover_num = th->size;
-			part_start = th->count;
-			max_num = chunk_num * cover_num;
+			src_num = th->len;
+			part_now = th->size;
+			part_off = th->count;
+			len = chunk_size;
+			max_num = chunk_num * part_now;
 			while ((j = InterlockedIncrement(&(th->now))) < max_num){	// j = ++th_now
-				off = j / cover_num;	// chunk の番号
-				j = j % cover_num;		// lost block の番号
-				off *= chunk_size;		// chunk の位置
+				off = j / part_now;	// chunk の番号
+				j = j % part_now;	// lost block の番号
+				off *= chunk_size;	// chunk の位置
 				if (off + len > unit_size)
 					len = unit_size - off;	// 最後の chunk だけサイズが異なるかも
 				work_buf = p_buf + (size_t)unit_size * j + off;
-				if (part_start != 0)
-					memset(work_buf, 0, len);	// 最初の part_num 以降は 2nd encode だけなので 0で埋める
-				factor2 = factor + source_num * (part_start + j);
+				if (src_off == 0)	// 最初のブロックを計算する際に
+					memset(work_buf, 0, len);	// パリティ・ブロックを 0で埋める
+				factor2 = factor + source_num * (part_off + j);
 
 				// ソース・ブロックごとにパリティを追加していく
-				for (i = 0; i < src_num; i++)
-					galois_align_multiply(s_buf + ((size_t)unit_size * i + off), work_buf, len, factor2[i]);
+				if (galois_align_multiply2 != NULL){	// ２ブロックずつ計算する場合 (SSSE3 か AVX2)
+					i = 0;
+					if (src_num & 1){	// 奇数なら最初の一個を計算して、残りを偶数に変える
+						galois_align_multiply(s_buf + (size_t)unit_size * i + off, work_buf, len, factor2[i]);
+						i++;
+					}
+					for (; i < src_num; i += 2){
+						galois_align_multiply2(s_buf + (size_t)unit_size * i + off, s_buf + (size_t)unit_size * (i + 1) + off,
+									work_buf, len, factor2[i], factor2[i + 1]);
+					}
+
+				} else {	// 一つずつ計算する場合
+					for (i = 0; i < src_num; i++)
+						galois_align_multiply(s_buf + (size_t)unit_size * i + off, work_buf, len, factor2[i]);
+				}
 #ifdef TIMER
 loop_count2b += src_num;
 #endif
@@ -135,8 +144,7 @@ time_encode2b += GetTickCount() - time_start2;
 		WaitForSingleObject(hRun, INFINITE);	// 計算開始の合図を待つ
 	}
 #ifdef TIMER
-loop_count2a /= chunk_num;	// chunk数で割ってブロック数にする
-loop_count2b /= chunk_num;
+loop_count2b /= chunk_num;	// chunk数で割ってブロック数にする
 printf("sub-thread : total loop = %d\n", loop_count2a + loop_count2b);
 if (time_encode2a > 0){
 	i = (int)((__int64)loop_count2a * unit_size * 125 / ((__int64)time_encode2a * 131072));
@@ -163,7 +171,8 @@ static DWORD WINAPI thread_decode3(LPVOID lpParameter)
 {
 	unsigned char *s_buf, *p_buf, *work_buf;
 	unsigned short *factor, *factor2;
-	int i, j, block_lost, src_start, src_num, max_num, chunk_num;
+	int i, j, block_lost, max_num, chunk_num;
+	int src_off, src_num;
 	unsigned int unit_size, len, off, chunk_size;
 	HANDLE hRun, hEnd;
 	RS_TH *th;
@@ -175,7 +184,7 @@ unsigned int time_start2, time_encode2a = 0, time_encode2b = 0;
 	th = (RS_TH *)lpParameter;
 	p_buf = th->buf;
 	unit_size = th->size;
-	chunk_size = th->off;
+	chunk_size = th->len;
 	block_lost = th->count;
 	hRun = th->run;
 	hEnd = th->end;
@@ -192,20 +201,14 @@ time_start2 = GetTickCount();
 #endif
 		s_buf = th->buf;
 		factor = th->mat;
-		len = chunk_size;
 
 		if (th->size == 0){	// ソース・ブロック読み込み中
-			src_start = th->off;	// ソース・ブロック番号
 			// パリティ・ブロックごとに掛け算して追加していく
-			while ((j = InterlockedIncrement(&(th->now))) < max_num){	// j = ++th_now
-				off = j / block_lost;	// chunk の番号
-				j = j % block_lost;		// lost block の番号
-				off *= chunk_size;
-				if (off + len > unit_size)
-					len = unit_size - off;	// 最後の chunk だけサイズが異なるかも
-				if (src_start == 0)	// 最初のブロックを計算する際に
-					memset(p_buf + ((size_t)unit_size * j + off), 0, len);	// ブロックを 0で埋める
-				galois_align_multiply(s_buf + off, p_buf + ((size_t)unit_size * j + off), len, factor[source_num * j]);
+			src_off = th->off;	// ソース・ブロック番号
+			while ((j = InterlockedIncrement(&(th->now))) < block_lost){	// j = ++th_now
+				if (src_off == 0)	// 最初のブロックを計算する際に
+					memset(p_buf + (size_t)unit_size * j, 0, unit_size);	// ブロックを 0で埋める
+				galois_align_multiply(s_buf, p_buf + (size_t)unit_size * j, unit_size, factor[source_num * j]);
 #ifdef TIMER
 loop_count2a++;
 #endif
@@ -216,6 +219,7 @@ time_encode2a += GetTickCount() - time_start2;
 		} else {	// 全ての消失ブロックを保持する場合
 			// スレッドごとに復元する消失ブロックの chunk を変える
 			src_num = th->size;
+			len = chunk_size;
 			while ((j = InterlockedIncrement(&(th->now))) < max_num){	// j = ++th_now
 				off = j / block_lost;	// chunk の番号
 				j = j % block_lost;		// lost block の番号
@@ -226,8 +230,21 @@ time_encode2a += GetTickCount() - time_start2;
 				factor2 = factor + source_num * j;
 
 				// ソース・ブロックごとにパリティを追加していく
-				for (i = 0; i < src_num; i++)
-					galois_align_multiply(s_buf + ((size_t)unit_size * i + off), work_buf, len, factor2[i]);
+				if (galois_align_multiply2 != NULL){	// ２ブロックずつ計算する場合 (SSSE3 か AVX2)
+					i = 0;
+					if (src_num & 1){	// 奇数なら最初の一個を計算して、残りを偶数に変える
+						galois_align_multiply(s_buf + (size_t)unit_size * i + off, work_buf, len, factor2[i]);
+						i++;
+					}
+					for (; i < src_num; i += 2){
+						galois_align_multiply2(s_buf + (size_t)unit_size * i + off, s_buf + (size_t)unit_size * (i + 1) + off,
+									work_buf, len, factor2[i], factor2[i + 1]);
+					}
+
+				} else {	// 一つずつ計算する場合
+					for (i = 0; i < src_num; i++)
+						galois_align_multiply(s_buf + (size_t)unit_size * i + off, work_buf, len, factor2[i]);
+				}
 #ifdef TIMER
 loop_count2b += src_num;
 #endif
@@ -241,8 +258,7 @@ time_encode2b += GetTickCount() - time_start2;
 		WaitForSingleObject(hRun, INFINITE);	// 計算開始の合図を待つ
 	}
 #ifdef TIMER
-loop_count2a /= chunk_num;	// chunk数で割ってブロック数にする
-loop_count2b /= chunk_num;
+loop_count2b /= chunk_num;	// chunk数で割ってブロック数にする
 printf("sub-thread : total loop = %d\n", loop_count2a + loop_count2b);
 if (time_encode2a > 0){
 	i = (int)((__int64)loop_count2a * unit_size * 125 / ((__int64)time_encode2a * 131072));
@@ -265,124 +281,14 @@ printf(" 2nd decode %d.%03d sec, %d loop, %d MB/s\n", time_encode2b / 1000, time
 	return 0;
 }
 
-// ブロックごとに計算するためのスレッド
-static DWORD WINAPI thread_decode_each(LPVOID lpParameter)
-{
-	unsigned char *s_buf, *p_buf, *work_buf;
-	unsigned short *factor, *factor2;
-	int i, j, th_id, block_lost, src_start, src_num, max_num;
-	unsigned int unit_size, len, off, chunk_size;
-	HANDLE hRun, hEnd;
-	RS_TH *th;
-#ifdef TIMER
-unsigned int loop_count2a = 0, loop_count2b = 0;
-unsigned int time_start2, time_encode2a = 0, time_encode2b = 0;
-#endif
-
-	th = (RS_TH *)lpParameter;
-	p_buf = th->buf;
-	unit_size = th->size;
-	chunk_size = th->off;
-	block_lost = th->count;
-	th_id = th->now;	// スレッド番号
-	hRun = th->run;
-	hEnd = th->end;
-	//_mm_sfence();
-	SetEvent(hEnd);	// 設定完了を通知する
-
-	max_num = ((unit_size + chunk_size - 1) / chunk_size) * block_lost;
-
-	WaitForSingleObject(hRun, INFINITE);	// 計算開始の合図を待つ
-	while (th->now < INT_MAX / 2){
-#ifdef TIMER
-time_start2 = GetTickCount();
-#endif
-		s_buf = th->buf;
-		factor = th->mat;
-
-		if (th->size == 0xFFFFFFFF){	// ソース・ブロック読み込み中
-			src_start = th->off;	// ソース・ブロック番号
-			len = chunk_size;
-			// パリティ・ブロックごとに掛け算して追加していく
-			while ((j = InterlockedIncrement(&(th->now))) < max_num){	// j = ++th_now
-				off = j / block_lost;	// chunk の番号
-				j = j % block_lost;		// lost block の番号
-				off *= chunk_size;
-				if (off + len > unit_size)
-					len = unit_size - off;	// 最後の chunk だけサイズが異なるかも
-				if (src_start == 0)	// 最初のブロックを計算する際に
-					memset(p_buf + ((size_t)unit_size * j + off), 0, len);	// ブロックを 0で埋める
-				galois_align_multiply(s_buf + off, p_buf + ((size_t)unit_size * j + off), len, factor[source_num * j]);
-#ifdef TIMER
-loop_count2a++;
-#endif
-			}
-#ifdef TIMER
-time_encode2a += GetTickCount() - time_start2;
-#endif
-		} else {
-			// スレッドごとに復元する消失ブロックを変える
-			src_num = th->count;
-			while ((j = InterlockedIncrement(&(th->now))) < block_lost){	// j = ++th_now
-				work_buf = p_buf + (size_t)unit_size * j;
-				factor2 = factor + source_num * j;
-
-				// chunk に分割して計算する
-				len = chunk_size;
-				off = 0;
-				while (off < unit_size){
-					// ソース・ブロックごとにパリティを追加していく
-					for (i = 0; i < src_num; i++)
-						galois_align_multiply(s_buf + ((size_t)unit_size * i + off), work_buf, len, factor2[i]);
-
-					work_buf += len;
-					off += len;
-					if (off + len > unit_size)
-						len = unit_size - off;
-				}
-#ifdef TIMER
-loop_count2b += src_num;
-#endif
-			}
-#ifdef TIMER
-time_encode2b += GetTickCount() - time_start2;
-#endif
-		}
-		//_mm_sfence();	// メモリーへの書き込みを完了する
-		SetEvent(hEnd);	// 計算終了を通知する
-		WaitForSingleObject(hRun, INFINITE);	// 計算開始の合図を待つ
-	}
-#ifdef TIMER
-loop_count2a /= (unit_size + chunk_size - 1) / chunk_size;	// chunk数で割ってブロック数にする
-printf("sub-thread[%d] : total loop = %d\n", th_id, loop_count2a + loop_count2b);
-if (time_encode2a > 0){
-	i = (int)((__int64)loop_count2a * unit_size * 125 / ((__int64)time_encode2a * 131072));
-} else {
-	i = 0;
-}
-if (loop_count2a > 0)
-	printf(" 1st decode %d.%03d sec, %d loop, %d MB/s\n", time_encode2a / 1000, time_encode2a % 1000, loop_count2a, i);
-if (time_encode2b > 0){
-	i = (int)((__int64)loop_count2b * unit_size * 125 / ((__int64)time_encode2b * 131072));
-} else {
-	i = 0;
-}
-printf(" 2nd decode %d.%03d sec, %d loop, %d MB/s\n", time_encode2b / 1000, time_encode2b % 1000, loop_count2b, i);
-#endif
-
-	// 終了処理
-	CloseHandle(hRun);
-	CloseHandle(hEnd);
-	return 0;
-}
-
-// GPU 対応のサブ・スレッド (スレッド番号は最後になる)
+// GPU 対応のサブ・スレッド (最後のスレッドなので、1st decode では呼ばれない)
 static DWORD WINAPI thread_decode_gpu(LPVOID lpParameter)
 {
 	unsigned char *s_buf, *p_buf;
 	unsigned short *factor;
-	int i, j, block_lost, src_num;
-	unsigned int unit_size;
+	int i, j, block_lost, max_num, chunk_num;
+	int src_num;
+	unsigned int unit_size, len, off, chunk_size;
 	HANDLE hRun, hEnd;
 	RS_TH *th;
 #ifdef TIMER
@@ -392,12 +298,16 @@ unsigned int time_start2, time_encode2 = 0, loop_count2 = 0;
 	th = (RS_TH *)lpParameter;
 	p_buf = th->buf;
 	unit_size = th->size;
+	chunk_size = th->len;
 	block_lost = th->count;
 	hRun = th->run;
 	hEnd = th->end;
 	//_mm_sfence();
 	SetEvent(hEnd);	// 設定完了を通知する
 
+	chunk_num = (unit_size + chunk_size - 1) / chunk_size;
+	max_num = chunk_num * block_lost;
+
 	WaitForSingleObject(hRun, INFINITE);	// 計算開始の合図を待つ
 	while (th->now < INT_MAX / 2){
 #ifdef TIMER
@@ -406,23 +316,32 @@ time_start2 = GetTickCount();
 		// GPUはソース・ブロック読み込み中に呼ばれない
 		s_buf = th->buf;
 		factor = th->mat;
-		src_num = th->count;
+		src_num = th->size;
 
 		// 最初にソース・ブロックをVRAMへ転送する
 		i = gpu_copy_blocks(s_buf, unit_size, src_num);
 		if (i != 0){
-			th->size = i;
-			InterlockedExchange(&(th->now), INT_MAX / 2);	// サブ・スレッドの計算を中断する
+			th->len = i;
+			InterlockedExchange(&(th->now), INT_MAX / 3);	// サブ・スレッドの計算を中断する
 		}
 
-		// スレッドごとに復元する消失ブロックを変える
-		while ((j = InterlockedIncrement(&(th->now))) < block_lost){	// j = ++th_now
-			// 倍率は逆行列から部分的にコピーする
-			i = gpu_multiply_blocks(src_num, factor + source_num * j, p_buf + (size_t)unit_size * j, unit_size);
+		// スレッドごとに復元する消失ブロックの chunk を変える
+		len = chunk_size;
+		while ((j = InterlockedIncrement(&(th->now))) < max_num){	// j = ++th_now
+			off = j / block_lost;	// chunk の番号
+			j = j % block_lost;		// lost block の番号
+			off *= chunk_size;		// chunk の位置
+			if (off + len > unit_size)
+				len = unit_size - off;	// 最後の chunk だけサイズが異なるかも
+
+			// VRAM上のソース・ブロックごとにパリティを追加していく
+			i = gpu_multiply_chunks(src_num, factor + source_num * j, p_buf + (size_t)unit_size * j + off, off, len);
 			if (i != 0){
-				th->size = i;
+				th->len = i;
+				InterlockedExchange(&(th->now), INT_MAX / 3);	// サブ・スレッドの計算を中断する
 				break;
 			}
+
 #ifdef TIMER
 loop_count2 += src_num;
 #endif
@@ -431,20 +350,23 @@ loop_count2 += src_num;
 time_encode2 += GetTickCount() - time_start2;
 #endif
 		// 最後にVRAMを解放する
-		th->size = gpu_finish();
+		i = gpu_finish();
+		if ((i != 0) && (th->len == 0))
+			th->len = i;	// 初めてエラーが発生した時だけセットする
 
 		//_mm_sfence();	// メモリーへの書き込みを完了する
 		SetEvent(hEnd);	// 計算終了を通知する
 		WaitForSingleObject(hRun, INFINITE);	// 計算開始の合図を待つ
 	}
 #ifdef TIMER
-printf("gpu-thread    : total loop = %d\n", loop_count2);
+loop_count2 /= chunk_num;	// chunk数で割ってブロック数にする
+printf("gpu-thread :\n");
 if (time_encode2 > 0){
 	i = (int)((__int64)loop_count2 * unit_size * 125 / ((__int64)time_encode2 * 131072));
 } else {
 	i = 0;
 }
-printf(" 2nd encode %d.%03d sec, %d loop, %d MB/s\n", time_encode2 / 1000, time_encode2 % 1000, loop_count2, i);
+printf(" 2nd decode %d.%03d sec, %d loop, %d MB/s\n", time_encode2 / 1000, time_encode2 % 1000, loop_count2, i);
 #endif
 
 	// 終了処理
@@ -488,7 +410,7 @@ int decode_method1(	// ソース・ブロックが一個だけの場合
 	printf("\n read one block, and keep one recovering block\n");
 	printf("buffer size = %d MB, io_size = %d, split = %d\n", len >> 20, io_size, (block_size + io_size - 1) / io_size);
 	id = try_cache_blocking(unit_size);
-	printf("cache: limit size = %d, chunk_size = %d, split = %d\n", cpu_cache & 0x7FFF8000, id, (unit_size + id - 1) / id);
+	printf("cache: limit size = %d, chunk_size = %d, split = %d\n", cpu_flag & 0x7FFF0000, id, (unit_size + id - 1) / id);
 #endif
 
 	// 書き込み先のファイルを開く
@@ -615,10 +537,11 @@ int decode_method2(	// ソース・データを全て読み込む場合
 {
 	unsigned char *buf = NULL, *p_buf, *work_buf, *hash;
 	unsigned short *id;
-	int err = 0, i, j, last_file, part_start, part_num, recv_now;
-	int src_num, chunk_num, cover_num;
+	int err = 0, i, j, last_file, chunk_num;
+	int part_off, part_num, part_now, recv_now;
+	int cpu_num1, src_off, src_num, src_max;
 	unsigned int io_size, unit_size, len, block_off;
-	unsigned int time_last, prog_write;
+	unsigned int time_last, prog_read, prog_write;
 	__int64 file_off, prog_num = 0, prog_base;
 	HANDLE hFile = NULL;
 	HANDLE hSub[MAX_CPU], hRun[MAX_CPU], hEnd[MAX_CPU];
@@ -628,15 +551,9 @@ int decode_method2(	// ソース・データを全て読み込む場合
 	id = mat + (block_lost * source_num);	// 何番目の消失ソース・ブロックがどのパリティで代替されるか
 
 	// 作業バッファーを確保する
-	part_num = source_num >> PART_MAX_RATE;	// ソース・ブロック数に対する割合で最大量を決める
+	part_num = block_lost;	// 最大値を初期値にする
+	//part_num = (block_lost + 1) / 2;	// 確保量の実験用
 	//part_num = (block_lost + 2) / 3;	// 確保量の実験用
-	if (part_num < block_lost){	// 分割して計算するなら
-		i = (block_lost + part_num - 1) / part_num;	// 分割回数
-		part_num = (block_lost + i - 1) / i;
-		part_num = ((part_num + cpu_num - 1) / cpu_num) * cpu_num;	// cpu_num の倍数にする（切り上げ）
-	}
-	if (part_num > block_lost)
-		part_num = block_lost;
 	io_size = get_io_size(source_num, &part_num, 1, sse_unit);
 	//io_size = (((io_size + 1) / 2 + HASH_SIZE + (sse_unit - 1)) & ~(sse_unit - 1)) - HASH_SIZE;	// 2分割の実験用
 	//io_size = (((io_size + 2) / 3 + HASH_SIZE + (sse_unit - 1)) & ~(sse_unit - 1)) - HASH_SIZE;	// 3分割の実験用
@@ -648,28 +565,39 @@ int decode_method2(	// ソース・データを全て読み込む場合
 		err = 1;
 		goto error_end;
 	}
+	//memset(buf, 0xFF, (size_t)file_off);	// 後から 0 埋めしてるかの実験用
 	p_buf = buf + (size_t)unit_size * source_num;	// 復元したブロックを記録する領域
 	hash = p_buf + (size_t)unit_size * part_num;
 	prog_base = (block_size + io_size - 1) / io_size;
-	prog_write = source_num >> 5;	// 計算で 97%、書き込みで 3% ぐらい
-	if (prog_write == 0)
-		prog_write = 1;
-	prog_base *= (__int64)(source_num + prog_write) * block_lost;	// 全体の断片の個数
+	prog_read = (block_lost + 31) / 32;	// 読み書きの経過をそれぞれ 3% ぐらいにする
+	prog_write = (source_num + 31) / 32;
+	prog_base *= (__int64)(source_num + prog_write) * block_lost + prog_read * source_num;	// 全体の断片の個数
 	len = try_cache_blocking(unit_size);
 	//len = ((len + 2) / 3 + (sse_unit - 1)) & ~(sse_unit - 1);	// 1/3の実験用
 	chunk_num = (unit_size + len - 1) / len;
+	cpu_num1 = 0;	// 読み込み中はスレッド数を減らす（シングル・スレッドの時は 0にする）
+	i = 1;
+	while (i * 2 <= cpu_num){	// 1=0, 2~3=1, 4~7=2, 8~15=3, 16~31=4, 32=5
+		cpu_num1++;
+		i *= 2;
+	}
+	if (cpu_num1 > part_num)
+		cpu_num1 = part_num;
+	src_max = cpu_cache & 0xFFFE;	// CPU cache 最適化のため、同時に処理するブロック数を制限する
+	if ((src_max < 8) || (cpu_num == 1))
+		src_max = 0x8000;	// 不明または少な過ぎる場合は、制限しない
 #ifdef TIMER
 	printf("\n read all blocks, and keep some recovering blocks\n");
 	printf("buffer size = %I64d MB, io_size = %d, split = %d\n", file_off >> 20, io_size, (block_size + io_size - 1) / io_size);
-	printf("cache: limit size = %d, chunk_size = %d, split = %d\n", cpu_cache & 0x7FFF8000, len, chunk_num);
-	printf("prog_base = %I64d, unit_size = %d\n", prog_base, unit_size);
+	printf("cache: limit size = %d, chunk_size = %d, split = %d\n", cpu_flag & 0x7FFF0000, len, chunk_num);
+	printf("unit_size = %d, part_num = %d, cpu_num1 = %d, src_max = %d\n", unit_size, part_num, cpu_num1, src_max);
 #endif
 
 	// マルチ・スレッドの準備をする
 	th->buf = p_buf;
 	th->size = unit_size;
 	th->count = part_num;
-	th->off = len;	// キャッシュの最適化を試みる
+	th->len = len;	// キャッシュの最適化を試みる
 	for (j = 0; j < cpu_num; j++){	// サブ・スレッドごとに
 		hRun[j] = CreateEvent(NULL, FALSE, FALSE, NULL);	// Auto Reset にする
 		if (hRun[j] == NULL){
@@ -701,8 +629,6 @@ int decode_method2(	// ソース・データを全て読み込む場合
 		}
 		WaitForSingleObject(hEnd[j], INFINITE);	// 設定終了の合図を待つ (リセットしない)
 	}
-	// IO が延滞しないように、サブ・スレッド一つの優先度を下げる
-	SetThreadPriority(hSub[0], THREAD_PRIORITY_BELOW_NORMAL);
 
 	// ブロック断片を読み込んで、消失ブロック断片を復元する
 	print_progress_text(0, "Recovering slice");
@@ -711,7 +637,7 @@ int decode_method2(	// ソース・データを全て読み込む場合
 	block_off = 0;
 	while (block_off < block_size){
 		th->size = 0;	// 1st decode
-		th->off = -1;	// まだ計算して無い印
+		src_off = -1;	// まだ計算して無い印
 
 #ifdef TIMER
 read_count = 0;
@@ -793,41 +719,60 @@ read_count++;
 				}
 			}
 
-			if ((len > 0) && (i + 1 < source_num)){	// 最後のブロック以外なら
-				// サブ・スレッドの動作状況を調べる
-				j = WaitForMultipleObjects((cpu_num + 1) / 2, hEnd, TRUE, 0);
-				if ((j != WAIT_TIMEOUT) && (j != WAIT_FAILED)){	// 計算中でないなら
-					// 経過表示
-					prog_num += part_num;
-					if (GetTickCount() - time_last >= UPDATE_TIME){
-						if (print_progress((int)((prog_num * 1000) / prog_base))){
-							err = 2;
-							goto error_end;
+			if (len > 0){
+				if (src_off < 0){
+					src_num = i + 1;	// 最後のブロックより前なら
+				} else {
+					src_num = i / (src_off + 1);	// だいたい何ブロック読むごとに計算が終わるか
+					src_num += i + 1;	// 次のブロック番号を足す
+				}
+				if (src_num < source_num){	// 読み込みが終わる前に計算が終わりそうなら
+					// サブ・スレッドの動作状況を調べる
+					j = WaitForMultipleObjects(cpu_num1, hEnd, TRUE, 0);
+					if ((j != WAIT_TIMEOUT) && (j != WAIT_FAILED)){	// 計算中でないなら
+						// 経過表示
+						prog_num += part_num;
+						if (GetTickCount() - time_last >= UPDATE_TIME){
+							if (print_progress((int)((prog_num * 1000) / prog_base))){
+								err = 2;
+								goto error_end;
+							}
+							time_last = GetTickCount();
 						}
-						time_last = GetTickCount();
-					}
-					// 計算終了したブロックの次から計算を開始する
-					th->off += 1;
-					if (th->off > 0){	// バッファーに読み込んだ時だけ計算する
-						while ((s_blk[th->off].exist != 0) &&
-								((s_blk[th->off].size <= block_off) || (s_blk[th->off].exist == 3))){
-							prog_num += part_num;
-							th->off += 1;
+						// 計算終了したブロックの次から計算を開始する
+						src_off += 1;
+						if (src_off > 0){	// バッファーに読み込んだ時だけ計算する
+							while ((s_blk[src_off].exist != 0) &&
+									((s_blk[src_off].size <= block_off) || (s_blk[src_off].exist == 3))){
+								prog_num += part_num;
+								src_off += 1;
 #ifdef TIMER
 skip_count++;
 #endif
+							}
+						}
+						th->buf = buf + (size_t)unit_size * src_off;
+						th->mat = mat + src_off;
+						th->off = src_off;
+						th->now = -1;	// 初期値 - 1
+						//_mm_sfence();	// メモリーへの書き込みを完了してからスレッドを再開する
+						for (j = 0; j < cpu_num1; j++){
+							ResetEvent(hEnd[j]);	// リセットしておく
+							SetEvent(hRun[j]);	// サブ・スレッドに計算を開始させる
 						}
 					}
-					th->buf = buf + (size_t)unit_size * th->off;
-					th->mat = mat + th->off;
-					th->now = -1;	// 初期値 - 1
-					//_mm_sfence();	// メモリーへの書き込みを完了してからスレッドを再開する
-					for (j = 0; j < (cpu_num + 1) / 2; j++){
-						ResetEvent(hEnd[j]);	// リセットしておく
-						SetEvent(hRun[j]);	// サブ・スレッドに計算を開始させる
-					}
 				}
 			}
+
+			// 経過表示
+			prog_num += prog_read;
+			if (GetTickCount() - time_last >= UPDATE_TIME){
+				if (print_progress((int)((prog_num * 1000) / prog_base))){
+					err = 2;
+					goto error_end;
+				}
+				time_last = GetTickCount();
+			}
 		}
 		if (hFile){	// 最後の読み込みファイルを閉じる
 			CloseHandle(hFile);
@@ -837,75 +782,93 @@ skip_count++;
 time_read += GetTickCount() - time_start;
 #endif
 
-		WaitForMultipleObjects((cpu_num + 1) / 2, hEnd, TRUE, INFINITE);	// サブ・スレッドの計算終了の合図を待つ
-		th->off += 1;	// 計算を開始するソース・ブロックの番号
-		if (th->off > 0){	// 計算不要なソース・ブロックはとばす
-			while ((s_blk[th->off].exist != 0) &&
-					((s_blk[th->off].size <= block_off) || (s_blk[th->off].exist == 3))){
+		WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE);	// サブ・スレッドの計算終了の合図を待つ
+		src_off += 1;	// 計算を開始するソース・ブロックの番号
+		if (src_off > 0){	// 計算不要なソース・ブロックはとばす
+			while ((s_blk[src_off].exist != 0) &&
+					((s_blk[src_off].size <= block_off) || (s_blk[src_off].exist == 3))){
 				prog_num += part_num;
-				th->off += 1;
+				src_off += 1;
 #ifdef TIMER
 skip_count++;
 #endif
 			}
-		} else {	// エラーや実験時以外は th->off は 0 にならない
-			memset(p_buf, 0, (size_t)unit_size * part_num);
 		}
+		// 1st decode しなかった場合（src_off = 0）は、2nd decode で消失ブロックをゼロ埋めする
 #ifdef TIMER
-		j = (th->off * 1000) / source_num;
-		printf("partial decode = %d (%d.%d%%), read = %d, skip = %d\n", th->off, j / 10, j % 10, read_count, skip_count);
+		j = (src_off * 1000) / source_num;
+		printf("partial decode = %d / %d (%d.%d%%), read = %d, skip = %d\n", src_off, source_num, j / 10, j % 10, read_count, skip_count);
 #endif
 		recv_now = -1;	// 消失ブロックの本来のソース番号
 		last_file = -1;
 
-		// cover_num ごとに処理する
-		part_start = 0;
-		cover_num = part_num;	// part_num は cpu_num の倍数にすること
-		src_num = source_num - th->off;	// 一度に処理する量 (src_num > 0)
-		th->buf = buf + (size_t)unit_size * (th->off);
-		while (part_start < block_lost){
-			if (part_start == part_num){	// part_num 分の計算が終わったら
-				th->off = 0;	// 最初の計算以降は全てのソース・ブロックを対象にする
-				src_num = source_num;	// source_num - th->off
-				th->buf = buf;	// buf + (size_t)unit_size * (th->off);
-			}
-			if (part_start + cover_num > block_lost)
-				cover_num = block_lost - part_start;
-			//printf("part_start = %d, src_num = %d / %d, cover_num = %d\n", part_start, src_num, source_num, cover_num);
+		// part_now ごとに処理する
+		part_off = 0;
+		part_now = part_num;
+		while (part_off < block_lost){
+			if (part_off + part_now > block_lost)
+				part_now = block_lost - part_off;
 
 			// スレッドごとに消失ブロックを計算する
-			th->mat = mat + (th->off);
-			th->size = cover_num;
-			th->count = part_start;
-			th->now = -1;	// 初期値 - 1
-			//_mm_sfence();	// メモリーへの書き込みを完了してからスレッドを再開する
-			for (j = 0; j < cpu_num; j++){
-				ResetEvent(hEnd[j]);	// リセットしておく
-				SetEvent(hRun[j]);	// サブ・スレッドに計算を開始させる
-			}
+			th->count = part_off;
+			th->size = part_now;
+			if (part_off > 0)
+				src_off = 0;	// 最初の計算以降は全てのソース・ブロックを対象にする
+			src_num = src_max;	// 一度に処理するソース・ブロックの数を制限する
+#ifdef TIMER
+			printf("part_off = %d, part_now = %d, src_off = %d\n", part_off, part_now, src_off);
+#endif
+			while (src_off < source_num){
+				// ソース・ブロックを何個ずつ処理するか
+				if (src_off + src_num * 2 - 1 >= source_num)
+					src_num = source_num - src_off;
+				//printf("src_off = %d, src_num = %d\n", src_off, src_num);
 
-			// サブ・スレッドの計算終了の合図を UPDATE_TIME だけ待つ
-			while (WaitForMultipleObjects(cpu_num, hEnd, TRUE, UPDATE_TIME) == WAIT_TIMEOUT){
-				// th-now が最高値なので、計算が終わってるのは th-now - cpu_num 個となる
-				j = th->now - cpu_num;
-				if (j < 0)
-					j = 0;
-				j /= chunk_num;	// chunk数で割ってブロック数にする
-				// 経過表示（UPDATE_TIME 時間待った場合なので、必ず経過してるはず）
-				if (print_progress((int)(((prog_num + src_num * j) * 1000) / prog_base))){
-					err = 2;
-					goto error_end;
+				th->mat = mat + src_off;
+				th->buf = buf + (size_t)unit_size * src_off;
+				th->off = src_off;
+				th->len = src_num;
+				th->now = -1;	// 初期値 - 1
+				//_mm_sfence();	// メモリーへの書き込みを完了してからスレッドを再開する
+				for (j = 0; j < cpu_num; j++){
+					ResetEvent(hEnd[j]);	// リセットしておく
+					SetEvent(hRun[j]);	// サブ・スレッドに計算を開始させる
 				}
-				time_last = GetTickCount();
+
+				// サブ・スレッドの計算終了の合図を UPDATE_TIME だけ待つ
+				while (WaitForMultipleObjects(cpu_num, hEnd, TRUE, UPDATE_TIME) == WAIT_TIMEOUT){
+					// th-now が最高値なので、計算が終わってるのは th-now + 1 - cpu_num 個となる
+					j = th->now + 1 - cpu_num;
+					if (j < 0)
+						j = 0;
+					j /= chunk_num;	// chunk数で割ってブロック数にする
+					// 経過表示（UPDATE_TIME 時間待った場合なので、必ず経過してるはず）
+					if (print_progress((int)(((prog_num + src_num * j) * 1000) / prog_base))){
+						err = 2;
+						goto error_end;
+					}
+					time_last = GetTickCount();
+				}
+
+				// 経過表示
+				prog_num += src_num * part_now;
+				if (GetTickCount() - time_last >= UPDATE_TIME){
+					if (print_progress((int)((prog_num * 1000) / prog_base))){
+						err = 2;
+						goto error_end;
+					}
+					time_last = GetTickCount();
+				}
+
+				src_off += src_num;
 			}
-			prog_num += src_num * cover_num;
 
 #ifdef TIMER
 time_start = GetTickCount();
 #endif
 			// 復元されたブロックを書き込む
 			work_buf = p_buf;
-			for (i = part_start; i < part_start + cover_num; i++){
+			for (i = part_off; i < part_off + part_now; i++){
 				for (j = recv_now + 1; j < source_num; j++){	// 何番のソース・ブロックか
 					if (s_blk[j].exist == 0){
 						recv_now = j;
@@ -975,7 +938,7 @@ write_count++;
 time_write += GetTickCount() - time_start;
 #endif
 
-			part_start += part_num;	// 次の消失ブロック位置にする
+			part_off += part_num;	// 次の消失ブロック位置にする
 		}
 
 		block_off += io_size;
@@ -984,12 +947,13 @@ time_write += GetTickCount() - time_start;
 		hFile = NULL;
 	}
 	print_progress_done();
-	//printf("prog_num = %I64d / %I64d\n", prog_num, prog_base);
 
 #ifdef TIMER
 printf("read   %d.%03d sec\n", time_read / 1000, time_read % 1000);
 j = ((block_size + io_size - 1) / io_size) * block_lost;
 printf("write  %d.%03d sec, count = %d/%d\n", time_write / 1000, time_write % 1000, write_count, j);
+if (prog_num != prog_base)
+	printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base);
 #endif
 
 error_end:
@@ -1019,10 +983,11 @@ int decode_method3(	// 復元するブロックを全て保持できる場合
 {
 	unsigned char *buf = NULL, *p_buf, *work_buf, *hash;
 	unsigned short *id;
-	int err = 0, i, j, last_file, source_off, read_num, recv_now, parity_now;
-	int src_num, chunk_num;
+	int err = 0, i, j, last_file, chunk_num;
+	int source_off, read_num, recv_now, parity_now;
+	int cpu_num1, src_off, src_num, src_max;
 	unsigned int unit_size, len;
-	unsigned int time_last, prog_write;
+	unsigned int time_last, prog_read, prog_write;
 	__int64 file_off, prog_num = 0, prog_base;
 	HANDLE hFile = NULL;
 	HANDLE hSub[MAX_CPU], hRun[MAX_CPU], hEnd[MAX_CPU];
@@ -1033,7 +998,7 @@ int decode_method3(	// 復元するブロックを全て保持できる場合
 	unit_size = (block_size + HASH_SIZE + (sse_unit - 1)) & ~(sse_unit - 1);	// チェックサムの分だけ増やす
 
 	// 作業バッファーを確保する
-	read_num = read_block_num(block_lost, 0, 1, sse_unit);	// ソース・ブロックを何個読み込むか
+	read_num = read_block_num(block_lost, 1, sse_unit);	// ソース・ブロックを何個読み込むか
 	if (read_num == 0){
 		//printf("cannot keep enough blocks, use another method\n");
 		return -2;	// スライスを分割して処理しないと無理
@@ -1047,26 +1012,37 @@ int decode_method3(	// 復元するブロックを全て保持できる場合
 		err = 1;
 		goto error_end;
 	}
+	//memset(buf, 0xFF, (size_t)file_off);	// 後から 0 埋めしてるかの実験用
 	p_buf = buf + (size_t)unit_size * read_num;	// パリティ・ブロックを記録する領域
 	hash = p_buf + (size_t)unit_size * block_lost;
-	prog_write = source_num >> 5;	// 計算で 97%、書き込みで 3% ぐらい
-	if (prog_write == 0)
-		prog_write = 1;
-	prog_base = (__int64)(source_num + prog_write) * block_lost;	// ブロックの合計掛け算個数 + 書き込み回数
+	prog_read = (block_lost + 31) / 32;	// 読み書きの経過をそれぞれ 3% ぐらいにする
+	prog_write = (source_num + 31) / 32;
+	prog_base = (__int64)(source_num + prog_write) * block_lost + prog_read * source_num;	// ブロックの合計掛け算個数 + 読み書き回数
 	len = try_cache_blocking(unit_size);
 	chunk_num = (unit_size + len - 1) / len;
+	cpu_num1 = 0;	// 読み込み中はスレッド数を減らす（シングル・スレッドの時は 0にする）
+	i = 1;
+	while (i * 2 <= cpu_num){	// 1=0, 2~3=1, 4~7=2, 8~15=3, 16~31=4, 32=5
+		cpu_num1++;
+		i *= 2;
+	}
+	if (cpu_num1 > block_lost)
+		cpu_num1 = block_lost;
+	src_max = cpu_cache & 0xFFFE;	// CPU cache 最適化のため、同時に処理するブロック数を制限する
+	if ((src_max < 8) || (cpu_num == 1))
+		src_max = 0x8000;	// 不明または少な過ぎる場合は、制限しない
 #ifdef TIMER
 	printf("\n read some blocks, and keep all recovering blocks\n");
 	printf("buffer size = %I64d MB, read_num = %d, round = %d\n", file_off >> 20, read_num, (source_num + read_num - 1) / read_num);
-	printf("cache: limit size = %d, chunk_size = %d, split = %d\n", cpu_cache & 0x7FFF8000, len, chunk_num);
-	printf("prog_base = %I64d, unit_size = %d\n", prog_base, unit_size);
+	printf("cache: limit size = %d, chunk_size = %d, split = %d\n", cpu_flag & 0x7FFF0000, len, chunk_num);
+	printf("unit_size = %d, cpu_num1 = %d, src_max = %d\n", unit_size, cpu_num1, src_max);
 #endif
 
 	// マルチ・スレッドの準備をする
 	th->buf = p_buf;
 	th->size = unit_size;
 	th->count = block_lost;
-	th->off = len;	// キャッシュの最適化を試みる
+	th->len = len;	// キャッシュの最適化を試みる
 	for (j = 0; j < cpu_num; j++){	// サブ・スレッドごとに
 		hRun[j] = CreateEvent(NULL, FALSE, FALSE, NULL);	// Auto Reset にする
 		if (hRun[j] == NULL){
@@ -1098,8 +1074,6 @@ int decode_method3(	// 復元するブロックを全て保持できる場合
 		}
 		WaitForSingleObject(hEnd[j], INFINITE);	// 設定終了の合図を待つ (リセットしない)
 	}
-	// IO が延滞しないように、サブ・スレッド一つの優先度を下げる
-	SetThreadPriority(hSub[0], THREAD_PRIORITY_BELOW_NORMAL);
 
 	// 何回かに別けてブロックを読み込んで、消失ブロックを少しずつ復元する
 	print_progress_text(0, "Recovering slice");
@@ -1111,7 +1085,7 @@ int decode_method3(	// 復元するブロックを全て保持できる場合
 		if (read_num > source_num - source_off)
 			read_num = source_num - source_off;
 		th->size = 0;	// 1st decode
-		th->off = source_off - 1;	// まだ計算して無い印
+		src_off = source_off - 1;	// まだ計算して無い印
 
 #ifdef TIMER
 read_count = 0;
@@ -1177,9 +1151,15 @@ read_count++;
 #endif
 			}
 
-			if (i + 1 < read_num){	// 最後のブロック以外なら
+			if (src_off < 0){
+				src_num = i + 1;	// 最後のブロックより前なら
+			} else {
+				src_num = i / (src_off + 1);	// だいたい何ブロック読むごとに計算が終わるか
+				src_num += i + 1;	// 次のブロック番号を足す
+			}
+			if (src_num < read_num){	// 読み込みが終わる前に計算が終わりそうなら
 				// サブ・スレッドの動作状況を調べる
-				j = WaitForMultipleObjects((cpu_num + 1) / 2, hEnd, TRUE, 0);
+				j = WaitForMultipleObjects(cpu_num1, hEnd, TRUE, 0);
 				if ((j != WAIT_TIMEOUT) && (j != WAIT_FAILED)){	// 計算中でないなら
 					// 経過表示
 					prog_num += block_lost;
@@ -1191,17 +1171,28 @@ read_count++;
 						time_last = GetTickCount();
 					}
 					// 計算終了したブロックの次から計算を開始する
-					th->off += 1;
-					th->buf = buf + (size_t)unit_size * (th->off - source_off);
-					th->mat = mat + th->off;
+					src_off += 1;
+					th->buf = buf + (size_t)unit_size * (src_off - source_off);
+					th->mat = mat + src_off;
+					th->off = src_off;
 					th->now = -1;	// 初期値 - 1
 					//_mm_sfence();	// メモリーへの書き込みを完了してからスレッドを再開する
-					for (j = 0; j < (cpu_num + 1) / 2; j++){
+					for (j = 0; j < cpu_num1; j++){
 						ResetEvent(hEnd[j]);	// リセットしておく
 						SetEvent(hRun[j]);	// サブ・スレッドに計算を開始させる
 					}
 				}
 			}
+
+			// 経過表示
+			prog_num += prog_read;
+			if (GetTickCount() - time_last >= UPDATE_TIME){
+				if (print_progress((int)((prog_num * 1000) / prog_base))){
+					err = 2;
+					goto error_end;
+				}
+				time_last = GetTickCount();
+			}
 		}
 		if (hFile){	// 最後の読み込みファイルを閉じる
 			CloseHandle(hFile);
@@ -1211,58 +1202,66 @@ read_count++;
 time_read += GetTickCount() - time_start;
 #endif
 
-		WaitForMultipleObjects((cpu_num + 1) / 2, hEnd, TRUE, INFINITE);	// サブ・スレッドの計算終了の合図を待つ
-		th->off += 1;	// 計算を開始するソース・ブロックの番号
-		if (th->off == 0)	// エラーや実験時以外は th->off は 0 にならない
+		WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE);	// サブ・スレッドの計算終了の合図を待つ
+		src_off += 1;	// 計算を開始するソース・ブロックの番号
+		if (src_off == 0)	// 1st decode しなかった場合（src_off = 0）は、消失ブロックをゼロ埋めする
 			memset(p_buf, 0, (size_t)unit_size * block_lost);
 #ifdef TIMER
-		j = (th->off - source_off) * 1000 / read_num;
-		printf("partial decode = %d (%d.%d%%), read = %d\n", th->off - source_off, j / 10, j % 10, read_count);
+		j = (src_off - source_off) * 1000 / read_num;
+		printf("partial decode = %d / %d (%d.%d%%), source_off = %d, read = %d\n", src_off - source_off, read_num, j / 10, j % 10, source_off, read_count);
 #endif
 		recv_now = -1;	// 消失ブロックの本来のソース番号
 		last_file = -1;
 
 		// スレッドごとに消失ブロックを計算する
-		src_num = read_num - (th->off - source_off);	// 一度に処理する量 (src_num > 0)
-		th->buf = buf + (size_t)unit_size * (th->off - source_off);
-		th->mat = mat + th->off;
-		// th->off はソース・ブロックの番号
-		th->size = src_num;
-		th->now = -1;	// 初期値 - 1
-		//_mm_sfence();	// メモリーへの書き込みを完了してからスレッドを再開する
-		for (j = 0; j < cpu_num; j++){
-			ResetEvent(hEnd[j]);	// リセットしておく
-			SetEvent(hRun[j]);	// サブ・スレッドに計算を開始させる
-		}
+		src_num = src_max;	// 一度に処理するソース・ブロックの数を制限する
+		while (src_off < source_off + read_num){
+			// ソース・ブロックを何個ずつ処理するか
+			if (src_off + src_num * 2 - 1 >= source_off + read_num)
+				src_num = source_off + read_num - src_off;
+			//printf("src_off = %d, src_num = %d\n", src_off, src_num);
 
-		// サブ・スレッドの計算終了の合図を UPDATE_TIME だけ待つ
-		while (WaitForMultipleObjects(cpu_num, hEnd, TRUE, UPDATE_TIME) == WAIT_TIMEOUT){
-			// th-now が最高値なので、計算が終わってるのは th-now - cpu_num 個となる
-			j = th->now - cpu_num;
-			if (j < 0)
-				j = 0;
-			j /= chunk_num;	// chunk数で割ってブロック数にする
-			// 経過表示（UPDATE_TIME 時間待った場合なので、必ず経過してるはず）
-			if (print_progress((int)(((prog_num + src_num * j) * 1000) / prog_base))){
-				err = 2;
-				goto error_end;
+			th->buf = buf + (size_t)unit_size * (src_off - source_off);
+			th->mat = mat + src_off;
+			th->off = src_off;	// ソース・ブロックの開始番号
+			th->size = src_num;
+			th->now = -1;	// 初期値 - 1
+			//_mm_sfence();	// メモリーへの書き込みを完了してからスレッドを再開する
+			for (j = 0; j < cpu_num; j++){
+				ResetEvent(hEnd[j]);	// リセットしておく
+				SetEvent(hRun[j]);	// サブ・スレッドに計算を開始させる
 			}
-			time_last = GetTickCount();
-		}
 
-		// 経過表示
-		prog_num += src_num * block_lost;
-		if (GetTickCount() - time_last >= UPDATE_TIME){
-			if (print_progress((int)((prog_num * 1000) / prog_base))){
-				err = 2;
-				goto error_end;
+			// サブ・スレッドの計算終了の合図を UPDATE_TIME だけ待つ
+			while (WaitForMultipleObjects(cpu_num, hEnd, TRUE, UPDATE_TIME) == WAIT_TIMEOUT){
+				// th-now が最高値なので、計算が終わってるのは th-now + 1 - cpu_num 個となる
+				j = th->now + 1 - cpu_num;
+				if (j < 0)
+					j = 0;
+				j /= chunk_num;	// chunk数で割ってブロック数にする
+				// 経過表示（UPDATE_TIME 時間待った場合なので、必ず経過してるはず）
+				if (print_progress((int)(((prog_num + src_num * j) * 1000) / prog_base))){
+					err = 2;
+					goto error_end;
+				}
+				time_last = GetTickCount();
 			}
-			time_last = GetTickCount();
+
+			// 経過表示
+			prog_num += src_num * block_lost;
+			if (GetTickCount() - time_last >= UPDATE_TIME){
+				if (print_progress((int)((prog_num * 1000) / prog_base))){
+					err = 2;
+					goto error_end;
+				}
+				time_last = GetTickCount();
+			}
+
+			src_off += src_num;
 		}
 
 		source_off += read_num;
 	}
-	//printf("\nprog_num = %I64d / %I64d\n", prog_num, prog_base);
 
 #ifdef TIMER
 time_start = GetTickCount();
@@ -1330,11 +1329,12 @@ time_write += GetTickCount() - time_start;
 	CloseHandle(hFile);
 	hFile = NULL;
 	print_progress_done();
-	//printf("prog_num = %I64d / %I64d\n", prog_num, prog_base);
 
 #ifdef TIMER
 printf("read   %d.%03d sec\n", time_read / 1000, time_read % 1000);
 printf("write  %d.%03d sec\n", time_write / 1000, time_write % 1000);
+if (prog_num != prog_base)
+	printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base);
 #endif
 
 error_end:
@@ -1366,10 +1366,10 @@ int decode_method4(	// 全てのブロックを断片的に保持する場合 (G
 {
 	unsigned char *buf = NULL, *p_buf, *work_buf, *hash;
 	unsigned short *id;
-	int err = 0, i, j, last_file, recv_now;
-	int cpu_num1, cover_max, cover_from, cover_num;
+	int err = 0, i, j, last_file, chunk_num, recv_now;
+	int cpu_num1, src_off, src_num, src_max, vram_max;
 	unsigned int io_size, unit_size, len, block_off;
-	unsigned int time_last, prog_write;
+	unsigned int time_last, prog_read, prog_write;
 	__int64 file_off, prog_num = 0, prog_base;
 	HANDLE hFile = NULL;
 	HANDLE hSub[MAX_CPU], hRun[MAX_CPU], hEnd[MAX_CPU];
@@ -1377,13 +1377,10 @@ int decode_method4(	// 全てのブロックを断片的に保持する場合 (G
 
 	memset(hSub, 0, sizeof(HANDLE) * MAX_CPU);
 	id = mat + (block_lost * source_num);	// 何番目の消失ソース・ブロックがどのパリティで代替されるか
-	cpu_num1 = cpu_num;	// 最後のスレッドを GPU 管理用にする
-	if (cpu_num == 1)
-		cpu_num1++;
 
-	// 作業バッファーを確保する（GPU の作業領域として2個の余裕を見ておく）
+	// 作業バッファーを確保する
 	// part_num を使わず、全てのブロックを保持する所がdecode_method2と異なることに注意！
-	io_size = get_io_size(source_num + block_lost + 2, NULL, 1, MEM_UNIT);
+	io_size = get_io_size(source_num + block_lost, NULL, 1, MEM_UNIT);
 	//io_size = (((io_size + 1) / 2 + HASH_SIZE + (MEM_UNIT - 1)) & ~(MEM_UNIT - 1)) - HASH_SIZE;	// 2分割の実験用
 	//io_size = (((io_size + 2) / 3 + HASH_SIZE + (MEM_UNIT - 1)) & ~(MEM_UNIT - 1)) - HASH_SIZE;	// 3分割の実験用
 	unit_size = io_size + HASH_SIZE;	// チェックサムの分だけ増やす
@@ -1397,19 +1394,32 @@ int decode_method4(	// 全てのブロックを断片的に保持する場合 (G
 	p_buf = buf + (size_t)unit_size * source_num;	// 復元したブロックを記録する領域
 	hash = p_buf + (size_t)unit_size * block_lost;
 	prog_base = (block_size + io_size - 1) / io_size;
-	prog_write = source_num >> 5;	// 計算で 97%、書き込みで 3% ぐらい
-	if (prog_write == 0)
-		prog_write = 1;
-	prog_base *= (__int64)(source_num + prog_write) * block_lost;	// 全体の断片の個数
+	prog_read = (block_lost + 31) / 32;	// 読み書きの経過をそれぞれ 3% ぐらいにする
+	prog_write = (source_num + 31) / 32;
+	prog_base *= (__int64)(source_num + prog_write) * block_lost + prog_read * source_num;	// 全体の断片の個数
+	len = try_cache_blocking(unit_size);
+	chunk_num = (unit_size + len - 1) / len;
+	cpu_num1 = 0;	// 読み込み中はスレッド数を減らす（シングル・スレッドの時は 0にする）
+	i = 1;
+	while (i * 2 <= cpu_num){	// 1=0, 2~3=1, 4~7=2, 8~15=3, 16~31=4, 32=5
+		cpu_num1++;
+		i *= 2;
+	}
+	if (cpu_num1 > block_lost)
+		cpu_num1 = block_lost;
+	src_max = cpu_cache & 0xFFFE;	// CPU cache 最適化のため、同時に処理するブロック数を制限する
+	if ((src_max < 8) || (cpu_num <= 2))
+		src_max = 0x8000;	// 不明または少な過ぎる場合は、制限しない
 #ifdef TIMER
 	printf("\n read all blocks, and keep all recovering blocks (GPU)\n");
 	printf("buffer size = %I64d MB, io_size = %d, split = %d\n", file_off >> 20, io_size, (block_size + io_size - 1) / io_size);
+	printf("cache: limit size = %d, chunk_size = %d, split = %d\n", cpu_flag & 0x7FFF0000, len, chunk_num);
+	printf("unit_size = %d, cpu_num1 = %d, src_max = %d\n", unit_size, cpu_num1, src_max);
 #endif
 
 	// OpenCL の初期化
-	cover_max = source_num;
-	len = 0;
-	i = init_OpenCL(unit_size, &cover_max, &len);
+	vram_max = source_num;
+	i = init_OpenCL(unit_size, len, &vram_max);
 	if (i != 0){
 		if (i != 3)	// GPU が見つからなかった場合はエラー表示しない
 			printf("init_OpenCL, %d, %d\n", i & 0xFF, i >> 8);
@@ -1421,19 +1431,16 @@ int decode_method4(	// 全てのブロックを断片的に保持する場合 (G
 		err = -2;	// CPU だけの方式に切り替える
 		goto error_end;
 	}
-	if (len == 0)	// GPUがキャッシュを使わない時だけ、CPU独自にキャッシュの最適化を試みる
-		len = try_cache_blocking(unit_size);
 #ifdef TIMER
-	printf("cache: limit size = %d, chunk_size = %d, split = %d\n", cpu_cache & 0x7FFF8000, len, (unit_size + len - 1) / len);
-	printf("prog_base = %I64d, unit_size = %d, method = %d, cover_max = %d\n", prog_base, unit_size, OpenCL_method, cover_max);
+	printf("OpenCL_method = %d, vram_max = %d\n", OpenCL_method, vram_max);
 #endif
 
 	// マルチ・スレッドの準備をする
 	th->buf = p_buf;
 	th->size = unit_size;
 	th->count = block_lost;
-	th->off = len;	// chunk size
-	for (j = 0; j < cpu_num1; j++){	// サブ・スレッドごとに
+	th->len = len;	// chunk size
+	for (j = 0; j < cpu_num; j++){	// サブ・スレッドごとに
 		hRun[j] = CreateEvent(NULL, FALSE, FALSE, NULL);	// Auto Reset にする
 		if (hRun[j] == NULL){
 			print_win32_err();
@@ -1452,12 +1459,11 @@ int decode_method4(	// 全てのブロックを断片的に保持する場合 (G
 		// サブ・スレッドを起動する
 		th->run = hRun[j];
 		th->end = hEnd[j];
-		th->now = j;	// スレッド番号
 		//_mm_sfence();	// メモリーへの書き込みを完了してからスレッドを起動する
-		if ((j == cpu_num1 - 1) && (OpenCL_method != 0)){	// 最後のスレッドを GPU 管理用にする
+		if ((j == cpu_num - 1) && (OpenCL_method != 0)){	// 最後のスレッドを GPU 管理用にする
 			hSub[j] = (HANDLE)_beginthreadex(NULL, STACK_SIZE, thread_decode_gpu, (LPVOID)th, 0, NULL);
 		} else {
-			hSub[j] = (HANDLE)_beginthreadex(NULL, STACK_SIZE, thread_decode_each, (LPVOID)th, 0, NULL);
+			hSub[j] = (HANDLE)_beginthreadex(NULL, STACK_SIZE, thread_decode3, (LPVOID)th, 0, NULL);
 		}
 		if (hSub[j] == NULL){
 			print_win32_err();
@@ -1469,8 +1475,7 @@ int decode_method4(	// 全てのブロックを断片的に保持する場合 (G
 		}
 		WaitForSingleObject(hEnd[j], INFINITE);	// 設定終了の合図を待つ (リセットしない)
 	}
-	// IO が延滞しないように、サブ・スレッド一つの優先度を下げる
-	SetThreadPriority(hSub[0], THREAD_PRIORITY_BELOW_NORMAL);
+	th->len = 0;	// GPUのエラー通知用にする
 
 	// ブロック断片を読み込んで、消失ブロック断片を復元する
 	print_progress_text(0, "Recovering slice");
@@ -1478,8 +1483,8 @@ int decode_method4(	// 全てのブロックを断片的に保持する場合 (G
 	wcscpy(file_path, base_dir);
 	block_off = 0;
 	while (block_off < block_size){
-		th->size = 0xFFFFFFFF;	// 1st decode
-		th->off = -1;	// まだ計算して無い印
+		th->size = 0;	// 1st decode
+		src_off = -1;	// まだ計算して無い印
 
 #ifdef TIMER
 read_count = 0;
@@ -1561,41 +1566,60 @@ read_count++;
 				}
 			}
 
-			if ((len > 0) && (i + 1 < source_num)){	// 最後のブロック以外なら
-				// サブ・スレッドの動作状況を調べる
-				j = WaitForMultipleObjects((cpu_num + 1) / 2, hEnd, TRUE, 0);
-				if ((j != WAIT_TIMEOUT) && (j != WAIT_FAILED)){	// 計算中でないなら
-					// 経過表示
-					prog_num += block_lost;
-					if (GetTickCount() - time_last >= UPDATE_TIME){
-						if (print_progress((int)((prog_num * 1000) / prog_base))){
-							err = 2;
-							goto error_end;
+			if (len > 0){
+				if (src_off < 0){
+					src_num = i + 1;	// 最後のブロックより前なら
+				} else {
+					src_num = i / (src_off + 1);	// だいたい何ブロック読むごとに計算が終わるか
+					src_num += i + 1;	// 次のブロック番号を足す
+				}
+				if (src_num < source_num){	// 読み込みが終わる前に計算が終わりそうなら
+					// サブ・スレッドの動作状況を調べる
+					j = WaitForMultipleObjects(cpu_num1, hEnd, TRUE, 0);
+					if ((j != WAIT_TIMEOUT) && (j != WAIT_FAILED)){	// 計算中でないなら
+						// 経過表示
+						prog_num += block_lost;
+						if (GetTickCount() - time_last >= UPDATE_TIME){
+							if (print_progress((int)((prog_num * 1000) / prog_base))){
+								err = 2;
+								goto error_end;
+							}
+							time_last = GetTickCount();
 						}
-						time_last = GetTickCount();
-					}
-					// 計算終了したブロックの次から計算を開始する
-					th->off += 1;
-					if (th->off > 0){	// バッファーに読み込んだ時だけ計算する
-						while ((s_blk[th->off].exist != 0) &&
-								((s_blk[th->off].size <= block_off) || (s_blk[th->off].exist == 3))){
-							prog_num += block_lost;
-							th->off += 1;
+						// 計算終了したブロックの次から計算を開始する
+						src_off += 1;
+						if (src_off > 0){	// バッファーに読み込んだ時だけ計算する
+							while ((s_blk[src_off].exist != 0) &&
+									((s_blk[src_off].size <= block_off) || (s_blk[src_off].exist == 3))){
+								prog_num += block_lost;
+								src_off += 1;
 #ifdef TIMER
 skip_count++;
 #endif
+							}
+						}
+						th->buf = buf + (size_t)unit_size * src_off;
+						th->mat = mat + src_off;
+						th->off = src_off;
+						th->now = -1;	// 初期値 - 1
+						//_mm_sfence();	// メモリーへの書き込みを完了してからスレッドを再開する
+						for (j = 0; j < cpu_num1; j++){
+							ResetEvent(hEnd[j]);	// リセットしておく
+							SetEvent(hRun[j]);	// サブ・スレッドに計算を開始させる
 						}
 					}
-					th->buf = buf + (size_t)unit_size * th->off;
-					th->mat = mat + th->off;
-					th->now = -1;	// 初期値 - 1
-					//_mm_sfence();	// メモリーへの書き込みを完了してからスレッドを再開する
-					for (j = 0; j < (cpu_num + 1) / 2; j++){
-						ResetEvent(hEnd[j]);	// リセットしておく
-						SetEvent(hRun[j]);	// サブ・スレッドに計算を開始させる
-					}
 				}
 			}
+
+			// 経過表示
+			prog_num += prog_read;
+			if (GetTickCount() - time_last >= UPDATE_TIME){
+				if (print_progress((int)((prog_num * 1000) / prog_base))){
+					err = 2;
+					goto error_end;
+				}
+				time_last = GetTickCount();
+			}
 		}
 		if (hFile){	// 最後の読み込みファイルを閉じる
 			CloseHandle(hFile);
@@ -1605,72 +1629,90 @@ skip_count++;
 time_read += GetTickCount() - time_start;
 #endif
 
-		WaitForMultipleObjects((cpu_num + 1) / 2, hEnd, TRUE, INFINITE);	// サブ・スレッドの計算終了の合図を待つ
-		th->size = 0;	// 2nd decode
-		th->off += 1;	// 計算を開始するソース・ブロックの番号
-		if (th->off > 0){	// 計算不要なソース・ブロックはとばす
-			while ((s_blk[th->off].exist != 0) &&
-					((s_blk[th->off].size <= block_off) || (s_blk[th->off].exist == 3))){
+		WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE);	// サブ・スレッドの計算終了の合図を待つ
+		src_off += 1;	// 計算を開始するソース・ブロックの番号
+		if (src_off > 0){	// 計算不要なソース・ブロックはとばす
+			while ((s_blk[src_off].exist != 0) &&
+					((s_blk[src_off].size <= block_off) || (s_blk[src_off].exist == 3))){
 				prog_num += block_lost;
-				th->off += 1;
+				src_off += 1;
 #ifdef TIMER
 skip_count++;
 #endif
 			}
-		} else {	// エラーや実験時以外は th->off は 0 にならない
+		} else {	// 1st decode しなかった場合（src_off = 0）は、消失ブロックをゼロ埋めする
 			memset(p_buf, 0, (size_t)unit_size * block_lost);
 		}
 #ifdef TIMER
-		j = (th->off * 1000) / source_num;
-		printf("partial decode = %d (%d.%d%%), read = %d, skip = %d\n", th->off, j / 10, j % 10, read_count, skip_count);
+		j = (src_off * 1000) / source_num;
+		printf("partial decode = %d / %d (%d.%d%%), read = %d, skip = %d\n", src_off, source_num, j / 10, j % 10, read_count, skip_count);
 #endif
 		recv_now = -1;	// 消失ブロックの本来のソース番号
 		last_file = -1;
 
-		// VRAM のサイズに応じて分割する
-		cover_from = th->off;
-		i = (source_num - cover_from + cover_max - 1) / cover_max;	// 何回に分けて処理するか
-		cover_num = (source_num - cover_from + i - 1) / i;	// 一度に処理する量を平均化する
-		while (cover_from < source_num){
+		// GPU と CPU のどちらに最適化するかが難しい
+		src_num = src_max;	// 一度に処理するソース・ブロックの数を制限する
+		if (src_num > vram_max){	// VRAM に収まらない場合は、VRAM のサイズに応じて分割する
+			src_num = vram_max & ~1;	// 減らして偶数にする（元が奇数なら分割数が増えるかも）
+			i = (source_num - src_off + src_num - 1) / src_num;	// 何回に分けて処理するか
+			src_num = (source_num - src_off + i - 1) / i;	// 一度に処理する量を平均化する
+			src_num = (src_num + 1) & ~1;	// 増やして偶数にする
+		}
+#ifdef TIMER
+		printf("remain = %d, src_off = %d, src_num = %d\n", source_num - src_off, src_off, src_num);
+#endif
+		while (src_off < source_num){
 			// ソース・ブロックを何個ずつ処理するか
-			if (cover_from + cover_num > source_num)
-				cover_num = source_num - cover_from;
-			//printf("cover_from = %d, cover_num = %d\n", cover_from, cover_num);
+			if (src_off + src_num > source_num){
+				src_num = source_num - src_off;
+#ifdef TIMER
+				printf("last1: src_off = %d, src_num = %d\n", src_off, src_num);
+#endif
+			} else if (src_off + src_num * 2 - 1 >= source_num){
+				src_num = source_num - src_off;
+				if (src_num > vram_max){	// VRAM のサイズまでにする
+					src_num = (src_num + 1) / 2;	// 半分にする
+					src_num = (src_num + 1) & ~1;	// 偶数にする
+				}
+#ifdef TIMER
+				printf("last2: src_off = %d, src_num = %d\n", src_off, src_num);
+#endif
+			}
 
 			// GPU と CPU がスレッドごとに消失ブロックを計算する
-			th->buf = buf + (size_t)unit_size * cover_from;
-			th->mat = mat + cover_from;
-			th->off = cover_from;
-			th->count = cover_num;
+			th->buf = buf + (size_t)unit_size * src_off;
+			th->mat = mat + src_off;
+			th->size = src_num;
 			th->now = -1;	// 初期値 - 1
 			//_mm_sfence();	// メモリーへの書き込みを完了してからスレッドを再開する
-			for (j = 0; j < cpu_num1; j++){
+			for (j = 0; j < cpu_num; j++){
 				ResetEvent(hEnd[j]);	// リセットしておく
 				SetEvent(hRun[j]);	// サブ・スレッドに計算を開始させる
 			}
 
 			// サブ・スレッドの計算終了の合図を UPDATE_TIME だけ待つ
-			while (WaitForMultipleObjects(cpu_num1, hEnd, TRUE, UPDATE_TIME) == WAIT_TIMEOUT){
-				// th-now が最高値なので、計算が終わってるのは th-now - cpu_num1 個となる
-				j = th->now - cpu_num1;
+			while (WaitForMultipleObjects(cpu_num, hEnd, TRUE, UPDATE_TIME) == WAIT_TIMEOUT){
+				// th-now が最高値なので、計算が終わってるのは th-now + 1 - cpu_num 個となる
+				j = th->now + 1 - cpu_num;
 				if (j < 0)
 					j = 0;
+				j /= chunk_num;	// chunk数で割ってブロック数にする
 				// 経過表示（UPDATE_TIME 時間待った場合なので、必ず経過してるはず）
-				if (print_progress((int)(((prog_num + cover_num * j) * 1000) / prog_base))){
+				if (print_progress((int)(((prog_num + src_num * j) * 1000) / prog_base))){
 					err = 2;
 					goto error_end;
 				}
 				time_last = GetTickCount();
 			}
-			if (th->size != 0){	// エラー発生
-				i = th->size;
+			if (th->len != 0){	// エラー発生
+				i = th->len;
 				printf("error, gpu-thread, %d, %d\n", i & 0xFF, i >> 8);
 				err = 1;
 				goto error_end;
 			}
 
 			// 経過表示
-			prog_num += cover_num * block_lost;
+			prog_num += src_num * block_lost;
 			if (GetTickCount() - time_last >= UPDATE_TIME){
 				if (print_progress((int)((prog_num * 1000) / prog_base))){
 					err = 2;
@@ -1679,7 +1721,7 @@ skip_count++;
 				time_last = GetTickCount();
 			}
 
-			cover_from += cover_num;
+			src_off += src_num;
 		}
 
 #ifdef TIMER
@@ -1763,18 +1805,19 @@ time_write += GetTickCount() - time_start;
 		hFile = NULL;
 	}
 	print_progress_done();
-	//printf("prog_num = %I64d / %I64d\n", prog_num, prog_base);
 
 #ifdef TIMER
 printf("read   %d.%03d sec\n", time_read / 1000, time_read % 1000);
 j = ((block_size + io_size - 1) / io_size) * block_lost;
 printf("write  %d.%03d sec, count = %d/%d\n", time_write / 1000, time_write % 1000, write_count, j);
+if (prog_num != prog_base)
+	printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base);
 #endif
 	info_OpenCL(buf, MEM_UNIT);	// デバイス情報を表示する
 
 error_end:
 	InterlockedExchange(&(th->now), INT_MAX / 2);	// サブ・スレッドの計算を中断する
-	for (j = 0; j < cpu_num1; j++){
+	for (j = 0; j < cpu_num; j++){
 		if (hSub[j]){	// サブ・スレッドを終了させる
 			SetEvent(hRun[j]);
 			WaitForSingleObject(hSub[j], INFINITE);
@@ -1802,10 +1845,11 @@ int decode_method5(	// 復元するブロックだけ保持する場合 (GPU対
 {
 	unsigned char *buf = NULL, *p_buf, *work_buf, *hash;
 	unsigned short *id;
-	int err = 0, i, j, last_file, source_off, read_num, recv_now, parity_now;
-	int cpu_num1, cover_max, cover_from, cover_num;
+	int err = 0, i, j, last_file, chunk_num, recv_now;
+	int source_off, read_num, parity_now;
+	int cpu_num1, src_off, src_num, src_max, vram_max;
 	unsigned int unit_size, len;
-	unsigned int time_last, prog_write;
+	unsigned int time_last, prog_read, prog_write;
 	__int64 file_off, prog_num = 0, prog_base;
 	HANDLE hFile = NULL;
 	HANDLE hSub[MAX_CPU], hRun[MAX_CPU], hEnd[MAX_CPU];
@@ -1814,12 +1858,9 @@ int decode_method5(	// 復元するブロックだけ保持する場合 (GPU対
 	memset(hSub, 0, sizeof(HANDLE) * MAX_CPU);
 	id = mat + (block_lost * source_num);	// 何番目の消失ソース・ブロックがどのパリティで代替されるか
 	unit_size = (block_size + HASH_SIZE + (MEM_UNIT - 1)) & ~(MEM_UNIT - 1);	// MEM_UNIT の倍数にする
-	cpu_num1 = cpu_num;	// 最後のスレッドを GPU 管理用にする
-	if (cpu_num == 1)
-		cpu_num1++;
 
-	// 作業バッファーを確保する（GPU の作業領域として2個の余裕を見ておく）
-	read_num = read_block_num(block_lost, 2, 1, MEM_UNIT);	// ソース・ブロックを何個読み込むか
+	// 作業バッファーを確保する
+	read_num = read_block_num(block_lost, 1, MEM_UNIT);	// ソース・ブロックを何個読み込むか
 	if (read_num == 0){
 		//printf("cannot keep enough blocks, use another method\n");
 		return -4;	// スライスを分割して処理しないと無理
@@ -1835,19 +1876,32 @@ int decode_method5(	// 復元するブロックだけ保持する場合 (GPU対
 	}
 	p_buf = buf + (size_t)unit_size * read_num;	// パリティ・ブロックを記録する領域
 	hash = p_buf + (size_t)unit_size * block_lost;
-	prog_write = source_num >> 5;	// 計算で 97%、書き込みで 3% ぐらい
-	if (prog_write == 0)
-		prog_write = 1;
-	prog_base = (__int64)(source_num + prog_write) * block_lost;	// ブロックの合計掛け算個数 + 書き込み回数
+	prog_read = (block_lost + 31) / 32;	// 読み書きの経過をそれぞれ 3% ぐらいにする
+	prog_write = (source_num + 31) / 32;
+	prog_base = (__int64)(source_num + prog_write) * block_lost + prog_read * source_num;	// ブロックの合計掛け算個数 + 書き込み回数
+	len = try_cache_blocking(unit_size);
+	chunk_num = (unit_size + len - 1) / len;
+	cpu_num1 = 0;	// 読み込み中はスレッド数を減らす（シングル・スレッドの時は 0にする）
+	i = 1;
+	while (i * 2 <= cpu_num){	// 1=0, 2~3=1, 4~7=2, 8~15=3, 16~31=4, 32=5
+		cpu_num1++;
+		i *= 2;
+	}
+	if (cpu_num1 > block_lost)
+		cpu_num1 = block_lost;
+	src_max = cpu_cache & 0xFFFE;	// CPU cache 最適化のため、同時に処理するブロック数を制限する
+	if ((src_max < 8) || (cpu_num <= 2))
+		src_max = 0x8000;	// 不明または少な過ぎる場合は、制限しない
 #ifdef TIMER
 	printf("\n read some blocks, and keep all recovering blocks (GPU)\n");
 	printf("buffer size = %I64d MB, read_num = %d, round = %d\n", file_off >> 20, read_num, (source_num + read_num - 1) / read_num);
+	printf("cache: limit size = %d, chunk_size = %d, split = %d\n", cpu_flag & 0x7FFF0000, len, chunk_num);
+	printf("unit_size = %d, cpu_num1 = %d, src_max = %d\n", unit_size, cpu_num1, src_max);
 #endif
 
 	// OpenCL の初期化
-	cover_max = read_num;	// 読み込める分だけにする
-	len = 0;
-	i = init_OpenCL(unit_size, &cover_max, &len);
+	vram_max = read_num;	// 読み込める分だけにする
+	i = init_OpenCL(unit_size, len, &vram_max);
 	if (i != 0){
 		if (i != 3)	// GPU が見つからなかった場合はエラー表示しない
 			printf("init_OpenCL, %d, %d\n", i & 0xFF, i >> 8);
@@ -1859,19 +1913,16 @@ int decode_method5(	// 復元するブロックだけ保持する場合 (GPU対
 		err = -3;	// CPU だけの方式に切り替える
 		goto error_end;
 	}
-	if (len == 0)	// GPUがキャッシュを使わない時だけ、CPU独自にキャッシュの最適化を試みる
-		len = try_cache_blocking(unit_size);
 #ifdef TIMER
-	printf("cache: limit size = %d, chunk_size = %d, split = %d\n", cpu_cache & 0x7FFF8000, len, (unit_size + len - 1) / len);
-	printf("prog_base = %I64d, unit_size = %d, method = %d, cover_max = %d\n", prog_base, unit_size, OpenCL_method, cover_max);
+	printf("OpenCL_method = %d, vram_max = %d\n", OpenCL_method, vram_max);
 #endif
 
 	// マルチ・スレッドの準備をする
 	th->buf = p_buf;
 	th->size = unit_size;
 	th->count = block_lost;
-	th->off = len;	// chunk size
-	for (j = 0; j < cpu_num1; j++){	// サブ・スレッドごとに
+	th->len = len;	// chunk size
+	for (j = 0; j < cpu_num; j++){	// サブ・スレッドごとに
 		hRun[j] = CreateEvent(NULL, FALSE, FALSE, NULL);	// Auto Reset にする
 		if (hRun[j] == NULL){
 			print_win32_err();
@@ -1890,12 +1941,11 @@ int decode_method5(	// 復元するブロックだけ保持する場合 (GPU対
 		// サブ・スレッドを起動する
 		th->run = hRun[j];
 		th->end = hEnd[j];
-		th->now = j;	// スレッド番号
 		//_mm_sfence();	// メモリーへの書き込みを完了してからスレッドを起動する
-		if ((j == cpu_num1 - 1) && (OpenCL_method != 0)){	// 最後のスレッドを GPU 管理用にする
+		if ((j == cpu_num - 1) && (OpenCL_method != 0)){	// 最後のスレッドを GPU 管理用にする
 			hSub[j] = (HANDLE)_beginthreadex(NULL, STACK_SIZE, thread_decode_gpu, (LPVOID)th, 0, NULL);
 		} else {
-			hSub[j] = (HANDLE)_beginthreadex(NULL, STACK_SIZE, thread_decode_each, (LPVOID)th, 0, NULL);
+			hSub[j] = (HANDLE)_beginthreadex(NULL, STACK_SIZE, thread_decode3, (LPVOID)th, 0, NULL);
 		}
 		if (hSub[j] == NULL){
 			print_win32_err();
@@ -1907,8 +1957,7 @@ int decode_method5(	// 復元するブロックだけ保持する場合 (GPU対
 		}
 		WaitForSingleObject(hEnd[j], INFINITE);	// 設定終了の合図を待つ (リセットしない)
 	}
-	// IO が延滞しないように、サブ・スレッド一つの優先度を下げる
-	SetThreadPriority(hSub[0], THREAD_PRIORITY_BELOW_NORMAL);
+	th->len = 0;	// GPUのエラー通知用にする
 
 	// 何回かに別けてブロックを読み込んで、消失ブロックを少しずつ復元する
 	print_progress_text(0, "Recovering slice");
@@ -1919,8 +1968,8 @@ int decode_method5(	// 復元するブロックだけ保持する場合 (GPU対
 	while (source_off < source_num){
 		if (read_num > source_num - source_off)
 			read_num = source_num - source_off;
-		th->size = 0xFFFFFFFF;	// 1st decode
-		th->off = source_off - 1;	// まだ計算して無い印
+		th->size = 0;	// 1st decode
+		src_off = source_off - 1;	// まだ計算して無い印
 
 #ifdef TIMER
 read_count = 0;
@@ -1986,9 +2035,15 @@ read_count++;
 #endif
 			}
 
-			if (i + 1 < read_num){	// 最後のブロック以外なら
+			if (src_off < 0){
+				src_num = i + 1;	// 最後のブロックより前なら
+			} else {
+				src_num = i / (src_off + 1);	// だいたい何ブロック読むごとに計算が終わるか
+				src_num += i + 1;	// 次のブロック番号を足す
+			}
+			if (src_num < read_num){	// 読み込みが終わる前に計算が終わりそうなら
 				// サブ・スレッドの動作状況を調べる
-				j = WaitForMultipleObjects((cpu_num + 1) / 2, hEnd, TRUE, 0);
+				j = WaitForMultipleObjects(cpu_num1, hEnd, TRUE, 0);
 				if ((j != WAIT_TIMEOUT) && (j != WAIT_FAILED)){	// 計算中でないなら
 					// 経過表示
 					prog_num += block_lost;
@@ -2000,17 +2055,28 @@ read_count++;
 						time_last = GetTickCount();
 					}
 					// 計算終了したブロックの次から計算を開始する
-					th->off += 1;
-					th->buf = buf + (size_t)unit_size * (th->off - source_off);
-					th->mat = mat + th->off;
+					src_off += 1;
+					th->buf = buf + (size_t)unit_size * (src_off - source_off);
+					th->mat = mat + src_off;
+					th->off = src_off;
 					th->now = -1;	// 初期値 - 1
 					//_mm_sfence();	// メモリーへの書き込みを完了してからスレッドを再開する
-					for (j = 0; j < (cpu_num + 1) / 2; j++){
+					for (j = 0; j < cpu_num1; j++){
 						ResetEvent(hEnd[j]);	// リセットしておく
 						SetEvent(hRun[j]);	// サブ・スレッドに計算を開始させる
 					}
 				}
 			}
+
+			// 経過表示
+			prog_num += prog_read;
+			if (GetTickCount() - time_last >= UPDATE_TIME){
+				if (print_progress((int)((prog_num * 1000) / prog_base))){
+					err = 2;
+					goto error_end;
+				}
+				time_last = GetTickCount();
+			}
 		}
 		if (hFile){	// 最後の読み込みファイルを閉じる
 			CloseHandle(hFile);
@@ -2020,62 +2086,81 @@ read_count++;
 time_read += GetTickCount() - time_start;
 #endif
 
-		WaitForMultipleObjects((cpu_num + 1) / 2, hEnd, TRUE, INFINITE);	// サブ・スレッドの計算終了の合図を待つ
-		th->size = 0;	// 2nd decode
-		th->off += 1;	// 計算を開始するソース・ブロックの番号
-		if (th->off == 0)	// エラーや実験時以外は th->off は 0 にならない
+		WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE);	// サブ・スレッドの計算終了の合図を待つ
+		src_off += 1;	// 計算を開始するソース・ブロックの番号
+		if (src_off == 0)	// 1st decode しなかった場合（src_off = 0）は、消失ブロックをゼロ埋めする
 			memset(p_buf, 0, (size_t)unit_size * block_lost);
 #ifdef TIMER
-		j = (th->off - source_off) * 1000 / read_num;
-		printf("partial decode = %d (%d.%d%%), read = %d\n", th->off - source_off, j / 10, j % 10, read_count);
+		j = (src_off - source_off) * 1000 / read_num;
+		printf("partial decode = %d / %d (%d.%d%%), source_off = %d, read = %d\n", src_off - source_off, read_num, j / 10, j % 10, source_off, read_count);
 #endif
 		recv_now = -1;	// 消失ブロックの本来のソース番号
 		last_file = -1;
 
-		// VRAM のサイズに応じて分割する
-		cover_from = th->off - source_off;
-		i = (read_num - cover_from + cover_max - 1) / cover_max;	// 何回に分けて処理するか
-		cover_num = (read_num - cover_from + i - 1) / i;	// 一度に処理する量を平均化する
-		while (cover_from < read_num){
+		// GPU と CPU のどちらに最適化するかが難しい
+		src_off -= source_off;	// バッファー内でのソース・ブロックの位置にする
+		src_num = src_max;	// 一度に処理するソース・ブロックの数を制限する
+		if (src_num > vram_max){	// VRAM に収まらない場合は、VRAM のサイズに応じて分割する
+			src_num = vram_max & ~1;	// 減らして偶数にする（元が奇数なら分割数が増えるかも）
+			i = (read_num - src_off + src_num - 1) / src_num;	// 何回に分けて処理するか
+			src_num = (read_num - src_off + i - 1) / i;	// 一度に処理する量を平均化する
+			src_num = (src_num + 1) & ~1;	// 増やして偶数にする
+		}
+#ifdef TIMER
+		printf("remain = %d, src_off = %d, src_num = %d\n", read_num - src_off, src_off, src_num);
+#endif
+		while (src_off < read_num){
 			// ソース・ブロックを何個ずつ処理するか
-			if (cover_from + cover_num > read_num)
-				cover_num = read_num - cover_from;
-			//printf("cover_from = %d, cover_num = %d\n", cover_from, cover_num);
+			if (src_off + src_num > read_num){
+				src_num = read_num - src_off;
+#ifdef TIMER
+				printf("last1: src_off = %d, src_num = %d\n", src_off, src_num);
+#endif
+			} else if (src_off + src_num * 2 - 1 >= read_num){
+				src_num = read_num - src_off;
+				if (src_num > vram_max){	// VRAM のサイズまでにする
+					src_num = (src_num + 1) / 2;	// 半分にする
+					src_num = (src_num + 1) & ~1;	// 偶数にする
+				}
+#ifdef TIMER
+				printf("last2: src_off = %d, src_num = %d\n", src_off, src_num);
+#endif
+			}
 
 			// GPU と CPU がスレッドごとに消失ブロックを計算する
-			th->buf = buf + (size_t)unit_size * cover_from;
-			th->mat = mat + (source_off + cover_from);
-			th->off = source_off + cover_from;	// ソース・ブロックの番号にする
-			th->count = cover_num;
+			th->buf = buf + (size_t)unit_size * src_off;
+			th->mat = mat + (source_off + src_off);	// ソース・ブロックの番号にする
+			th->size = src_num;
 			th->now = -1;	// 初期値 - 1
 			//_mm_sfence();	// メモリーへの書き込みを完了してからスレッドを再開する
-			for (j = 0; j < cpu_num1; j++){
+			for (j = 0; j < cpu_num; j++){
 				ResetEvent(hEnd[j]);	// リセットしておく
 				SetEvent(hRun[j]);	// サブ・スレッドに計算を開始させる
 			}
 
 			// サブ・スレッドの計算終了の合図を UPDATE_TIME だけ待つ
-			while (WaitForMultipleObjects(cpu_num1, hEnd, TRUE, UPDATE_TIME) == WAIT_TIMEOUT){
-				// th-now が最高値なので、計算が終わってるのは th-now - cpu_num1 個となる
-				j = th->now - cpu_num1;
+			while (WaitForMultipleObjects(cpu_num, hEnd, TRUE, UPDATE_TIME) == WAIT_TIMEOUT){
+				// th-now が最高値なので、計算が終わってるのは th-now + 1 - cpu_num 個となる
+				j = th->now + 1 - cpu_num;
 				if (j < 0)
 					j = 0;
+				j /= chunk_num;	// chunk数で割ってブロック数にする
 				// 経過表示（UPDATE_TIME 時間待った場合なので、必ず経過してるはず）
-				if (print_progress((int)(((prog_num + cover_num * j) * 1000) / prog_base))){
+				if (print_progress((int)(((prog_num + src_num * j) * 1000) / prog_base))){
 					err = 2;
 					goto error_end;
 				}
 				time_last = GetTickCount();
 			}
-			if (th->size != 0){	// エラー発生
-				i = th->size;
+			if (th->len != 0){	// エラー発生
+				i = th->len;
 				printf("error, gpu-thread, %d, %d\n", i & 0xFF, i >> 8);
 				err = 1;
 				goto error_end;
 			}
 
 			// 経過表示
-			prog_num += cover_num * block_lost;
+			prog_num += src_num * block_lost;
 			if (GetTickCount() - time_last >= UPDATE_TIME){
 				if (print_progress((int)((prog_num * 1000) / prog_base))){
 					err = 2;
@@ -2084,12 +2169,11 @@ time_read += GetTickCount() - time_start;
 				time_last = GetTickCount();
 			}
 
-			cover_from += cover_num;
+			src_off += src_num;
 		}
 
 		source_off += read_num;
 	}
-	//printf("\nprog_num = %I64d / %I64d\n", prog_num, prog_base);
 
 #ifdef TIMER
 time_start = GetTickCount();
@@ -2157,17 +2241,18 @@ time_write += GetTickCount() - time_start;
 	CloseHandle(hFile);
 	hFile = NULL;
 	print_progress_done();
-	//printf("prog_num = %I64d / %I64d\n", prog_num, prog_base);
 
 #ifdef TIMER
 printf("read   %d.%03d sec\n", time_read / 1000, time_read % 1000);
 printf("write  %d.%03d sec\n", time_write / 1000, time_write % 1000);
+if (prog_num != prog_base)
+	printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base);
 #endif
 	info_OpenCL(buf, MEM_UNIT);	// デバイス情報を表示する
 
 error_end:
 	InterlockedExchange(&(th->now), INT_MAX / 2);	// サブ・スレッドの計算を中断する
-	for (j = 0; j < cpu_num1; j++){
+	for (j = 0; j < cpu_num; j++){
 		if (hSub[j]){	// サブ・スレッドを終了させる
 			SetEvent(hRun[j]);
 			WaitForSingleObject(hSub[j], INFINITE);
diff --git a/source/par2j/rs_encode.c b/source/par2j/rs_encode.c
index bed1c0c..b538c89 100644
--- a/source/par2j/rs_encode.c
+++ b/source/par2j/rs_encode.c
@@ -1,5 +1,5 @@
 ﻿// rs_encode.c
-// Copyright : 2021-12-17 Yutaka Sawada
+// Copyright : 2023-09-21 Yutaka Sawada
 // License : GPL
 
 #ifndef _UNICODE
@@ -40,6 +40,7 @@ typedef struct {	// RS threading control struct
 	unsigned short *mat;	// 行列
 	unsigned char * volatile buf;
 	volatile unsigned int size;		// バイト数
+	volatile unsigned int len;
 	volatile int count;
 	volatile int off;
 	volatile int now;
@@ -51,10 +52,10 @@ typedef struct {	// RS threading control struct
 static DWORD WINAPI thread_encode2(LPVOID lpParameter)
 {
 	unsigned char *s_buf, *p_buf, *work_buf;
-	unsigned short *constant, factor2;
-	volatile unsigned short *factor1;
-	int i, j, src_start, src_num, max_num, chunk_num;
-	int part_start, part_num, cover_num;
+	unsigned short *constant, factor, factor2;
+	int i, j, max_num, chunk_num;
+	int part_off, part_num, part_now;
+	int src_off, src_num;
 	unsigned int unit_size, len, off, chunk_size;
 	HANDLE hRun, hEnd;
 	RS_TH *th;
@@ -67,14 +68,13 @@ unsigned int time_start2, time_encode2a = 0, time_encode2b = 0;
 	constant = th->mat;
 	p_buf = th->buf;
 	unit_size = th->size;
-	chunk_size = th->off;
+	chunk_size = th->len;
 	part_num = th->count;
 	hRun = th->run;
 	hEnd = th->end;
 	//_mm_sfence();
 	SetEvent(hEnd);	// 設定完了を通知する
 
-	factor1 = constant + source_num;
 	chunk_num = (unit_size + chunk_size - 1) / chunk_size;
 
 	WaitForSingleObject(hRun, INFINITE);	// 計算開始の合図を待つ
@@ -83,48 +83,60 @@ unsigned int time_start2, time_encode2a = 0, time_encode2b = 0;
 time_start2 = GetTickCount();
 #endif
 		s_buf = th->buf;
-		src_start = th->off;	// ソース・ブロック番号
-		len = chunk_size;
+		src_off = th->off;	// ソース・ブロック番号
 
 		if (th->size == 0){	// ソース・ブロック読み込み中
 			// パリティ・ブロックごとに掛け算して追加していく
-			max_num = chunk_num * part_num;
-			while ((j = InterlockedIncrement(&(th->now))) < max_num){	// j = ++th_now
-				off = j / part_num;	// chunk の番号
-				j = j % part_num;	// parity の番号
-				off *= chunk_size;
-				if (off + len > unit_size)
-					len = unit_size - off;	// 最後の chunk だけサイズが異なるかも
-				if (src_start == 0)	// 最初のブロックを計算する際に
-					memset(p_buf + ((size_t)unit_size * j + off), 0, len);	// ブロックを 0で埋める
-				galois_align_multiply(s_buf + off, p_buf + ((size_t)unit_size * j + off), len, factor1[j]);
+			while ((j = InterlockedIncrement(&(th->now))) < part_num){	// j = ++th_now
+				if (src_off == 0)	// 最初のブロックを計算する際に
+					memset(p_buf + (size_t)unit_size * j, 0, unit_size);	// ブロックを 0で埋める
+				factor = galois_power(constant[src_off], first_num + j);	// factor は定数行列の乗数になる
+				galois_align_multiply(s_buf, p_buf + (size_t)unit_size * j, unit_size, factor);
 #ifdef TIMER
 loop_count2a++;
 #endif
 			}
+
 #ifdef TIMER
 time_encode2a += GetTickCount() - time_start2;
 #endif
 		} else {	// パリティ・ブロックを部分的に保持する場合
 			// スレッドごとに作成するパリティ・ブロックの chunk を変える
-			src_num = source_num - src_start;
-			cover_num = th->size;
-			part_start = th->count;
-			max_num = chunk_num * cover_num;
+			src_num = th->len;
+			part_now = th->size;
+			part_off = th->count;
+			len = chunk_size;
+			max_num = chunk_num * part_now;
 			while ((j = InterlockedIncrement(&(th->now))) < max_num){	// j = ++th_now
-				off = j / cover_num;	// chunk の番号
-				j = j % cover_num;		// parity の番号
-				off *= chunk_size;		// chunk の位置
+				off = j / part_now;	// chunk の番号
+				j = j % part_now;	// parity の番号
+				off *= chunk_size;	// chunk の位置
 				if (off + len > unit_size)
 					len = unit_size - off;	// 最後の chunk だけサイズが異なるかも
 				work_buf = p_buf + (size_t)unit_size * j + off;
-				if (part_start != 0)
-					memset(work_buf, 0, len);	// 最初の part_num 以降は 2nd encode だけなので 0で埋める
+				if (src_off == 0)	// 最初のブロックを計算する際に
+					memset(work_buf, 0, len);	// パリティ・ブロックを 0で埋める
 
 				// ソース・ブロックごとにパリティを追加していく
-				for (i = 0; i < src_num; i++){
-					factor2 = galois_power(constant[src_start + i], first_num + part_start + j);	// factor は定数行列の乗数になる
-					galois_align_multiply(s_buf + ((size_t)unit_size * i + off), work_buf, len, factor2);
+				if (galois_align_multiply2 != NULL){	// ２ブロックずつ計算する場合 (SSSE3 か AVX2)
+					i = 0;
+					if (src_num & 1){	// 奇数なら最初の一個を計算して、残りを偶数に変える
+						factor = galois_power(constant[src_off + i], first_num + part_off + j);	// factor は定数行列の乗数になる
+						galois_align_multiply(s_buf + (size_t)unit_size * i + off, work_buf, len, factor);
+						i++;
+					}
+					for (; i < src_num; i += 2){
+						factor = galois_power(constant[src_off + i], first_num + part_off + j);	// 二つ連続で計算する
+						factor2 = galois_power(constant[src_off + i + 1], first_num + part_off + j);
+						galois_align_multiply2(s_buf + (size_t)unit_size * i + off, s_buf + (size_t)unit_size * (i + 1) + off,
+									work_buf, len, factor, factor2);
+					}
+
+				} else {	// 一つずつ計算する場合
+					for (i = 0; i < src_num; i++){
+						factor = galois_power(constant[src_off + i], first_num + part_off + j);	// factor は定数行列の乗数になる
+						galois_align_multiply(s_buf + (size_t)unit_size * i + off, work_buf, len, factor);
+					}
 				}
 #ifdef TIMER
 loop_count2b += src_num;
@@ -139,8 +151,7 @@ time_encode2b += GetTickCount() - time_start2;
 		WaitForSingleObject(hRun, INFINITE);	// 計算開始の合図を待つ
 	}
 #ifdef TIMER
-loop_count2a /= chunk_num;	// chunk数で割ってブロック数にする
-loop_count2b /= chunk_num;
+loop_count2b /= chunk_num;	// chunk数で割ってブロック数にする
 printf("sub-thread : total loop = %d\n", loop_count2a + loop_count2b);
 if (time_encode2a > 0){
 	i = (int)((__int64)loop_count2a * unit_size * 125 / ((__int64)time_encode2a * 131072));
@@ -166,9 +177,9 @@ printf(" 2nd encode %d.%03d sec, %d loop, %d MB/s\n", time_encode2b / 1000, time
 static DWORD WINAPI thread_encode3(LPVOID lpParameter)
 {
 	unsigned char *s_buf, *p_buf, *work_buf;
-	unsigned short *constant, factor2;
-	volatile unsigned short *factor1;
-	int i, j, src_start, src_num, max_num, chunk_num;
+	unsigned short *constant, factor, factor2;
+	int i, j, max_num, chunk_num;
+	int src_off, src_num;
 	unsigned int unit_size, len, off, chunk_size;
 	HANDLE hRun, hEnd;
 	RS_TH *th;
@@ -181,13 +192,12 @@ unsigned int time_start2, time_encode2a = 0, time_encode2b = 0;
 	constant = th->mat;
 	p_buf = th->buf;
 	unit_size = th->size;
-	chunk_size = th->off;
+	chunk_size = th->len;
 	hRun = th->run;
 	hEnd = th->end;
 	//_mm_sfence();
 	SetEvent(hEnd);	// 設定完了を通知する
 
-	factor1 = constant + source_num;
 	chunk_num = (unit_size + chunk_size - 1) / chunk_size;
 	max_num = chunk_num * parity_num;
 
@@ -197,20 +207,15 @@ unsigned int time_start2, time_encode2a = 0, time_encode2b = 0;
 time_start2 = GetTickCount();
 #endif
 		s_buf = th->buf;
-		src_start = th->off;	// ソース・ブロック番号
-		len = chunk_size;
+		src_off = th->off;	// ソース・ブロック番号
 
 		if (th->size == 0){	// ソース・ブロック読み込み中
 			// パリティ・ブロックごとに掛け算して追加していく
-			while ((j = InterlockedIncrement(&(th->now))) < max_num){	// j = ++th_now
-				off = j / parity_num;	// chunk の番号
-				j = j % parity_num;	// parity の番号
-				off *= chunk_size;
-				if (off + len > unit_size)
-					len = unit_size - off;	// 最後の chunk だけサイズが異なるかも
-				if (src_start == 0)	// 最初のブロックを計算する際に
-					memset(p_buf + ((size_t)unit_size * j + off), 0, len);	// ブロックを 0で埋める
-				galois_align_multiply(s_buf + off, p_buf + ((size_t)unit_size * j + off), len, factor1[j]);
+			while ((j = InterlockedIncrement(&(th->now))) < parity_num){	// j = ++th_now
+				if (src_off == 0)	// 最初のブロックを計算する際に
+					memset(p_buf + (size_t)unit_size * j, 0, unit_size);	// ブロックを 0で埋める
+				factor = galois_power(constant[src_off], first_num + j);	// factor は定数行列の乗数になる
+				galois_align_multiply(s_buf, p_buf + (size_t)unit_size * j, unit_size, factor);
 #ifdef TIMER
 loop_count2a++;
 #endif
@@ -221,6 +226,7 @@ time_encode2a += GetTickCount() - time_start2;
 		} else {	// 全てのパリティ・ブロックを保持する場合
 			// スレッドごとに作成するパリティ・ブロックの chunk を変える
 			src_num = th->size;
+			len = chunk_size;
 			while ((j = InterlockedIncrement(&(th->now))) < max_num){	// j = ++th_now
 				off = j / parity_num;	// chunk の番号
 				j = j % parity_num;		// parity の番号
@@ -230,9 +236,25 @@ time_encode2a += GetTickCount() - time_start2;
 				work_buf = p_buf + (size_t)unit_size * j + off;
 
 				// ソース・ブロックごとにパリティを追加していく
-				for (i = 0; i < src_num; i++){
-					factor2 = galois_power(constant[src_start + i], first_num + j);	// factor は定数行列の乗数になる
-					galois_align_multiply(s_buf + ((size_t)unit_size * i + off), work_buf, len, factor2);
+				if (galois_align_multiply2 != NULL){	// ２ブロックずつ計算する場合 (SSSE3 か AVX2)
+					i = 0;
+					if (src_num & 1){	// 奇数なら最初の一個を計算して、残りを偶数に変える
+						factor = galois_power(constant[src_off + i], first_num + j);	// factor は定数行列の乗数になる
+						galois_align_multiply(s_buf + (size_t)unit_size * i + off, work_buf, len, factor);
+						i++;
+					}
+					for (; i < src_num; i += 2){
+						factor = galois_power(constant[src_off + i], first_num + j);	// 二つ連続で計算する
+						factor2 = galois_power(constant[src_off + i + 1], first_num + j);
+						galois_align_multiply2(s_buf + (size_t)unit_size * i + off, s_buf + (size_t)unit_size * (i + 1) + off,
+									work_buf, len, factor, factor2);
+					}
+
+				} else {	// 一つずつ計算する場合
+					for (i = 0; i < src_num; i++){
+						factor = galois_power(constant[src_off + i], first_num + j);	// factor は定数行列の乗数になる
+						galois_align_multiply(s_buf + (size_t)unit_size * i + off, work_buf, len, factor);
+					}
 				}
 #ifdef TIMER
 loop_count2b += src_num;
@@ -247,8 +269,7 @@ time_encode2b += GetTickCount() - time_start2;
 		WaitForSingleObject(hRun, INFINITE);	// 計算開始の合図を待つ
 	}
 #ifdef TIMER
-loop_count2a /= chunk_num;	// chunk数で割ってブロック数にする
-loop_count2b /= chunk_num;
+loop_count2b /= chunk_num;	// chunk数で割ってブロック数にする
 printf("sub-thread : total loop = %d\n", loop_count2a + loop_count2b);
 if (time_encode2a > 0){
 	i = (int)((__int64)loop_count2a * unit_size * 125 / ((__int64)time_encode2a * 131072));
@@ -271,130 +292,14 @@ printf(" 2nd encode %d.%03d sec, %d loop, %d MB/s\n", time_encode2b / 1000, time
 	return 0;
 }
 
-// ブロックごとに計算するためのスレッド
-static DWORD WINAPI thread_encode_each(LPVOID lpParameter)
-{
-	unsigned char *s_buf, *p_buf, *work_buf;
-	unsigned short *constant, *factor2;
-	volatile unsigned short *factor1;
-	int i, j, th_id, src_start, src_num, max_num;
-	unsigned int unit_size, len, off, chunk_size;
-	HANDLE hRun, hEnd;
-	RS_TH *th;
-#ifdef TIMER
-unsigned int loop_count2a = 0, loop_count2b = 0;
-unsigned int time_start2, time_encode2a = 0, time_encode2b = 0;
-#endif
-
-	th = (RS_TH *)lpParameter;
-	constant = th->mat;
-	p_buf = th->buf;
-	unit_size = th->size;
-	th_id = th->now;	// スレッド番号
-	chunk_size = th->off;
-	factor2 = (unsigned short *)(p_buf + ((size_t)unit_size * parity_num + HASH_SIZE));
-	factor2 += th->count * th_id;	// スレッドごとに保存場所を変える
-	hRun = th->run;
-	hEnd = th->end;
-	//_mm_sfence();
-	SetEvent(hEnd);	// 設定完了を通知する
-
-	factor1 = constant + source_num;
-	max_num = ((unit_size + chunk_size - 1) / chunk_size) * parity_num;
-
-	WaitForSingleObject(hRun, INFINITE);	// 計算開始の合図を待つ
-	while (th->now < INT_MAX / 2){
-#ifdef TIMER
-time_start2 = GetTickCount();
-#endif
-		s_buf = th->buf;
-		src_start = th->off;	// ソース・ブロック番号
-
-		if (th->size == 0xFFFFFFFF){	// ソース・ブロック読み込み中
-			len = chunk_size;
-			// パリティ・ブロックごとに掛け算して追加していく
-			while ((j = InterlockedIncrement(&(th->now))) < max_num){	// j = ++th_now
-				off = j / parity_num;	// chunk の番号
-				j = j % parity_num;		// parity の番号
-				off *= chunk_size;
-				if (off + len > unit_size)
-					len = unit_size - off;	// 最後の chunk だけサイズが異なるかも
-				if (src_start == 0)	// 最初のブロックを計算する際に
-					memset(p_buf + ((size_t)unit_size * j + off), 0, len);	// ブロックを 0で埋める
-				galois_align_multiply(s_buf + off, p_buf + ((size_t)unit_size * j + off), len, factor1[j]);
-#ifdef TIMER
-loop_count2a++;
-#endif
-			}
-#ifdef TIMER
-time_encode2a += GetTickCount() - time_start2;
-#endif
-		} else {
-			// スレッドごとに作成するパリティ・ブロックを変える
-			src_num = th->count;
-			while ((j = InterlockedIncrement(&(th->now))) < parity_num){	// j = ++th_now
-				work_buf = p_buf + (size_t)unit_size * j;
-
-				// factor は定数行列の乗数になる
-				for (i = 0; i < src_num; i++)
-					factor2[i] = galois_power(constant[src_start + i], first_num + j);
-
-				// chunk に分割して計算する
-				len = chunk_size;
-				off = 0;
-				while (off < unit_size){
-					// ソース・ブロックごとにパリティを追加していく
-					for (i = 0; i < src_num; i++)
-						galois_align_multiply(s_buf + ((size_t)unit_size * i + off), work_buf, len, factor2[i]);
-
-					work_buf += len;
-					off += len;
-					if (off + len > unit_size)
-						len = unit_size - off;
-				}
-#ifdef TIMER
-loop_count2b += src_num;
-#endif
-			}
-#ifdef TIMER
-time_encode2b += GetTickCount() - time_start2;
-#endif
-		}
-		//_mm_sfence();	// メモリーへの書き込みを完了する
-		SetEvent(hEnd);	// 計算終了を通知する
-		WaitForSingleObject(hRun, INFINITE);	// 計算開始の合図を待つ
-	}
-#ifdef TIMER
-loop_count2a /= (unit_size + chunk_size - 1) / chunk_size;	// chunk数で割ってブロック数にする
-printf("sub-thread[%d] : total loop = %d\n", th_id, loop_count2a + loop_count2b);
-if (time_encode2a > 0){
-	i = (int)((__int64)loop_count2a * unit_size * 125 / ((__int64)time_encode2a * 131072));
-} else {
-	i = 0;
-}
-if (loop_count2a > 0)
-	printf(" 1st encode %d.%03d sec, %d loop, %d MB/s\n", time_encode2a / 1000, time_encode2a % 1000, loop_count2a, i);
-if (time_encode2b > 0){
-	i = (int)((__int64)loop_count2b * unit_size * 125 / ((__int64)time_encode2b * 131072));
-} else {
-	i = 0;
-}
-printf(" 2nd encode %d.%03d sec, %d loop, %d MB/s\n", time_encode2b / 1000, time_encode2b % 1000, loop_count2b, i);
-#endif
-
-	// 終了処理
-	CloseHandle(hRun);
-	CloseHandle(hEnd);
-	return 0;
-}
-
-// GPU 対応のサブ・スレッド (スレッド番号は最後になる)
+// GPU 対応のサブ・スレッド (最後のスレッドなので、1st encode では呼ばれない)
 static DWORD WINAPI thread_encode_gpu(LPVOID lpParameter)
 {
 	unsigned char *s_buf, *p_buf;
-	unsigned short *constant, *factor2;
-	int i, j, th_id, src_start, src_num;
-	unsigned int unit_size;
+	unsigned short *constant, *factor;
+	int i, j, max_num, chunk_num;
+	int src_off, src_num;
+	unsigned int unit_size, len, off, chunk_size;
 	HANDLE hRun, hEnd;
 	RS_TH *th;
 #ifdef TIMER
@@ -405,14 +310,16 @@ unsigned int time_start2, time_encode2 = 0, loop_count2 = 0;
 	constant = th->mat;
 	p_buf = th->buf;
 	unit_size = th->size;
-	th_id = th->now;	// スレッド番号
-	factor2 = (unsigned short *)(p_buf + ((size_t)unit_size * parity_num + HASH_SIZE));
-	factor2 += th->count * th_id;	// スレッドごとに保存場所を変える
+	chunk_size = th->len;
 	hRun = th->run;
 	hEnd = th->end;
 	//_mm_sfence();
 	SetEvent(hEnd);	// 設定完了を通知する
 
+	factor = constant + source_num;
+	chunk_num = (unit_size + chunk_size - 1) / chunk_size;
+	max_num = chunk_num * parity_num;
+
 	WaitForSingleObject(hRun, INFINITE);	// 計算開始の合図を待つ
 	while (th->now < INT_MAX / 2){
 #ifdef TIMER
@@ -420,27 +327,37 @@ time_start2 = GetTickCount();
 #endif
 		// GPUはソース・ブロック読み込み中に呼ばれない
 		s_buf = th->buf;
-		src_start = th->off;	// ソース・ブロック番号
-		src_num = th->count;
+		src_off = th->off;	// ソース・ブロック番号
+		src_num = th->size;
 
 		// 最初にソース・ブロックをVRAMへ転送する
 		i = gpu_copy_blocks(s_buf, unit_size, src_num);
 		if (i != 0){
-			th->size = i;
-			InterlockedExchange(&(th->now), INT_MAX / 2);	// サブ・スレッドの計算を中断する
+			th->len = i;
+			InterlockedExchange(&(th->now), INT_MAX / 3);	// サブ・スレッドの計算を中断する
 		}
 
-		// スレッドごとに作成するパリティ・ブロックを変える
-		while ((j = InterlockedIncrement(&(th->now))) < parity_num){	// j = ++th_now
+		// スレッドごとに作成するパリティ・ブロックの chunk を変える
+		len = chunk_size;
+		while ((j = InterlockedIncrement(&(th->now))) < max_num){	// j = ++th_now
+			off = j / parity_num;	// chunk の番号
+			j = j % parity_num;		// parity の番号
+			off *= chunk_size;		// chunk の位置
+			if (off + len > unit_size)
+				len = unit_size - off;	// 最後の chunk だけサイズが異なるかも
+
 			// factor は定数行列の乗数になる
 			for (i = 0; i < src_num; i++)
-				factor2[i] = galois_power(constant[src_start + i], first_num + j);
+				factor[i] = galois_power(constant[src_off + i], first_num + j);
 
-			i = gpu_multiply_blocks(src_num, factor2, p_buf + (size_t)unit_size * j, unit_size);
+			// VRAM上のソース・ブロックごとにパリティを追加していく
+			i = gpu_multiply_chunks(src_num, factor, p_buf + (size_t)unit_size * j + off, off, len);
 			if (i != 0){
-				th->size = i;
+				th->len = i;
+				InterlockedExchange(&(th->now), INT_MAX / 3);	// サブ・スレッドの計算を中断する
 				break;
 			}
+
 #ifdef TIMER
 loop_count2 += src_num;
 #endif
@@ -449,14 +366,17 @@ loop_count2 += src_num;
 time_encode2 += GetTickCount() - time_start2;
 #endif
 		// 最後にVRAMを解放する
-		th->size = gpu_finish();
+		i = gpu_finish();
+		if ((i != 0) && (th->len == 0))
+			th->len = i;	// 初めてエラーが発生した時だけセットする
 
 		//_mm_sfence();	// メモリーへの書き込みを完了する
 		SetEvent(hEnd);	// 計算終了を通知する
 		WaitForSingleObject(hRun, INFINITE);	// 計算開始の合図を待つ
 	}
 #ifdef TIMER
-printf("gpu-thread    : total loop = %d\n", loop_count2);
+loop_count2 /= chunk_num;	// chunk数で割ってブロック数にする
+printf("gpu-thread :\n");
 if (time_encode2 > 0){
 	i = (int)((__int64)loop_count2 * unit_size * 125 / ((__int64)time_encode2 * 131072));
 } else {
@@ -507,7 +427,7 @@ int encode_method1(	// ソース・ブロックが一個だけの場合
 	printf("\n read one source block, and keep one parity block\n");
 	printf("buffer size = %d MB, io_size = %d, split = %d\n", len >> 20, io_size, (block_size + io_size - 1) / io_size);
 	j = try_cache_blocking(unit_size);
-	printf("cache: limit size = %d, chunk_size = %d, split = %d\n", cpu_cache & 0x7FFF8000, j, (unit_size + j - 1) / j);
+	printf("cache: limit size = %d, chunk_size = %d, split = %d\n", cpu_flag & 0x7FFF0000, j, (unit_size + j - 1) / j);
 #endif
 
 	if (io_size < block_size){	// スライスが分割される場合だけ、途中までのハッシュ値を保持する
@@ -706,11 +626,11 @@ int encode_method2(	// ソース・データを全て読み込む場合
 	unsigned short *constant)	// 複数ブロック分の領域を確保しておく？
 {
 	unsigned char *buf = NULL, *p_buf, *work_buf, *hash;
-	unsigned short *factor1;
-	int err = 0, i, j, last_file, part_start, part_num;
-	int src_num, chunk_num, cover_num;
+	int err = 0, i, j, last_file, chunk_num;
+	int part_off, part_num, part_now;
+	int cpu_num1, src_off, src_num, src_max;
 	unsigned int io_size, unit_size, len, block_off;
-	unsigned int time_last, prog_write;
+	unsigned int time_last, prog_read, prog_write;
 	__int64 file_off, prog_num = 0, prog_base;
 	HANDLE hFile = NULL;
 	HANDLE hSub[MAX_CPU], hRun[MAX_CPU], hEnd[MAX_CPU];
@@ -718,19 +638,11 @@ int encode_method2(	// ソース・データを全て読み込む場合
 	PHMD5 md_ctx, *md_ptr = NULL;
 
 	memset(hSub, 0, sizeof(HANDLE) * MAX_CPU);
-	factor1 = constant + source_num;
 
 	// 作業バッファーを確保する
-	part_num = source_num >> PART_MAX_RATE;	// ソース・ブロック数に対する割合で最大量を決める
+	part_num = parity_num;	// 最大値を初期値にする
 	//part_num = (parity_num + 1) / 2;	// 確保量の実験用
 	//part_num = (parity_num + 2) / 3;	// 確保量の実験用
-	if (part_num < parity_num){	// 分割して計算するなら
-		i = (parity_num + part_num - 1) / part_num;	// 分割回数
-		part_num = (parity_num + i - 1) / i;
-		part_num = ((part_num + cpu_num - 1) / cpu_num) * cpu_num;	// cpu_num の倍数にする（切り上げ）
-	}
-	if (part_num > parity_num)
-		part_num = parity_num;
 	io_size = get_io_size(source_num, &part_num, 1, sse_unit);
 	//io_size = (((io_size + 1) / 2 + HASH_SIZE + (sse_unit - 1)) & ~(sse_unit - 1)) - HASH_SIZE;	// 2分割の実験用
 	//io_size = (((io_size + 2) / 3 + HASH_SIZE + (sse_unit - 1)) & ~(sse_unit - 1)) - HASH_SIZE;	// 3分割の実験用
@@ -742,21 +654,32 @@ int encode_method2(	// ソース・データを全て読み込む場合
 		err = 1;
 		goto error_end;
 	}
+	//memset(buf, 0xFF, (size_t)file_off);	// 後から 0 埋めしてるかの実験用
 	p_buf = buf + (size_t)unit_size * source_num;	// パリティ・ブロックを部分的に記録する領域
 	hash = p_buf + (size_t)unit_size * part_num;
 	prog_base = (block_size + io_size - 1) / io_size;
-	prog_write = source_num >> 5;	// 計算で 97%、書き込みで 3% ぐらい
-	if (prog_write == 0)
-		prog_write = 1;
-	prog_base *= (__int64)(source_num + prog_write) * parity_num;	// 全体の断片の個数
+	prog_read = (parity_num + 31) / 32;	// 読み書きの経過をそれぞれ 3% ぐらいにする
+	prog_write = (source_num + 31) / 32;
+	prog_base *= (__int64)(source_num + prog_write) * parity_num + prog_read * source_num;	// 全体の断片の個数
 	len = try_cache_blocking(unit_size);
 	//len = ((len + 2) / 3 + (sse_unit - 1)) & ~(sse_unit - 1);	// 1/3の実験用
 	chunk_num = (unit_size + len - 1) / len;
+	cpu_num1 = 0;	// 読み込み中はスレッド数を減らす（シングル・スレッドの時は 0にする）
+	i = 1;
+	while (i * 2 <= cpu_num){	// 1=0, 2~3=1, 4~7=2, 8~15=3, 16~31=4, 32=5
+		cpu_num1++;
+		i *= 2;
+	}
+	if (cpu_num1 > part_num)
+		cpu_num1 = part_num;
+	src_max = cpu_cache & 0xFFFE;	// CPU cache 最適化のため、同時に処理するブロック数を制限する
+	if ((src_max < 8) || (cpu_num == 1))
+		src_max = 0x8000;	// 不明または少な過ぎる場合は、制限しない
 #ifdef TIMER
 	printf("\n read all source blocks, and keep some parity blocks\n");
 	printf("buffer size = %I64d MB, io_size = %d, split = %d\n", file_off >> 20, io_size, (block_size + io_size - 1) / io_size);
-	printf("cache: limit size = %d, chunk_size = %d, split = %d\n", cpu_cache & 0x7FFF8000, len, chunk_num);
-	printf("prog_base = %I64d, unit_size = %d, part_num = %d\n", prog_base, unit_size, part_num);
+	printf("cache: limit size = %d, chunk_size = %d, chunk_num = %d\n", cpu_flag & 0x7FFF0000, len, chunk_num);
+	printf("unit_size = %d, part_num = %d, cpu_num1 = %d, src_max = %d\n", unit_size, part_num, cpu_num1, src_max);
 #endif
 
 	if (io_size < block_size){	// スライスが分割される場合だけ、途中までのハッシュ値を保持する
@@ -780,7 +703,7 @@ int encode_method2(	// ソース・データを全て読み込む場合
 	th->buf = p_buf;
 	th->size = unit_size;
 	th->count = part_num;
-	th->off = len;	// キャッシュの最適化を試みる
+	th->len = len;	// キャッシュの最適化を試みる
 	for (j = 0; j < cpu_num; j++){	// サブ・スレッドごとに
 		hRun[j] = CreateEvent(NULL, FALSE, FALSE, NULL);	// Auto Reset にする
 		if (hRun[j] == NULL){
@@ -812,8 +735,6 @@ int encode_method2(	// ソース・データを全て読み込む場合
 		}
 		WaitForSingleObject(hEnd[j], INFINITE);	// 設定終了の合図を待つ (リセットしない)
 	}
-	// IO が延滞しないように、サブ・スレッド一つの優先度を下げる
-	SetThreadPriority(hSub[0], THREAD_PRIORITY_BELOW_NORMAL);
 
 	// ソース・ブロック断片を読み込んで、パリティ・ブロック断片を作成する
 	time_last = GetTickCount();
@@ -821,7 +742,7 @@ int encode_method2(	// ソース・データを全て読み込む場合
 	block_off = 0;
 	while (block_off < block_size){
 		th->size = 0;	// 1st encode
-		th->off = -1;	// まだ計算して無い印
+		src_off = -1;	// まだ計算して無い印
 
 		// ソース・ブロックを読み込む
 #ifdef TIMER
@@ -870,9 +791,15 @@ time_start = GetTickCount();
 read_count++;
 #endif
 
-				if (i + 1 < source_num){	// 最後のブロック以外なら
+				if (src_off < 0){
+					src_num = i + 1;	// 最後のブロックより前なら
+				} else {
+					src_num = i / (src_off + 1);	// だいたい何ブロック読むごとに計算が終わるか
+					src_num += i + 1;	// 次のブロック番号を足す
+				}
+				if (src_num < source_num){	// 読み込みが終わる前に計算が終わりそうなら
 					// サブ・スレッドの動作状況を調べる
-					j = WaitForMultipleObjects((cpu_num + 1) / 2, hEnd, TRUE, 0);
+					j = WaitForMultipleObjects(cpu_num1, hEnd, TRUE, 0);
 					if ((j != WAIT_TIMEOUT) && (j != WAIT_FAILED)){	// 計算中でないなら
 						// 経過表示
 						prog_num += part_num;
@@ -884,22 +811,21 @@ read_count++;
 							time_last = GetTickCount();
 						}
 						// 計算終了したブロックの次から計算を開始する
-						th->off += 1;
-						if (th->off > 0){	// バッファーに読み込んだ時だけ計算する
-							while (s_blk[th->off].size <= block_off){
+						src_off += 1;
+						if (src_off > 0){	// バッファーに読み込んだ時だけ計算する
+							while (s_blk[src_off].size <= block_off){
 								prog_num += part_num;
-								th->off += 1;
+								src_off += 1;
 #ifdef TIMER
 skip_count++;
 #endif
 							}
 						}
-						th->buf = buf + (size_t)unit_size * th->off;
-						for (j = 0; j < part_num; j++)
-							factor1[j] = galois_power(constant[th->off], first_num + j);	// factor は定数行列の乗数になる
+						th->buf = buf + (size_t)unit_size * src_off;
+						th->off = src_off;
 						th->now = -1;	// 初期値 - 1
 						//_mm_sfence();
-						for (j = 0; j < (cpu_num + 1) / 2; j++){
+						for (j = 0; j < cpu_num1; j++){
 							ResetEvent(hEnd[j]);	// リセットしておく
 							SetEvent(hRun[j]);	// サブ・スレッドに計算を開始させる
 						}
@@ -908,6 +834,16 @@ skip_count++;
 			} else {
 				memset(buf + (size_t)unit_size * i, 0, unit_size);
 			}
+
+			// 経過表示
+			prog_num += prog_read;
+			if (GetTickCount() - time_last >= UPDATE_TIME){
+				if (print_progress((int)((prog_num * 1000) / prog_base))){
+					err = 2;
+					goto error_end;
+				}
+				time_last = GetTickCount();
+			}
 		}
 		// 最後のソース・ファイルを閉じる
 		CloseHandle(hFile);
@@ -916,24 +852,23 @@ skip_count++;
 time_read += GetTickCount() - time_start;
 #endif
 
-		WaitForMultipleObjects((cpu_num + 1) / 2, hEnd, TRUE, INFINITE);	// サブ・スレッドの計算終了の合図を待つ
-		th->off += 1;	// 計算を開始するソース・ブロックの番号
-		if (th->off > 0){
-			while (s_blk[th->off].size <= block_off){	// 計算不要なソース・ブロックはとばす
+		WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE);	// サブ・スレッドの計算終了の合図を待つ
+		src_off += 1;	// 計算を開始するソース・ブロックの番号
+		if (src_off > 0){
+			while (s_blk[src_off].size <= block_off){	// 計算不要なソース・ブロックはとばす
 				prog_num += part_num;
-				th->off += 1;
+				src_off += 1;
 #ifdef TIMER
 skip_count++;
 #endif
 			}
-		} else {	// エラーや実験時以外は th->off は 0 にならない
-			memset(p_buf, 0, (size_t)unit_size * part_num);
 		}
+		// 1st encode しなかった場合（src_off = 0）は、2nd encode で生成ブロックをゼロ埋めする
 #ifdef TIMER
-		j = (th->off * 1000) / source_num;
-		printf("partial encode = %d / %d (%d.%d%%), read = %d, skip = %d\n", th->off, source_num, j / 10, j % 10, read_count, skip_count);
+		j = (src_off * 1000) / source_num;
+		printf("partial encode = %d / %d (%d.%d%%), read = %d, skip = %d\n", src_off, source_num, j / 10, j % 10, read_count, skip_count);
 		// ここまでのパリティ・ブロックのチェックサムを検証する
-/*		if (th->off > 0){
+/*		if (src_off > 0){
 			for (j = 0; j < part_num; j++){
 				checksum16_return(p_buf + (size_t)unit_size * j, hash, io_size);
 				if (memcmp(p_buf + ((size_t)unit_size * j + io_size), hash, HASH_SIZE) != 0){
@@ -953,53 +888,72 @@ skip_count++;
 			len = io_size;
 		}
 
-		// cover_num ごとに処理する
-		part_start = 0;
-		cover_num = part_num;	// part_num は cpu_num の倍数にすること
-		src_num = source_num - th->off;	// 一度に処理する量 (src_num > 0)
-		th->buf = buf + (size_t)unit_size * (th->off);
-		while (part_start < parity_num){
-			if (part_start == part_num){	// part_num 分の計算が終わったら
-				th->off = 0;	// 最初の計算以降は全てのソース・ブロックを対象にする
-				src_num = source_num;	// source_num - th->off
-				th->buf = buf;	// buf + (size_t)unit_size * (th->off);
-			}
-			if (part_start + cover_num > parity_num)
-				cover_num = parity_num - part_start;
-			//printf("part_start = %d, src_num = %d / %d, cover_num = %d\n", part_start, src_num, source_num, cover_num);
+		// part_now ごとに処理する
+		part_off = 0;
+		part_now = part_num;
+		while (part_off < parity_num){
+			if (part_off + part_now > parity_num)
+				part_now = parity_num - part_off;
 
 			// スレッドごとにパリティ・ブロックを計算する
-			th->size = cover_num;
-			th->count = part_start;
-			th->now = -1;	// 初期値 - 1
-			//_mm_sfence();
-			for (j = 0; j < cpu_num; j++){
-				ResetEvent(hEnd[j]);	// リセットしておく
-				SetEvent(hRun[j]);	// サブ・スレッドに計算を開始させる
-			}
+			th->count = part_off;
+			th->size = part_now;
+			if (part_off > 0)
+				src_off = 0;	// 最初の計算以降は全てのソース・ブロックを対象にする
+			src_num = src_max;	// 一度に処理するソース・ブロックの数を制限する
+#ifdef TIMER
+			printf("part_off = %d, part_now = %d, src_off = %d\n", part_off, part_now, src_off);
+#endif
+			while (src_off < source_num){
+				// ソース・ブロックを何個ずつ処理するか
+				if (src_off + src_num * 2 - 1 >= source_num)
+					src_num = source_num - src_off;
+				//printf("src_off = %d, src_num = %d\n", src_off, src_num);
 
-			// サブ・スレッドの計算終了の合図を UPDATE_TIME だけ待ちながら、経過表示する
-			while (WaitForMultipleObjects(cpu_num, hEnd, TRUE, UPDATE_TIME) == WAIT_TIMEOUT){
-				// th-now が最高値なので、計算が終わってるのは th-now - cpu_num 個となる
-				j = th->now - cpu_num;
-				if (j < 0)
-					j = 0;
-				j /= chunk_num;	// chunk数で割ってブロック数にする
-				// 経過表示（UPDATE_TIME 時間待った場合なので、必ず経過してるはず）
-				if (print_progress((int)(((prog_num + src_num * j) * 1000) / prog_base))){
-					err = 2;
-					goto error_end;
+				th->buf = buf + (size_t)unit_size * src_off;
+				th->off = src_off;
+				th->len = src_num;
+				th->now = -1;	// 初期値 - 1
+				//_mm_sfence();
+				for (j = 0; j < cpu_num; j++){
+					ResetEvent(hEnd[j]);	// リセットしておく
+					SetEvent(hRun[j]);	// サブ・スレッドに計算を開始させる
 				}
-				time_last = GetTickCount();
+
+				// サブ・スレッドの計算終了の合図を UPDATE_TIME だけ待ちながら、経過表示する
+				while (WaitForMultipleObjects(cpu_num, hEnd, TRUE, UPDATE_TIME) == WAIT_TIMEOUT){
+					// th-now が最高値なので、計算が終わってるのは th-now + 1 - cpu_num 個となる
+					j = th->now + 1 - cpu_num;
+					if (j < 0)
+						j = 0;
+					j /= chunk_num;	// chunk数で割ってブロック数にする
+					// 経過表示（UPDATE_TIME 時間待った場合なので、必ず経過してるはず）
+					if (print_progress((int)(((prog_num + src_num * j) * 1000) / prog_base))){
+						err = 2;
+						goto error_end;
+					}
+					time_last = GetTickCount();
+				}
+
+				// 経過表示
+				prog_num += src_num * part_now;
+				if (GetTickCount() - time_last >= UPDATE_TIME){
+					if (print_progress((int)((prog_num * 1000) / prog_base))){
+						err = 2;
+						goto error_end;
+					}
+					time_last = GetTickCount();
+				}
+
+				src_off += src_num;
 			}
-			prog_num += src_num * cover_num;
 
 #ifdef TIMER
 time_start = GetTickCount();
 #endif
 			// パリティ・ブロックを書き込む
 			work_buf = p_buf;
-			for (i = part_start; i < part_start + cover_num; i++){
+			for (i = part_off; i < part_off + part_now; i++){
 				// パリティ・ブロックのチェックサムを検証する
 				checksum16_return(work_buf, hash, io_size);
 				if (memcmp(work_buf + io_size, hash, HASH_SIZE) != 0){
@@ -1047,13 +1001,12 @@ time_start = GetTickCount();
 time_write += GetTickCount() - time_start;
 #endif
 
-			part_start += part_num;	// 次のパリティ位置にする
+			part_off += part_num;	// 次のパリティ位置にする
 		}
 
 		block_off += io_size;
 	}
 	print_progress_done();	// 改行して行の先頭に戻しておく
-	//printf("prog_num = %I64d / %I64d\n", prog_num, prog_base);
 
 	// ファイルごとにブロックの CRC-32 を検証する
 	memset(buf, 0, io_size);
@@ -1114,6 +1067,8 @@ time_write += GetTickCount() - time_start;
 #ifdef TIMER
 printf("read   %d.%03d sec\n", time_read / 1000, time_read % 1000);
 printf("write  %d.%03d sec\n", time_write / 1000, time_write % 1000);
+if (prog_num != prog_base)
+	printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base);
 #endif
 
 error_end:
@@ -1150,11 +1105,11 @@ int encode_method3(	// パリティ・ブロックを全て保持して、一度
 	unsigned short *constant)
 {
 	unsigned char *buf = NULL, *p_buf;
-	unsigned short *factor1;
-	int err = 0, i, j, last_file, source_off, read_num, packet_off;
-	int src_num, chunk_num;
+	int err = 0, i, j, last_file, chunk_num;
+	int source_off, read_num, packet_off;
+	int cpu_num1, src_off, src_num, src_max;
 	unsigned int unit_size, len;
-	unsigned int time_last, prog_write;
+	unsigned int time_last, prog_read, prog_write;
 	__int64 prog_num = 0, prog_base;
 	size_t mem_size;
 	HANDLE hFile = NULL;
@@ -1163,11 +1118,10 @@ int encode_method3(	// パリティ・ブロックを全て保持して、一度
 	PHMD5 file_md_ctx, blk_md_ctx;
 
 	memset(hSub, 0, sizeof(HANDLE) * MAX_CPU);
-	factor1 = constant + source_num;
 	unit_size = (block_size + HASH_SIZE + (sse_unit - 1)) & ~(sse_unit - 1);	// チェックサムの分だけ増やす
 
 	// 作業バッファーを確保する
-	read_num = read_block_num(parity_num, 0, 1, sse_unit);	// ソース・ブロックを何個読み込むか
+	read_num = read_block_num(parity_num, 1, sse_unit);	// ソース・ブロックを何個読み込むか
 	if (read_num == 0){
 #ifdef TIMER
 		printf("cannot keep enough blocks, use another method\n");
@@ -1184,25 +1138,36 @@ int encode_method3(	// パリティ・ブロックを全て保持して、一度
 		err = 1;
 		goto error_end;
 	}
+	//memset(buf, 0xFF, mem_size);	// 後から 0 埋めしてるかの実験用
 	p_buf = buf + (size_t)unit_size * read_num;	// パリティ・ブロックを記録する領域
-	prog_write = source_num >> 5;	// 計算で 97%、書き込みで 3% ぐらい
-	if (prog_write == 0)
-		prog_write = 1;
-	prog_base = (__int64)(source_num + prog_write) * parity_num;	// ブロックの合計掛け算個数 + 書き込み回数
+	prog_read = (parity_num + 31) / 32;	// 読み書きの経過をそれぞれ 3% ぐらいにする
+	prog_write = (source_num + 31) / 32;
+	prog_base = (__int64)(source_num + prog_write) * parity_num + prog_read * source_num;	// ブロックの合計掛け算個数 + 読み書き回数
 	len = try_cache_blocking(unit_size);
 	chunk_num = (unit_size + len - 1) / len;
+	cpu_num1 = 0;	// 読み込み中はスレッド数を減らす（シングル・スレッドの時は 0にする）
+	i = 1;
+	while (i * 2 <= cpu_num){	// 1=0, 2~3=1, 4~7=2, 8~15=3, 16~31=4, 32=5
+		cpu_num1++;
+		i *= 2;
+	}
+	if (cpu_num1 > parity_num)
+		cpu_num1 = parity_num;
+	src_max = cpu_cache & 0xFFFE;	// CPU cache 最適化のため、同時に処理するブロック数を制限する
+	if ((src_max < 8) || (cpu_num == 1))
+		src_max = 0x8000;	// 不明または少な過ぎる場合は、制限しない
 #ifdef TIMER
 	printf("\n read some source blocks, and keep all parity blocks\n");
 	printf("buffer size = %Id MB, read_num = %d, round = %d\n", mem_size >> 20, read_num, (source_num + read_num - 1) / read_num);
-	printf("cache: limit size = %d, chunk_size = %d, split = %d\n", cpu_cache & 0x7FFF8000, len, chunk_num);
-	printf("prog_base = %I64d, unit_size = %d\n", prog_base, unit_size);
+	printf("cache: limit size = %d, chunk_size = %d, chunk_num = %d\n", cpu_flag & 0x7FFF0000, len, chunk_num);
+	printf("unit_size = %d, cpu_num1 = %d, src_max = %d\n", unit_size, cpu_num1, src_max);
 #endif
 
 	// マルチ・スレッドの準備をする
 	th->mat = constant;
 	th->buf = p_buf;
 	th->size = unit_size;
-	th->off = len;	// キャッシュの最適化を試みる
+	th->len = len;	// キャッシュの最適化を試みる
 	for (j = 0; j < cpu_num; j++){	// サブ・スレッドごとに
 		hRun[j] = CreateEvent(NULL, FALSE, FALSE, NULL);	// Auto Reset にする
 		if (hRun[j] == NULL){
@@ -1234,8 +1199,6 @@ int encode_method3(	// パリティ・ブロックを全て保持して、一度
 		}
 		WaitForSingleObject(hEnd[j], INFINITE);	// 設定終了の合図を待つ (リセットしない)
 	}
-	// IO が延滞しないように、サブ・スレッド一つの優先度を下げる
-	SetThreadPriority(hSub[0], THREAD_PRIORITY_BELOW_NORMAL);
 
 	// 何回かに別けてソース・ブロックを読み込んで、パリティ・ブロックを少しずつ作成する
 	time_last = GetTickCount();
@@ -1246,7 +1209,7 @@ int encode_method3(	// パリティ・ブロックを全て保持して、一度
 		if (read_num > source_num - source_off)
 			read_num = source_num - source_off;
 		th->size = 0;	// 1st encode
-		th->off = source_off - 1;	// まだ計算して無い印
+		src_off = source_off - 1;	// まだ計算して無い印
 
 #ifdef TIMER
 time_start = GetTickCount();
@@ -1315,9 +1278,15 @@ time_start = GetTickCount();
 			packet_off += 20;
 			checksum16_altmap(buf + (size_t)unit_size * i, buf + ((size_t)unit_size * i + unit_size - HASH_SIZE), unit_size - HASH_SIZE);
 
-			if (i + 1 < read_num){	// 最後のブロック以外なら
+			if (src_off < 0){
+				src_num = i + 1;	// 最後のブロックより前なら
+			} else {
+				src_num = i / (src_off + 1);	// だいたい何ブロック読むごとに計算が終わるか
+				src_num += i + 1;	// 次のブロック番号を足す
+			}
+			if (src_num < read_num){	// 読み込みが終わる前に計算が終わりそうなら
 				// サブ・スレッドの動作状況を調べる
-				j = WaitForMultipleObjects((cpu_num + 1) / 2, hEnd, TRUE, 0);
+				j = WaitForMultipleObjects(cpu_num1, hEnd, TRUE, 0);
 				if ((j != WAIT_TIMEOUT) && (j != WAIT_FAILED)){	// 計算中でないなら
 					// 経過表示
 					prog_num += parity_num;
@@ -1329,18 +1298,27 @@ time_start = GetTickCount();
 						time_last = GetTickCount();
 					}
 					// 計算終了したブロックの次から計算を開始する
-					th->off += 1;
-					th->buf = buf + (size_t)unit_size * (th->off - source_off);
-					for (j = 0; j < parity_num; j++)
-						factor1[j] = galois_power(constant[th->off], first_num + j);	// factor は定数行列の乗数になる
+					src_off += 1;
+					th->buf = buf + (size_t)unit_size * (src_off - source_off);
+					th->off = src_off;
 					th->now = -1;	// 初期値 - 1
 					//_mm_sfence();
-					for (j = 0; j < (cpu_num + 1) / 2; j++){
+					for (j = 0; j < cpu_num1; j++){
 						ResetEvent(hEnd[j]);	// リセットしておく
 						SetEvent(hRun[j]);	// サブ・スレッドに計算を開始させる
 					}
 				}
 			}
+
+			// 経過表示
+			prog_num += prog_read;
+			if (GetTickCount() - time_last >= UPDATE_TIME){
+				if (print_progress((int)((prog_num * 1000) / prog_base))){
+					err = 2;
+					goto error_end;
+				}
+				time_last = GetTickCount();
+			}
 		}
 		if (source_off + i == source_num){	// 最後のソース・ファイルを閉じる
 			CloseHandle(hFile);
@@ -1369,15 +1347,15 @@ time_start = GetTickCount();
 time_read += GetTickCount() - time_start;
 #endif
 
-		WaitForMultipleObjects((cpu_num + 1) / 2, hEnd, TRUE, INFINITE);	// サブ・スレッドの計算終了の合図を待つ
-		th->off += 1;	// 計算を開始するソース・ブロックの番号
-		if (th->off == 0)	// エラーや実験時以外は th->off は 0 にならない
+		WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE);	// サブ・スレッドの計算終了の合図を待つ
+		src_off += 1;	// 計算を開始するソース・ブロックの番号
+		if (src_off == 0)	// 1st encode しなかった場合（src_off = 0）は、生成ブロックをゼロ埋めする
 			memset(p_buf, 0, (size_t)unit_size * parity_num);
 #ifdef TIMER
-		j = ((th->off - source_off) * 1000) / read_num;
-		printf("partial encode = %d / %d (%d.%d%%), source_off = %d\n", th->off - source_off, read_num, j / 10, j % 10, source_off);
+		j = ((src_off - source_off) * 1000) / read_num;
+		printf("partial encode = %d / %d (%d.%d%%), source_off = %d\n", src_off - source_off, read_num, j / 10, j % 10, source_off);
 		// ここまでのパリティ・ブロックのチェックサムを検証する
-/*		if (th->off - source_off > 0){
+/*		if (src_off - source_off > 0){
 			__declspec( align(16) ) unsigned char hash[HASH_SIZE];
 			for (j = 0; j < parity_num; j++){
 				checksum16_return(p_buf + (size_t)unit_size * j, hash, unit_size - HASH_SIZE);
@@ -1392,45 +1370,53 @@ time_read += GetTickCount() - time_start;
 #endif
 
 		// スレッドごとにパリティ・ブロックを計算する
-		src_num = read_num - (th->off - source_off);	// 一度に処理する量 (src_num > 0)
-		th->buf = buf + (size_t)unit_size * (th->off - source_off);
-		// th->off はソース・ブロックの番号
-		th->size = src_num;
-		th->now = -1;	// 初期値 - 1
-		//_mm_sfence();
-		for (j = 0; j < cpu_num; j++){
-			ResetEvent(hEnd[j]);	// リセットしておく
-			SetEvent(hRun[j]);	// サブ・スレッドに計算を開始させる
-		}
+		src_num = src_max;	// 一度に処理するソース・ブロックの数を制限する
+		while (src_off < source_off + read_num){
+			// ソース・ブロックを何個ずつ処理するか
+			if (src_off + src_num * 2 - 1 >= source_off + read_num)
+				src_num = source_off + read_num - src_off;
+			//printf("src_off = %d, src_num = %d\n", src_off, src_num);
 
-		// サブ・スレッドの計算終了の合図を UPDATE_TIME だけ待ちながら、経過表示する
-		while (WaitForMultipleObjects(cpu_num, hEnd, TRUE, UPDATE_TIME) == WAIT_TIMEOUT){
-			// th-now が最高値なので、計算が終わってるのは th-now - cpu_num 個となる
-			j = th->now - cpu_num;
-			if (j < 0)
-				j = 0;
-			j /= chunk_num;	// chunk数で割ってブロック数にする
-			// 経過表示（UPDATE_TIME 時間待った場合なので、必ず経過してるはず）
-			if (print_progress((int)(((prog_num + src_num * j) * 1000) / prog_base))){
-				err = 2;
-				goto error_end;
+			th->buf = buf + (size_t)unit_size * (src_off - source_off);
+			th->off = src_off;	// ソース・ブロックの開始番号
+			th->size = src_num;
+			th->now = -1;	// 初期値 - 1
+			//_mm_sfence();
+			for (j = 0; j < cpu_num; j++){
+				ResetEvent(hEnd[j]);	// リセットしておく
+				SetEvent(hRun[j]);	// サブ・スレッドに計算を開始させる
 			}
-			time_last = GetTickCount();
-		}
 
-		// 経過表示
-		prog_num += src_num * parity_num;
-		if (GetTickCount() - time_last >= UPDATE_TIME){
-			if (print_progress((int)((prog_num * 1000) / prog_base))){
-				err = 2;
-				goto error_end;
+			// サブ・スレッドの計算終了の合図を UPDATE_TIME だけ待ちながら、経過表示する
+			while (WaitForMultipleObjects(cpu_num, hEnd, TRUE, UPDATE_TIME) == WAIT_TIMEOUT){
+				// th-now が最高値なので、計算が終わってるのは th-now + 1 - cpu_num 個となる
+				j = th->now + 1 - cpu_num;
+				if (j < 0)
+					j = 0;
+				j /= chunk_num;	// chunk数で割ってブロック数にする
+				// 経過表示（UPDATE_TIME 時間待った場合なので、必ず経過してるはず）
+				if (print_progress((int)(((prog_num + src_num * j) * 1000) / prog_base))){
+					err = 2;
+					goto error_end;
+				}
+				time_last = GetTickCount();
 			}
-			time_last = GetTickCount();
+
+			// 経過表示
+			prog_num += src_num * parity_num;
+			if (GetTickCount() - time_last >= UPDATE_TIME){
+				if (print_progress((int)((prog_num * 1000) / prog_base))){
+					err = 2;
+					goto error_end;
+				}
+				time_last = GetTickCount();
+			}
+
+			src_off += src_num;
 		}
 
 		source_off += read_num;
 	}
-	//printf("\nprog_num = %I64d / %I64d\n", prog_num, prog_base);
 
 #ifdef TIMER
 time_start = GetTickCount();
@@ -1446,6 +1432,8 @@ time_write = GetTickCount() - time_start;
 #ifdef TIMER
 printf("read   %d.%03d sec\n", time_read / 1000, time_read % 1000);
 printf("write  %d.%03d sec\n", time_write / 1000, time_write % 1000);
+if (prog_num != prog_base - prog_write * parity_num)
+	printf(" prog_num = %I64d != %I64d\n", prog_num, prog_base - prog_write * parity_num);
 #endif
 
 error_end:
@@ -1476,11 +1464,10 @@ int encode_method4(	// 全てのブロックを断片的に保持する場合 (G
 	unsigned short *constant)	// 複数ブロック分の領域を確保しておく？
 {
 	unsigned char *buf = NULL, *p_buf, *work_buf, *hash;
-	unsigned short *factor1;
-	int err = 0, i, j, last_file;
-	int cpu_num1, cover_max, cover_from, cover_num;
+	int err = 0, i, j, last_file, chunk_num;
+	int cpu_num1, src_off, src_num, src_max, vram_max;
 	unsigned int io_size, unit_size, len, block_off;
-	unsigned int time_last, prog_write;
+	unsigned int time_last, prog_read, prog_write;
 	__int64 file_off, prog_num = 0, prog_base;
 	HANDLE hFile = NULL;
 	HANDLE hSub[MAX_CPU], hRun[MAX_CPU], hEnd[MAX_CPU];
@@ -1488,19 +1475,14 @@ int encode_method4(	// 全てのブロックを断片的に保持する場合 (G
 	PHMD5 md_ctx, *md_ptr = NULL;
 
 	memset(hSub, 0, sizeof(HANDLE) * MAX_CPU);
-	factor1 = constant + source_num;
-	cpu_num1 = cpu_num;	// 最後のスレッドを GPU 管理用にする
-	if (cpu_num == 1)
-		cpu_num1++;
 
-	// 作業バッファーを確保する（GPU の作業領域として2個の余裕を見ておく）
+	// 作業バッファーを確保する
 	// part_num を使わず、全てのブロックを保持する所がencode_method2と異なることに注意！
-	io_size = get_io_size(source_num + parity_num + 2, NULL, 1, MEM_UNIT);
+	io_size = get_io_size(source_num + parity_num, NULL, 1, MEM_UNIT);
 	//io_size = (((io_size + 1) / 2 + HASH_SIZE + (MEM_UNIT - 1)) & ~(MEM_UNIT - 1)) - HASH_SIZE;	// 2分割の実験用
 	//io_size = (((io_size + 2) / 3 + HASH_SIZE + (MEM_UNIT - 1)) & ~(MEM_UNIT - 1)) - HASH_SIZE;	// 3分割の実験用
 	unit_size = io_size + HASH_SIZE;	// チェックサムの分だけ増やす
-	file_off = (source_num + parity_num) * (size_t)unit_size + HASH_SIZE
-			+ (source_num * sizeof(unsigned short) * cpu_num1);
+	file_off = (source_num + parity_num) * (size_t)unit_size + HASH_SIZE;
 	buf = _aligned_malloc((size_t)file_off, MEM_UNIT);	// GPU 用の境界
 	if (buf == NULL){
 		printf("malloc, %I64d\n", file_off);
@@ -1510,13 +1492,28 @@ int encode_method4(	// 全てのブロックを断片的に保持する場合 (G
 	p_buf = buf + (size_t)unit_size * source_num;	// パリティ・ブロックを記録する領域
 	hash = p_buf + (size_t)unit_size * parity_num;
 	prog_base = (block_size + io_size - 1) / io_size;
-	prog_write = source_num >> 5;	// 計算で 97%、書き込みで 3% ぐらい
-	if (prog_write == 0)
-		prog_write = 1;
-	prog_base *= (__int64)(source_num + prog_write) * parity_num;	// 全体の断片の個数
+	prog_read = (parity_num + 31) / 32;	// 読み書きの経過をそれぞれ 3% ぐらいにする
+	prog_write = (source_num + 31) / 32;
+	prog_base *= (__int64)(source_num + prog_write) * parity_num + prog_read * source_num;	// 全体の断片の個数
+	len = try_cache_blocking(unit_size);
+	chunk_num = (unit_size + len - 1) / len;
+	cpu_num1 = 0;	// 読み込み中はスレッド数を減らす（シングル・スレッドの時は 0にする）
+	i = 1;
+	while (i * 2 <= cpu_num){	// 1=0, 2~3=1, 4~7=2, 8~15=3, 16~31=4, 32=5
+		cpu_num1++;
+		i *= 2;
+	}
+	if (cpu_num1 > parity_num)
+		cpu_num1 = parity_num;
+	//cpu_num1 = 0;	// 2nd encode の実験用に 1st encode を停止する
+	src_max = cpu_cache & 0xFFFE;	// CPU cache 最適化のため、同時に処理するブロック数を制限する
+	if ((src_max < 8) || (cpu_num <= 2))
+		src_max = 0x8000;	// 不明または少な過ぎる場合は、制限しない
 #ifdef TIMER
 	printf("\n read all source blocks, and keep all parity blocks (GPU)\n");
 	printf("buffer size = %I64d MB, io_size = %d, split = %d\n", file_off >> 20, io_size, (block_size + io_size - 1) / io_size);
+	printf("cache: limit size = %d, chunk_size = %d, chunk_num = %d\n", cpu_flag & 0x7FFF0000, len, chunk_num);
+	printf("unit_size = %d, cpu_num1 = %d, src_max = %d\n", unit_size, cpu_num1, src_max);
 #endif
 
 	if (io_size < block_size){	// スライスが分割される場合だけ、途中までのハッシュ値を保持する
@@ -1536,9 +1533,8 @@ int encode_method4(	// 全てのブロックを断片的に保持する場合 (G
 	}
 
 	// OpenCL の初期化
-	cover_max = source_num;
-	len = 0;
-	i = init_OpenCL(unit_size, &cover_max, &len);
+	vram_max = source_num;
+	i = init_OpenCL(unit_size, len, &vram_max);
 	if (i != 0){
 		if (i != 3)	// GPU が見つからなかった場合はエラー表示しない
 			printf("init_OpenCL, %d, %d\n", i & 0xFF, i >> 8);
@@ -1550,20 +1546,16 @@ int encode_method4(	// 全てのブロックを断片的に保持する場合 (G
 		err = -2;	// CPU だけの方式に切り替える
 		goto error_end;
 	}
-	if (len == 0)	// GPUがキャッシュを使わない時だけ、CPU独自にキャッシュの最適化を試みる
-		len = try_cache_blocking(unit_size);
 #ifdef TIMER
-	printf("cache: limit size = %d, chunk_size = %d, split = %d\n", cpu_cache & 0x7FFF8000, len, (unit_size + len - 1) / len);
-	printf("prog_base = %I64d, unit_size = %d, method = %d, cover_max = %d\n", prog_base, unit_size, OpenCL_method, cover_max);
+	printf("OpenCL_method = %d, vram_max = %d\n", OpenCL_method, vram_max);
 #endif
 
 	// マルチ・スレッドの準備をする
 	th->mat = constant;
 	th->buf = p_buf;
 	th->size = unit_size;
-	th->count = source_num;
-	th->off = len;	// chunk size
-	for (j = 0; j < cpu_num1; j++){	// サブ・スレッドごとに
+	th->len = len;	// chunk size
+	for (j = 0; j < cpu_num; j++){	// サブ・スレッドごとに
 		hRun[j] = CreateEvent(NULL, FALSE, FALSE, NULL);	// Auto Reset にする
 		if (hRun[j] == NULL){
 			print_win32_err();
@@ -1582,12 +1574,11 @@ int encode_method4(	// 全てのブロックを断片的に保持する場合 (G
 		// サブ・スレッドを起動する
 		th->run = hRun[j];
 		th->end = hEnd[j];
-		th->now = j;	// スレッド番号
 		//_mm_sfence();	// メモリーへの書き込みを完了してからスレッドを起動する
-		if ((j == cpu_num1 - 1) && (OpenCL_method != 0)){	// 最後のスレッドを GPU 管理用にする
+		if ((j == cpu_num - 1) && (OpenCL_method != 0)){	// 最後のスレッドを GPU 管理用にする
 			hSub[j] = (HANDLE)_beginthreadex(NULL, STACK_SIZE, thread_encode_gpu, (LPVOID)th, 0, NULL);
 		} else {
-			hSub[j] = (HANDLE)_beginthreadex(NULL, STACK_SIZE, thread_encode_each, (LPVOID)th, 0, NULL);
+			hSub[j] = (HANDLE)_beginthreadex(NULL, STACK_SIZE, thread_encode3, (LPVOID)th, 0, NULL);
 		}
 		if (hSub[j] == NULL){
 			print_win32_err();
@@ -1599,16 +1590,15 @@ int encode_method4(	// 全てのブロックを断片的に保持する場合 (G
 		}
 		WaitForSingleObject(hEnd[j], INFINITE);	// 設定終了の合図を待つ (リセットしない)
 	}
-	// IO が延滞しないように、サブ・スレッド一つの優先度を下げる
-	SetThreadPriority(hSub[0], THREAD_PRIORITY_BELOW_NORMAL);
+	th->len = 0;	// GPUのエラー通知用にする
 
 	// ソース・ブロック断片を読み込んで、パリティ・ブロック断片を作成する
 	time_last = GetTickCount();
 	wcscpy(file_path, base_dir);
 	block_off = 0;
 	while (block_off < block_size){
-		th->size = 0xFFFFFFFF;	// 1st encode
-		th->off = -1;	// まだ計算して無い印
+		th->size = 0;	// 1st encode
+		src_off = -1;	// まだ計算して無い印
 
 		// ソース・ブロックを読み込む
 #ifdef TIMER
@@ -1657,9 +1647,15 @@ time_start = GetTickCount();
 read_count++;
 #endif
 
-				if (i + 1 < source_num){	// 最後のブロック以外なら
+				if (src_off < 0){
+					src_num = i + 1;	// 最後のブロックより前なら
+				} else {
+					src_num = i / (src_off + 1);	// だいたい何ブロック読むごとに計算が終わるか
+					src_num += i + 1;	// 次のブロック番号を足す
+				}
+				if (src_num < source_num){	// 読み込みが終わる前に計算が終わりそうなら
 					// サブ・スレッドの動作状況を調べる
-					j = WaitForMultipleObjects((cpu_num + 1) / 2, hEnd, TRUE, 0);
+					j = WaitForMultipleObjects(cpu_num1, hEnd, TRUE, 0);
 					if ((j != WAIT_TIMEOUT) && (j != WAIT_FAILED)){	// 計算中でないなら
 						// 経過表示
 						prog_num += parity_num;
@@ -1671,22 +1667,21 @@ read_count++;
 							time_last = GetTickCount();
 						}
 						// 計算終了したブロックの次から計算を開始する
-						th->off += 1;
-						if (th->off > 0){	// バッファーに読み込んだ時だけ計算する
-							while (s_blk[th->off].size <= block_off){
+						src_off += 1;
+						if (src_off > 0){	// バッファーに読み込んだ時だけ計算する
+							while (s_blk[src_off].size <= block_off){
 								prog_num += parity_num;
-								th->off += 1;
+								src_off += 1;
 #ifdef TIMER
 skip_count++;
 #endif
 							}
 						}
-						th->buf = buf + (size_t)unit_size * th->off;
-						for (j = 0; j < parity_num; j++)
-							factor1[j] = galois_power(constant[th->off], first_num + j);	// factor は定数行列の乗数になる
+						th->buf = buf + (size_t)unit_size * src_off;
+						th->off = src_off;
 						th->now = -1;	// 初期値 - 1
 						//_mm_sfence();
-						for (j = 0; j < (cpu_num + 1) / 2; j++){
+						for (j = 0; j < cpu_num1; j++){
 							ResetEvent(hEnd[j]);	// リセットしておく
 							SetEvent(hRun[j]);	// サブ・スレッドに計算を開始させる
 						}
@@ -1695,6 +1690,16 @@ skip_count++;
 			} else {
 				memset(buf + (size_t)unit_size * i, 0, unit_size);
 			}
+
+			// 経過表示
+			prog_num += prog_read;
+			if (GetTickCount() - time_last >= UPDATE_TIME){
+				if (print_progress((int)((prog_num * 1000) / prog_base))){
+					err = 2;
+					goto error_end;
+				}
+				time_last = GetTickCount();
+			}
 		}
 		// 最後のソース・ファイルを閉じる
 		CloseHandle(hFile);
@@ -1703,23 +1708,22 @@ skip_count++;
 time_read += GetTickCount() - time_start;
 #endif
 
-		WaitForMultipleObjects((cpu_num + 1) / 2, hEnd, TRUE, INFINITE);	// サブ・スレッドの計算終了の合図を待つ
-		th->size = 0;	// 2nd encode
-		th->off += 1;	// 計算を開始するソース・ブロックの番号
-		if (th->off > 0){
-			while (s_blk[th->off].size <= block_off){	// 計算不要なソース・ブロックはとばす
+		WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE);	// サブ・スレッドの計算終了の合図を待つ
+		src_off += 1;	// 計算を開始するソース・ブロックの番号
+		if (src_off > 0){
+			while (s_blk[src_off].size <= block_off){	// 計算不要なソース・ブロックはとばす
 				prog_num += parity_num;
-				th->off += 1;
+				src_off += 1;
 #ifdef TIMER
 skip_count++;
 #endif
 			}
-		} else {	// エラーや実験時以外は th->off は 0 にならない
+		} else {	// 1st encode しなかった場合（src_off = 0）は、生成ブロックをゼロ埋めする
 			memset(p_buf, 0, (size_t)unit_size * parity_num);
 		}
 #ifdef TIMER
-		j = (th->off * 1000) / source_num;
-		printf("partial encode = %d (%d.%d%%), read = %d, skip = %d\n", th->off, j / 10, j % 10, read_count, skip_count);
+		j = (src_off * 1000) / source_num;
+		printf("partial encode = %d / %d (%d.%d%%), read = %d, skip = %d\n", src_off, source_num, j / 10, j % 10, read_count, skip_count);
 #endif
 
 		// リカバリ・ファイルに書き込むサイズ
@@ -1729,50 +1733,70 @@ skip_count++;
 			len = io_size;
 		}
 
-		// VRAM のサイズに応じて分割する
-		cover_from = th->off;
-		i = (source_num - cover_from + cover_max - 1) / cover_max;	// 何回に分けて処理するか
-		cover_num = (source_num - cover_from + i - 1) / i;	// 一度に処理する量を平均化する
-		//printf("cover range = %d, cover_num = %d\n", source_num - cover_from, cover_num);
-		while (cover_from < source_num){
+		// GPU と CPU のどちらに最適化するかが難しい
+		src_num = src_max;	// 一度に処理するソース・ブロックの数を制限する
+		if (src_num > vram_max){	// VRAM に収まらない場合は、VRAM のサイズに応じて分割する
+			src_num = vram_max & ~1;	// 減らして偶数にする（元が奇数なら分割数が増えるかも）
+			i = (source_num - src_off + src_num - 1) / src_num;	// 何回に分けて処理するか
+			src_num = (source_num - src_off + i - 1) / i;	// 一度に処理する量を平均化する
+			src_num = (src_num + 1) & ~1;	// 増やして偶数にする
+		}
+#ifdef TIMER
+		printf("remain = %d, src_off = %d, src_num = %d\n", source_num - src_off, src_off, src_num);
+#endif
+		while (src_off < source_num){
 			// ソース・ブロックを何個ずつ処理するか
-			if (cover_from + cover_num > source_num)
-				cover_num = source_num - cover_from;
-			//printf("cover_from = %d, cover_num = %d\n", cover_from, cover_num);
+			if (src_off + src_num > source_num){
+				src_num = source_num - src_off;
+#ifdef TIMER
+				printf("last1: src_off = %d, src_num = %d\n", src_off, src_num);
+#endif
+			} else if (src_off + src_num * 2 - 1 >= source_num){
+				src_num = source_num - src_off;
+				if (src_num > vram_max){	// VRAM のサイズまでにする
+					src_num = (src_num + 1) / 2;	// 半分にする
+					src_num = (src_num + 1) & ~1;	// 偶数にする
+				}
+#ifdef TIMER
+				printf("last2: src_off = %d, src_num = %d\n", src_off, src_num);
+#endif
+			}
 
 			// GPU と CPU がスレッドごとにパリティ・ブロックを計算する
-			th->buf = buf + (size_t)unit_size * cover_from;
-			th->off = cover_from;	// ソース・ブロックの番号にする
-			th->count = cover_num;
+			th->buf = buf + (size_t)unit_size * src_off;
+			th->off = src_off;	// ソース・ブロックの番号にする
+			th->size = src_num;
 			th->now = -1;	// 初期値 - 1
 			//_mm_sfence();
-			for (j = 0; j < cpu_num1; j++){
+			//for (j = cpu_num - 1; j >= 0; j--){	// GPU から先に計算を開始する？
+			for (j = 0; j < cpu_num; j++){
 				ResetEvent(hEnd[j]);	// リセットしておく
 				SetEvent(hRun[j]);	// サブ・スレッドに計算を開始させる
 			}
 
 			// サブ・スレッドの計算終了の合図を UPDATE_TIME だけ待ちながら、経過表示する
-			while (WaitForMultipleObjects(cpu_num1, hEnd, TRUE, UPDATE_TIME) == WAIT_TIMEOUT){
-				// th-now が最高値なので、計算が終わってるのは th-now - cpu_num1 個となる
-				j = th->now - cpu_num1;
+			while (WaitForMultipleObjects(cpu_num, hEnd, TRUE, UPDATE_TIME) == WAIT_TIMEOUT){
+				// th-now が最高値なので、計算が終わってるのは th-now + 1 - cpu_num 個となる
+				j = th->now + 1 - cpu_num;
 				if (j < 0)
 					j = 0;
+				j /= chunk_num;	// chunk数で割ってブロック数にする
 				// 経過表示（UPDATE_TIME 時間待った場合なので、必ず経過してるはず）
-				if (print_progress((int)(((prog_num + cover_num * j) * 1000) / prog_base))){
+				if (print_progress((int)(((prog_num + src_num * j) * 1000) / prog_base))){
 					err = 2;
 					goto error_end;
 				}
 				time_last = GetTickCount();
 			}
-			if (th->size != 0){	// エラー発生
-				i = th->size;
+			if (th->len != 0){	// エラー発生
+				i = th->len;
 				printf("error, gpu-thread, %d, %d\n", i & 0xFF, i >> 8);
 				err = 1;
 				goto error_end;
 			}
 
 			// 経過表示
-			prog_num += cover_num * parity_num;
+			prog_num += src_num * parity_num;
 			if (GetTickCount() - time_last >= UPDATE_TIME){
 				if (print_progress((int)((prog_num * 1000) / prog_base))){
 					err = 2;
@@ -1781,7 +1805,7 @@ skip_count++;
 				time_last = GetTickCount();
 			}
 
-			cover_from += cover_num;
+			src_off += src_num;
 		}
 
 #ifdef TIMER
@@ -1840,15 +1864,14 @@ time_write += GetTickCount() - time_start;
 		block_off += io_size;
 	}
 	print_progress_done();	// 改行して行の先頭に戻しておく
-	//printf("prog_num = %I64d / %I64d\n", prog_num, prog_base);
 
 	// ファイルごとにブロックの CRC-32 を検証する
 	memset(buf, 0, io_size);
 	j = 0;
 	while (j < source_num){
 		last_file = s_blk[j].file;
-		cover_num = (int)((files[last_file].size + (__int64)block_size - 1) / block_size);
-		i = j + cover_num - 1;	// 末尾ブロックの番号
+		src_num = (int)((files[last_file].size + (__int64)block_size - 1) / block_size);
+		i = j + src_num - 1;	// 末尾ブロックの番号
 		if (s_blk[i].size < block_size){	// 残りを 0 でパディングする
 			len = block_size - s_blk[i].size;
 			while (len > io_size){
@@ -1858,14 +1881,14 @@ time_write += GetTickCount() - time_start;
 			s_blk[i].crc = crc_update(s_blk[i].crc, buf, len);
 		}
 		memset(hash, 0, 16);
-		for (i = 0; i < cover_num; i++)	// XOR して 16バイトに減らす
+		for (i = 0; i < src_num; i++)	// XOR して 16バイトに減らす
 			((unsigned int *)hash)[i & 3] ^= s_blk[j + i].crc ^ 0xFFFFFFFF;
 		if (memcmp(files[last_file].hash, hash, 16) != 0){
 			printf("checksum mismatch, input file %d\n", last_file);
 			err = 1;
 			goto error_end;
 		}
-		j += cover_num;
+		j += src_num;
 	}
 
 	//printf("io_size = %d, block_size = %d\n", io_size, block_size);
@@ -1901,12 +1924,14 @@ time_write += GetTickCount() - time_start;
 #ifdef TIMER
 printf("read   %d.%03d sec\n", time_read / 1000, time_read % 1000);
 printf("write  %d.%03d sec\n", time_write / 1000, time_write % 1000);
+if (prog_num != prog_base)
+	printf(" prog_num = %I64d, prog_base = %I64d\n", prog_num, prog_base);
 #endif
 	info_OpenCL(buf, MEM_UNIT);	// デバイス情報を表示する
 
 error_end:
 	InterlockedExchange(&(th->now), INT_MAX / 2);	// サブ・スレッドの計算を中断する
-	for (j = 0; j < cpu_num1; j++){
+	for (j = 0; j < cpu_num; j++){
 		if (hSub[j]){	// サブ・スレッドを終了させる
 			SetEvent(hRun[j]);
 			WaitForSingleObject(hSub[j], INFINITE);
@@ -1941,11 +1966,11 @@ int encode_method5(	// ソース・ブロックの一部とパリティ・ブロ
 	unsigned short *constant)
 {
 	unsigned char *buf = NULL, *p_buf;
-	unsigned short *factor1;
-	int err = 0, i, j, last_file, source_off, read_num, packet_off;
-	int cpu_num1, cover_max, cover_from, cover_num;
+	int err = 0, i, j, last_file, chunk_num;
+	int source_off, read_num, packet_off;
+	int cpu_num1, src_off, src_num, src_max, vram_max;
 	unsigned int unit_size, len;
-	unsigned int time_last, prog_write;
+	unsigned int time_last, prog_read, prog_write;
 	__int64 prog_num = 0, prog_base;
 	size_t mem_size;
 	HANDLE hFile = NULL;
@@ -1954,14 +1979,10 @@ int encode_method5(	// ソース・ブロックの一部とパリティ・ブロ
 	PHMD5 file_md_ctx, blk_md_ctx;
 
 	memset(hSub, 0, sizeof(HANDLE) * MAX_CPU);
-	factor1 = constant + source_num;
 	unit_size = (block_size + HASH_SIZE + (MEM_UNIT - 1)) & ~(MEM_UNIT - 1);	// MEM_UNIT の倍数にする
-	cpu_num1 = cpu_num;	// 最後のスレッドを GPU 管理用にする
-	if (cpu_num == 1)
-		cpu_num1++;
 
-	// 作業バッファーを確保する（GPU の作業領域として2個の余裕を見ておく）
-	read_num = read_block_num(parity_num, 2, 1, MEM_UNIT);	// ソース・ブロックを何個読み込むか
+	// 作業バッファーを確保する
+	read_num = read_block_num(parity_num, 1, MEM_UNIT);	// ソース・ブロックを何個読み込むか
 	if (read_num == 0){
 #ifdef TIMER
 		printf("cannot keep enough blocks, use another method\n");
@@ -1970,8 +1991,7 @@ int encode_method5(	// ソース・ブロックの一部とパリティ・ブロ
 	}
 	//read_num = (read_num + 1) / 2 + 1;	// 2分割の実験用
 	//read_num = (read_num + 2) / 3 + 1;	// 3分割の実験用
-	mem_size = (size_t)(read_num + parity_num) * unit_size
-			+ (read_num * sizeof(unsigned short) * cpu_num1);
+	mem_size = (size_t)(read_num + parity_num) * unit_size;
 	buf = _aligned_malloc(mem_size, MEM_UNIT);	// GPU 用の境界
 	if (buf == NULL){
 		printf("malloc, %Id\n", mem_size);
@@ -1979,15 +1999,32 @@ int encode_method5(	// ソース・ブロックの一部とパリティ・ブロ
 		goto error_end;
 	}
 	p_buf = buf + (size_t)unit_size * read_num;	// パリティ・ブロックを記録する領域
-	prog_write = source_num >> 5;	// 計算で 97%、書き込みで 3% ぐらい
-	if (prog_write == 0)
-		prog_write = 1;
-	prog_base = (__int64)(source_num + prog_write) * parity_num;	// ブロックの合計掛け算個数 + 書き込み回数
+	prog_read = (parity_num + 31) / 32;	// 読み書きの経過をそれぞれ 3% ぐらいにする
+	prog_write = (source_num + 31) / 32;
+	prog_base = (__int64)(source_num + prog_write) * parity_num + prog_read * source_num;	// ブロックの合計掛け算個数 + 書き込み回数
+	len = try_cache_blocking(unit_size);
+	chunk_num = (unit_size + len - 1) / len;
+	cpu_num1 = 0;	// 読み込み中はスレッド数を減らす（シングル・スレッドの時は 0にする）
+	i = 1;
+	while (i * 2 <= cpu_num){	// 1=0, 2~3=1, 4~7=2, 8~15=3, 16~31=4, 32=5
+		cpu_num1++;
+		i *= 2;
+	}
+	if (cpu_num1 > parity_num)
+		cpu_num1 = parity_num;
+	src_max = cpu_cache & 0xFFFE;	// CPU cache 最適化のため、同時に処理するブロック数を制限する
+	if ((src_max < 8) || (cpu_num <= 2))
+		src_max = 0x8000;	// 不明または少な過ぎる場合は、制限しない
+#ifdef TIMER
+	printf("\n read some source blocks, and keep all parity blocks (GPU)\n");
+	printf("buffer size = %Id MB, read_num = %d, round = %d\n", mem_size >> 20, read_num, (source_num + read_num - 1) / read_num);
+	printf("cache: limit size = %d, chunk_size = %d, chunk_num = %d\n", cpu_flag & 0x7FFF0000, len, chunk_num);
+	printf("unit_size = %d, cpu_num1 = %d, src_max = %d\n", unit_size, cpu_num1, src_max);
+#endif
 
 	// OpenCL の初期化
-	cover_max = read_num;	// 読み込める分だけにする
-	len = 0;
-	i = init_OpenCL(unit_size, &cover_max, &len);
+	vram_max = read_num;	// 読み込める分だけにする
+	i = init_OpenCL(unit_size, len, &vram_max);
 	if (i != 0){
 		if (i != 3)	// GPU が見つからなかった場合はエラー表示しない
 			printf("init_OpenCL, %d, %d\n", i & 0xFF, i >> 8);
@@ -1999,23 +2036,17 @@ int encode_method5(	// ソース・ブロックの一部とパリティ・ブロ
 		err = -3;	// CPU だけの方式に切り替える
 		goto error_end;
 	}
-	if (len == 0)	// GPUがキャッシュを使わない時だけ、CPU独自にキャッシュの最適化を試みる
-		len = try_cache_blocking(unit_size);
-	print_progress_text(0, "Creating recovery slice");
 #ifdef TIMER
-	printf("\n read some source blocks, and keep all parity blocks (GPU)\n");
-	printf("buffer size = %Id MB, read_num = %d, round = %d\n", mem_size >> 20, read_num, (source_num + read_num - 1) / read_num);
-	printf("cache: limit size = %d, chunk_size = %d, split = %d\n", cpu_cache & 0x7FFF8000, len, (unit_size + len - 1) / len);
-	printf("prog_base = %I64d, unit_size = %d, method = %d, cover_max = %d\n", prog_base, unit_size, OpenCL_method, cover_max);
+	printf("OpenCL_method = %d, vram_max = %d\n", OpenCL_method, vram_max);
 #endif
+	print_progress_text(0, "Creating recovery slice");
 
 	// マルチ・スレッドの準備をする
 	th->mat = constant;
 	th->buf = p_buf;
 	th->size = unit_size;
-	th->count = read_num;
-	th->off = len;	// chunk size
-	for (j = 0; j < cpu_num1; j++){	// サブ・スレッドごとに
+	th->len = len;	// chunk size
+	for (j = 0; j < cpu_num; j++){	// サブ・スレッドごとに
 		hRun[j] = CreateEvent(NULL, FALSE, FALSE, NULL);	// Auto Reset にする
 		if (hRun[j] == NULL){
 			print_win32_err();
@@ -2034,12 +2065,11 @@ int encode_method5(	// ソース・ブロックの一部とパリティ・ブロ
 		// サブ・スレッドを起動する
 		th->run = hRun[j];
 		th->end = hEnd[j];
-		th->now = j;	// スレッド番号
 		//_mm_sfence();	// メモリーへの書き込みを完了してからスレッドを起動する
-		if ((j == cpu_num1 - 1) && (OpenCL_method != 0)){	// 最後のスレッドを GPU 管理用にする
+		if ((j == cpu_num - 1) && (OpenCL_method != 0)){	// 最後のスレッドを GPU 管理用にする
 			hSub[j] = (HANDLE)_beginthreadex(NULL, STACK_SIZE, thread_encode_gpu, (LPVOID)th, 0, NULL);
 		} else {
-			hSub[j] = (HANDLE)_beginthreadex(NULL, STACK_SIZE, thread_encode_each, (LPVOID)th, 0, NULL);
+			hSub[j] = (HANDLE)_beginthreadex(NULL, STACK_SIZE, thread_encode3, (LPVOID)th, 0, NULL);
 		}
 		if (hSub[j] == NULL){
 			print_win32_err();
@@ -2051,8 +2081,7 @@ int encode_method5(	// ソース・ブロックの一部とパリティ・ブロ
 		}
 		WaitForSingleObject(hEnd[j], INFINITE);	// 設定終了の合図を待つ (リセットしない)
 	}
-	// IO が延滞しないように、サブ・スレッド一つの優先度を下げる
-	SetThreadPriority(hSub[0], THREAD_PRIORITY_BELOW_NORMAL);
+	th->len = 0;	// GPUのエラー通知用にする
 
 	// 何回かに別けてソース・ブロックを読み込んで、パリティ・ブロックを少しずつ作成する
 	time_last = GetTickCount();
@@ -2062,8 +2091,8 @@ int encode_method5(	// ソース・ブロックの一部とパリティ・ブロ
 	while (source_off < source_num){
 		if (read_num > source_num - source_off)
 			read_num = source_num - source_off;
-		th->size = 0xFFFFFFFF;	// 1st encode
-		th->off = source_off - 1;	// まだ計算して無い印
+		th->size = 0;	// 1st encode
+		src_off = source_off - 1;	// まだ計算して無い印
 
 #ifdef TIMER
 time_start = GetTickCount();
@@ -2131,9 +2160,15 @@ time_start = GetTickCount();
 			packet_off += 20;
 			checksum16_altmap(buf + (size_t)unit_size * i, buf + ((size_t)unit_size * i + unit_size - HASH_SIZE), unit_size - HASH_SIZE);
 
-			if (i + 1 < read_num){	// 最後のブロック以外なら
+			if (src_off < 0){
+				src_num = i + 1;	// 最後のブロックより前なら
+			} else {
+				src_num = i / (src_off + 1);	// だいたい何ブロック読むごとに計算が終わるか
+				src_num += i + 1;	// 次のブロック番号を足す
+			}
+			if (src_num < read_num){	// 読み込みが終わる前に計算が終わりそうなら
 				// サブ・スレッドの動作状況を調べる
-				j = WaitForMultipleObjects((cpu_num + 1) / 2, hEnd, TRUE, 0);
+				j = WaitForMultipleObjects(cpu_num1, hEnd, TRUE, 0);
 				if ((j != WAIT_TIMEOUT) && (j != WAIT_FAILED)){	// 計算中でないなら
 					// 経過表示
 					prog_num += parity_num;
@@ -2145,18 +2180,27 @@ time_start = GetTickCount();
 						time_last = GetTickCount();
 					}
 					// 計算終了したブロックの次から計算を開始する
-					th->off += 1;
-					th->buf = buf + (size_t)unit_size * (th->off - source_off);
-					for (j = 0; j < parity_num; j++)
-						factor1[j] = galois_power(constant[th->off], first_num + j);	// factor は定数行列の乗数になる
+					src_off += 1;
+					th->buf = buf + (size_t)unit_size * (src_off - source_off);
+					th->off = src_off;
 					th->now = -1;	// 初期値 - 1
 					//_mm_sfence();
-					for (j = 0; j < (cpu_num + 1) / 2; j++){
+					for (j = 0; j < cpu_num1; j++){
 						ResetEvent(hEnd[j]);	// リセットしておく
 						SetEvent(hRun[j]);	// サブ・スレッドに計算を開始させる
 					}
 				}
 			}
+
+			// 経過表示
+			prog_num += prog_read;
+			if (GetTickCount() - time_last >= UPDATE_TIME){
+				if (print_progress((int)((prog_num * 1000) / prog_base))){
+					err = 2;
+					goto error_end;
+				}
+				time_last = GetTickCount();
+			}
 		}
 		if (source_off + i == source_num){	// 最後のソース・ファイルを閉じる
 			CloseHandle(hFile);
@@ -2185,60 +2229,79 @@ time_start = GetTickCount();
 time_read += GetTickCount() - time_start;
 #endif
 
-		WaitForMultipleObjects((cpu_num + 1) / 2, hEnd, TRUE, INFINITE);	// サブ・スレッドの計算終了の合図を待つ
-		th->size = 0;	// 2nd encode
-		th->off += 1;	// 計算を開始するソース・ブロックの番号
-		if (th->off == 0)	// エラーや実験時以外は th->off は 0 にならない
+		WaitForMultipleObjects(cpu_num1, hEnd, TRUE, INFINITE);	// サブ・スレッドの計算終了の合図を待つ
+		src_off += 1;	// 計算を開始するソース・ブロックの番号
+		if (src_off == 0)	// 1st encode しなかった場合（src_off = 0）は、生成ブロックをゼロ埋めする
 			memset(p_buf, 0, (size_t)unit_size * parity_num);
 #ifdef TIMER
-		j = (th->off - source_off) * 1000 / read_num;
-		printf("partial encode = %d (%d.%d%%)\n", th->off - source_off, j / 10, j % 10);
+		j = (src_off - source_off) * 1000 / read_num;
+		printf("partial encode = %d / %d (%d.%d%%), source_off = %d\n", src_off - source_off, read_num, j / 10, j % 10, source_off);
 #endif
 
-		// VRAM のサイズに応じて分割する
-		cover_from = th->off - source_off;
-		i = (read_num - cover_from + cover_max - 1) / cover_max;	// 何回に分けて処理するか
-		cover_num = (read_num - cover_from + i - 1) / i;	// 一度に処理する量を平均化する
-		//printf("cover range = %d, cover_num = %d\n", read_num - cover_from, cover_num);
-		while (cover_from < read_num){
+		// GPU と CPU のどちらに最適化するかが難しい
+		src_off -= source_off;	// バッファー内でのソース・ブロックの位置にする
+		src_num = src_max;	// 一度に処理するソース・ブロックの数を制限する
+		if (src_num > vram_max){	// VRAM に収まらない場合は、VRAM のサイズに応じて分割する
+			src_num = vram_max & ~1;	// 減らして偶数にする（元が奇数なら分割数が増えるかも）
+			i = (read_num - src_off + src_num - 1) / src_num;	// 何回に分けて処理するか
+			src_num = (read_num - src_off + i - 1) / i;	// 一度に処理する量を平均化する
+			src_num = (src_num + 1) & ~1;	// 増やして偶数にする
+		}
+#ifdef TIMER
+		printf("remain = %d, src_off = %d, src_num = %d\n", read_num - src_off, src_off, src_num);
+#endif
+		while (src_off < read_num){
 			// ソース・ブロックを何個ずつ処理するか
-			if (cover_from + cover_num > read_num)
-				cover_num = read_num - cover_from;
-			//printf("cover_from = %d, cover_num = %d\n", cover_from, cover_num);
+			if (src_off + src_num > read_num){
+				src_num = read_num - src_off;
+#ifdef TIMER
+				printf("last1: src_off = %d, src_num = %d\n", src_off, src_num);
+#endif
+			} else if (src_off + src_num * 2 - 1 >= read_num){
+				src_num = read_num - src_off;
+				if (src_num > vram_max){	// VRAM のサイズまでにする
+					src_num = (src_num + 1) / 2;	// 半分にする
+					src_num = (src_num + 1) & ~1;	// 偶数にする
+				}
+#ifdef TIMER
+				printf("last2: src_off = %d, src_num = %d\n", src_off, src_num);
+#endif
+			}
 
 			// GPU と CPU がスレッドごとにパリティ・ブロックを計算する
-			th->buf = buf + (size_t)unit_size * cover_from;
-			th->off = source_off + cover_from;	// ソース・ブロックの番号にする
-			th->count = cover_num;
+			th->buf = buf + (size_t)unit_size * src_off;
+			th->off = source_off + src_off;	// ソース・ブロックの番号にする
+			th->size = src_num;
 			th->now = -1;	// 初期値 - 1
 			//_mm_sfence();
-			for (j = 0; j < cpu_num1; j++){
+			for (j = 0; j < cpu_num; j++){
 				ResetEvent(hEnd[j]);	// リセットしておく
 				SetEvent(hRun[j]);	// サブ・スレッドに計算を開始させる
 			}
 
 			// サブ・スレッドの計算終了の合図を UPDATE_TIME だけ待ちながら、経過表示する
-			while (WaitForMultipleObjects(cpu_num1, hEnd, TRUE, UPDATE_TIME) == WAIT_TIMEOUT){
-				// th-now が最高値なので、計算が終わってるのは th-now - cpu_num1 個となる
-				j = th->now - cpu_num1;
+			while (WaitForMultipleObjects(cpu_num, hEnd, TRUE, UPDATE_TIME) == WAIT_TIMEOUT){
+				// th-now が最高値なので、計算が終わってるのは th-now + 1 - cpu_num 個となる
+				j = th->now + 1 - cpu_num;
 				if (j < 0)
 					j = 0;
+				j /= chunk_num;	// chunk数で割ってブロック数にする
 				// 経過表示（UPDATE_TIME 時間待った場合なので、必ず経過してるはず）
-				if (print_progress((int)(((prog_num + cover_num * j) * 1000) / prog_base))){
+				if (print_progress((int)(((prog_num + src_num * j) * 1000) / prog_base))){
 					err = 2;
 					goto error_end;
 				}
 				time_last = GetTickCount();
 			}
-			if (th->size != 0){	// エラー発生
-				i = th->size;
+			if (th->len != 0){	// エラー発生
+				i = th->len;
 				printf("error, gpu-thread, %d, %d\n", i & 0xFF, i >> 8);
 				err = 1;
 				goto error_end;
 			}
 
 			// 経過表示
-			prog_num += cover_num * parity_num;
+			prog_num += src_num * parity_num;
 			if (GetTickCount() - time_last >= UPDATE_TIME){
 				if (print_progress((int)((prog_num * 1000) / prog_base))){
 					err = 2;
@@ -2247,12 +2310,11 @@ time_read += GetTickCount() - time_start;
 				time_last = GetTickCount();
 			}
 
-			cover_from += cover_num;
+			src_off += src_num;
 		}
 
 		source_off += read_num;
 	}
-	//printf("\nprog_num = %I64d / %I64d\n", prog_num, prog_base);
 
 #ifdef TIMER
 time_start = GetTickCount();
@@ -2268,12 +2330,14 @@ time_write = GetTickCount() - time_start;
 #ifdef TIMER
 printf("read   %d.%03d sec\n", time_read / 1000, time_read % 1000);
 printf("write  %d.%03d sec\n", time_write / 1000, time_write % 1000);
+if (prog_num != prog_base - prog_write * parity_num)
+	printf(" prog_num = %I64d != %I64d\n", prog_num, prog_base - prog_write * parity_num);
 #endif
 	info_OpenCL(buf, MEM_UNIT);	// デバイス情報を表示する
 
 error_end:
 	InterlockedExchange(&(th->now), INT_MAX / 2);	// サブ・スレッドの計算を中断する
-	for (j = 0; j < cpu_num1; j++){
+	for (j = 0; j < cpu_num; j++){
 		if (hSub[j]){	// サブ・スレッドを終了させる
 			SetEvent(hRun[j]);
 			WaitForSingleObject(hSub[j], INFINITE);
diff --git a/source/par2j/source.cl b/source/par2j/source.cl
index 78aa92c..0769e85 100644
--- a/source/par2j/source.cl
+++ b/source/par2j/source.cl
@@ -18,7 +18,9 @@ __kernel void method1(
 	__global uint *src,
 	__global uint *dst,
 	__global ushort *factors,
-	int blk_num)
+	int blk_num,
+	int offset,
+	int length)
 {
 	__local uint mtab[512];
 	int i, blk;
@@ -27,14 +29,15 @@ __kernel void method1(
 	const int work_size = get_global_size(0);
 	const int table_id = get_local_id(0);
 
-	for (i = work_id; i < BLK_SIZE; i += work_size)
+	src += offset;
+	for (i = work_id; i < length; i += work_size)
 		dst[i] = 0;
 
 	for (blk = 0; blk < blk_num; blk++){
 		calc_table(mtab, table_id, factors[blk]);
 		barrier(CLK_LOCAL_MEM_FENCE);
 
-		for (i = work_id; i < BLK_SIZE; i += work_size){
+		for (i = work_id; i < length; i += work_size){
 			v = src[i];
 			sum = mtab[(uchar)(v >> 16)] ^ mtab[256 + (v >> 24)];
 			sum <<= 16;
@@ -50,7 +53,9 @@ __kernel void method2(
 	__global uint *src,
 	__global uint *dst,
 	__global ushort *factors,
-	int blk_num)
+	int blk_num,
+	int offset,
+	int length)
 {
 	__local uint mtab[512];
 	int i, blk, pos;
@@ -59,7 +64,8 @@ __kernel void method2(
 	const int work_size = get_global_size(0) * 2;
 	const int table_id = get_local_id(0);
 
-	for (i = work_id; i < BLK_SIZE; i += work_size){
+	src += offset;
+	for (i = work_id; i < length; i += work_size){
 		dst[i    ] = 0;
 		dst[i + 1] = 0;
 	}
@@ -68,7 +74,7 @@ __kernel void method2(
 		calc_table(mtab, table_id, factors[blk]);
 		barrier(CLK_LOCAL_MEM_FENCE);
 
-		for (i = work_id; i < BLK_SIZE; i += work_size){
+		for (i = work_id; i < length; i += work_size){
 			pos = (i & ~7) + ((i & 7) >> 1);
 			lo = src[pos    ];
 			hi = src[pos + 4];
@@ -86,64 +92,13 @@ __kernel void method2(
 	}
 }
 
-__kernel void method3(
-	__global uint *src,
-	__global uint *dst,
-	__global ushort *factors,
-	int blk_num)
-{
-	__global uint *blk_src;
-	__local uint mtab[512];
-	int i, blk, chk_size, remain, pos;
-	uint lo, hi, sum1, sum2;
-	const int work_id = get_global_id(0) * 2;
-	const int work_size = get_global_size(0) * 2;
-	const int table_id = get_local_id(0);
-
-	remain = BLK_SIZE;
-	chk_size = CHK_SIZE;
-	while (remain > 0){
-		if (chk_size > remain)
-			chk_size = remain;
-
-		for (i = work_id; i < chk_size; i += work_size){
-			dst[i    ] = 0;
-			dst[i + 1] = 0;
-		}
-
-		blk_src = src;
-		for (blk = 0; blk < blk_num; blk++){
-			calc_table(mtab, table_id, factors[blk]);
-			barrier(CLK_LOCAL_MEM_FENCE);
-
-			for (i = work_id; i < chk_size; i += work_size){
-				pos = (i & ~7) + ((i & 7) >> 1);
-				lo = blk_src[pos    ];
-				hi = blk_src[pos + 4];
-				sum1 = mtab[(uchar)(lo >> 16)] ^ mtab[256 + (uchar)(hi >> 16)];
-				sum2 = mtab[lo >> 24] ^ mtab[256 + (hi >> 24)];
-				sum1 <<= 16;
-				sum2 <<= 16;
-				sum1 ^= mtab[(uchar)lo] ^ mtab[256 + (uchar)hi];
-				sum2 ^= mtab[(uchar)(lo >> 8)] ^ mtab[256 + (uchar)(hi >> 8)];
-				dst[pos    ] ^= (sum1 & 0x00FF00FF) | ((sum2 & 0x00FF00FF) << 8);
-				dst[pos + 4] ^= ((sum1 & 0xFF00FF00) >> 8) | (sum2 & 0xFF00FF00);
-			}
-			blk_src += BLK_SIZE;
-			barrier(CLK_LOCAL_MEM_FENCE);
-		}
-
-		src += CHK_SIZE;
-		dst += CHK_SIZE;
-		remain -= CHK_SIZE;
-	}
-}
-
 __kernel void method4(
 	__global uint *src,
 	__global uint *dst,
 	__global ushort *factors,
-	int blk_num)
+	int blk_num,
+	int offset,
+	int length)
 {
 	__local int table[16];
 	__local uint cache[256];
@@ -152,7 +107,8 @@ __kernel void method4(
 	const int work_id = get_global_id(0);
 	const int work_size = get_global_size(0);
 
-	for (i = work_id; i < BLK_SIZE; i += work_size)
+	src += offset;
+	for (i = work_id; i < length; i += work_size)
 		dst[i] = 0;
 
 	for (blk = 0; blk < blk_num; blk++){
@@ -166,7 +122,7 @@ __kernel void method4(
 		}
 		barrier(CLK_LOCAL_MEM_FENCE);
 
-		for (i = work_id; i < BLK_SIZE; i += work_size){
+		for (i = work_id; i < length; i += work_size){
 			pos = i & 255;
 			cache[pos] = src[i];
 			barrier(CLK_LOCAL_MEM_FENCE);
diff --git a/source/par2j/version.h b/source/par2j/version.h
index c9e0ab1..a7db363 100644
--- a/source/par2j/version.h
+++ b/source/par2j/version.h
@@ -1,2 +1,2 @@
-﻿#define FILE_VERSION "1.3.2.8"	// ファイルのバージョン番号
-#define PRODUCT_VERSION "1.3.2"	// 製品のバージョン番号
+﻿#define FILE_VERSION "1.3.3.0"	// ファイルのバージョン番号
+#define PRODUCT_VERSION "1.3.3"	// 製品のバージョン番号