From 978bbe4b40f0113f7ab06e0ce66c6104b763f8ec Mon Sep 17 00:00:00 2001
From: Yutaka Sawada <60930312+Yutaka-Sawada@users.noreply.github.com>
Date: Mon, 27 Nov 2023 14:19:43 +0900
Subject: [PATCH] Optimization for AMD GPU

---
 source/par2j/Command_par2j.txt |  4 +-
 source/par2j/lib_opencl.c      | 80 +++++++++++++++++++++++-----------
 source/par2j/res_par2j.rc      |  4 +-
 source/par2j/rs_decode.c       | 46 ++++++++++++++++---
 source/par2j/rs_encode.c       | 37 ++++++++++++----
 source/par2j/source.cl         | 42 ++++++++++++++++++
 source/par2j/version.h         |  2 +-
 7 files changed, 169 insertions(+), 46 deletions(-)

diff --git a/source/par2j/Command_par2j.txt b/source/par2j/Command_par2j.txt
index c229ff3..22e761f 100644
--- a/source/par2j/Command_par2j.txt
+++ b/source/par2j/Command_par2j.txt
@@ -370,8 +370,8 @@ The format is "/lc#" (# is from 1 to 32 as the number of using threads).
 255: It tries to use more threads than number of physical Cores.
 
  You may set additional combinations;
-+1024 to disable CLMUL (and use slower SSSE3 code), 
-+2048 to disable JIT (for SSE2), 
++1024 to disable CLMUL (and use slower SSSE3 code),
++2048 to disable JIT (for SSE2),
 +4096 to disable SSSE3,
 +8192 to disable AVX2,
 +256 or +512 (slower device) to enable GPU acceleration.
diff --git a/source/par2j/lib_opencl.c b/source/par2j/lib_opencl.c
index 471697a..ea4411a 100644
--- a/source/par2j/lib_opencl.c
+++ b/source/par2j/lib_opencl.c
@@ -1,5 +1,5 @@
 ﻿// lib_opencl.c
-// Copyright : 2023-10-22 Yutaka Sawada
+// Copyright : 2023-11-27 Yutaka Sawada
 // License : GPL
 
 #ifndef _WIN32_WINNT
@@ -115,7 +115,7 @@ int init_OpenCL(int unit_size, int *src_max)
 {
 	char buf[2048], *p_source;
 	int err = 0, i, j;
-	int gpu_power, count;
+	int gpu_power, count, gpu_flag;
 	size_t data_size, alloc_max;
 	//FILE *fp;
 	HRSRC res;
@@ -138,7 +138,7 @@ int init_OpenCL(int unit_size, int *src_max)
 	API_clGetKernelWorkGroupInfo fn_clGetKernelWorkGroupInfo;
 	cl_int ret;
 	cl_uint num_platforms = 0, num_devices = 0, num_groups, param_value;
-	cl_ulong param_value8;
+	cl_ulong param_value8, param_value4;
 	cl_platform_id platform_id[MAX_DEVICE], selected_platform;	// Intel, AMD, Nvidia などドライバーの提供元
 	cl_device_id device_id[MAX_DEVICE], selected_device;	// CPU や GPU など
 	cl_program program;
@@ -265,44 +265,43 @@ int init_OpenCL(int unit_size, int *src_max)
 			ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_VERSION, sizeof(buf), buf, NULL);
 			if (ret == CL_SUCCESS)
 				printf("Device version = %s\n", buf);
-			ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &param_value8, NULL);
-			if (ret == CL_SUCCESS)
-				printf("LOCAL_MEM_SIZE = %I64d KB\n", param_value8 >> 10);
-
-			// 無理とは思うけど、一応チェックする
-//#define CL_DEVICE_SVM_CAPABILITIES                  0x1053
-//#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER           (1 << 0)
-//#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER             (1 << 1)
-//#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM             (1 << 2)
-//#define CL_DEVICE_SVM_ATOMICS                       (1 << 3)
-//			ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_ulong), &param_value8, NULL);
-//			if (ret == CL_INVALID_VALUE)
-//				printf("Shared Virtual Memory is not supported\n");
-//			if (ret == CL_SUCCESS)
-//				printf("Shared Virtual Memory = 0x%I64X\n", param_value8);
 #endif
 
+			// 取得できなくてもエラーにしない
+			param_value = 0;	// CL_DEVICE_HOST_UNIFIED_MEMORY は OpenCL 2.0 以降で非推奨になった
+			ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_uint), &param_value, NULL);
+#ifdef DEBUG_OUTPUT
+			if (ret == CL_SUCCESS)
+				printf("HOST_UNIFIED_MEMORY = %d\n", param_value);
+#endif
+			if (param_value != 0)
+				param_value = 1;
+			param_value4 = 0;	// local memory が多い時だけ処理を変える
+			ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &param_value4, NULL);
+#ifdef DEBUG_OUTPUT
+			if (ret == CL_SUCCESS)
+				printf("LOCAL_MEM_SIZE = %I64d KB\n", param_value4 >> 10);
+#endif
+
+			// 取得できない場合はエラーにする
 			ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &param_value8, NULL);
 			if (ret != CL_SUCCESS)
 				continue;
 #ifdef DEBUG_OUTPUT
 			printf("MAX_MEM_ALLOC_SIZE = %I64d MB\n", param_value8 >> 20);
 #endif
-
 			ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &num_groups, NULL);
 			if (ret != CL_SUCCESS)
 				continue;
 			ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &data_size, NULL);
 			if (ret != CL_SUCCESS)
 				continue;
-			// CL_DEVICE_HOST_UNIFIED_MEMORY は OpenCL 2.0 以降で非推奨になったので、参照しない
-
 #ifdef DEBUG_OUTPUT
 			printf("MAX_COMPUTE_UNITS = %d\n", num_groups);
 			printf("MAX_WORK_GROUP_SIZE = %zd\n", data_size);
 #endif
-			// MAX_COMPUTE_UNITS * MAX_WORK_GROUP_SIZE で計算力を測る
-			count = (int)data_size * num_groups;
+			// MAX_COMPUTE_UNITS * MAX_WORK_GROUP_SIZE で計算力を測る、外付けGPUなら値を倍にする
+			count = (2 - param_value) * (int)data_size * num_groups;
 			count *= OpenCL_method;	// 符号を変える
 			//printf("prev = %d, now = %d\n", gpu_power, count);
 			if ((count > gpu_power) && (data_size >= 256) &&	// 256以上ないとテーブルを作れない
@@ -312,6 +311,9 @@ int init_OpenCL(int unit_size, int *src_max)
 				selected_platform = platform_id[i];
 				OpenCL_group_num = num_groups;	// ワークグループ数は COMPUTE_UNITS 数にする
 				alloc_max = (size_t)param_value8;
+				gpu_flag = param_value;	// 0 = discrete GPU, 1 = integrated GPU
+				if (param_value4 >= 32768)
+					gpu_flag |= 2;	// local memory が 32KB 以上あるかどうか
 
 				// AMD や Intel の GPU ではメモリー領域が全体の 1/4 とは限らない
 				ret = fn_clGetDeviceInfo(device_id[j], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &param_value8, NULL);
@@ -355,7 +357,11 @@ int init_OpenCL(int unit_size, int *src_max)
 
 	// 計算方式を選択する
 	if ((((cpu_flag & 0x101) == 1) || ((cpu_flag & 0x110) == 0x10)) && (sse_unit == 32)){
-		OpenCL_method = 2;	// SSSE3 & ALTMAP または AVX2 ならデータの並び替え対応版を使う
+		if (gpu_flag & 2){
+			OpenCL_method = 3;	// local memory が 32KB 以上あれば 16-byte ずつアクセスする
+		} else {
+			OpenCL_method = 2;	// SSSE3 & ALTMAP または AVX2 ならデータの並び替え対応版を使う
+		}
 	} else if (((cpu_flag & 128) != 0) && (sse_unit == 256)){
 		OpenCL_method = 4;	// JIT(SSE2) は bit ごとに上位から 16バイトずつ並ぶ
 		// ローカルのテーブルサイズが異なることに注意
@@ -369,6 +375,9 @@ int init_OpenCL(int unit_size, int *src_max)
 	if (OpenCL_method == 2){
 		// work item 一個が 8バイトずつ計算する、256個なら work group ごとに 2KB 担当する
 		data_size = unit_size / 2048;
+	} else if (OpenCL_method == 3){
+		// work item 一個が 32バイトずつ計算する、256個なら work group ごとに 8KB 担当する
+		data_size = unit_size / 8192;
 	} else {
 		// work item 一個が 4バイトずつ計算する、256個なら work group ごとに 1KB 担当する
 		data_size = unit_size / 1024;
@@ -378,6 +387,17 @@ int init_OpenCL(int unit_size, int *src_max)
 		printf("Number of work groups is reduced to %zd\n", OpenCL_group_num);
 	}
 
+	// データへのアクセス方法をデバイスによって変える
+	if (gpu_flag & 1){
+		OpenCL_method |= 8;	// Integrated GPU なら CL_MEM_USE_HOST_PTR を使う
+	} else {	// Discrete GPU なら NVIDIA のだけ flag を変える
+		ret = fn_clGetDeviceInfo(selected_device, CL_DEVICE_VERSION, sizeof(buf), buf, NULL);
+		if (ret == CL_SUCCESS){
+			if (strstr(buf, "CUDA") != NULL)
+				OpenCL_method |= 8;	// NVIDIA GPU なら CL_MEM_USE_HOST_PTR を使う
+		}
+	}
+
 	// 最大で何ブロック分のメモリー領域を保持できるのか（ここではまだ確保しない）
 	// 後で実際に確保する量はこれよりも少なくなる
 	count = (int)(alloc_max / unit_size);	// 確保できるメモリー量から逆算する
@@ -668,11 +688,19 @@ int gpu_copy_blocks(
 {
 	size_t data_size;
 	cl_int ret;
+	cl_mem_flags flags;
 
 	// Integrated GPU と Discrete GPU の違いに関係なく、使う分だけ毎回メモリー領域を確保する
 	data_size = (size_t)unit_size * src_num;
-	// Intel GPUならZeroCopyできる、GeForce GPUでもメモリー消費量が少なくてコピーが速い
-	OpenCL_src = gfn_clCreateBuffer(OpenCL_context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, data_size, data, &ret);
+	if (OpenCL_method & 8){	// AMD's APU や Integrated GPU なら ZeroCopy する
+		// 実際に比較してみると GeForce GPU でもメモリー消費量が少なくてコピーが速い
+		// NVIDIA GPU は CL_MEM_USE_HOST_PTR でも VRAM 上にキャッシュするので速いらしい
+		flags = CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR;
+	} else {	// Discrete GPU ならデータを VRAM にコピーする
+		// AMD GPU は明示的にコピーするよう指定しないといけない
+		flags = CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR;
+	}
+	OpenCL_src = gfn_clCreateBuffer(OpenCL_context, flags, data_size, data, &ret);
 	if (ret != CL_SUCCESS)
 		return (ret << 8) | 1;
 #ifdef DEBUG_OUTPUT
diff --git a/source/par2j/res_par2j.rc b/source/par2j/res_par2j.rc
index 3f6a941..ef90b13 100644
--- a/source/par2j/res_par2j.rc
+++ b/source/par2j/res_par2j.rc
@@ -1,7 +1,7 @@
 1 RT_STRING ".\\source.cl"
 
 1 VERSIONINFO
-FILEVERSION 1,3,3,1
+FILEVERSION 1,3,3,2
 PRODUCTVERSION 1,3,3,0
 FILEOS 0x40004
 FILETYPE 0x1
@@ -13,7 +13,7 @@ BLOCK "StringFileInfo"
 		VALUE "FileDescription", "PAR2 client"
 		VALUE "LegalCopyright", "Copyright (C) 2023 Yutaka Sawada"
 		VALUE "ProductName", "par2j"
-		VALUE "FileVersion", "1.3.3.1"
+		VALUE "FileVersion", "1.3.3.2"
 		VALUE "ProductVersion", "1.3.3.0"
 	}
 }
diff --git a/source/par2j/rs_decode.c b/source/par2j/rs_decode.c
index a6a5538..06870e3 100644
--- a/source/par2j/rs_decode.c
+++ b/source/par2j/rs_decode.c
@@ -1,5 +1,5 @@
 ﻿// rs_decode.c
-// Copyright : 2023-10-29 Yutaka Sawada
+// Copyright : 2023-11-25 Yutaka Sawada
 // License : GPL
 
 #ifndef _UNICODE
@@ -1731,10 +1731,21 @@ skip_count++;
 #endif
 				} else if (src_off + src_num + src_max > source_num){
 					src_num = source_num - src_off - src_max;
-					if ((src_num < src_max) && (src_num + src_max <= vram_max) && (gpu_end * 2 > cpu_end)){
-						src_num += src_max;	// GPU担当量が少なくて、余裕がある場合は、残りも全て任せる
+					if (src_num < src_max){
+						if ((src_num + src_max <= vram_max) && (gpu_end * 2 > cpu_end)){
+							src_num += src_max;	// GPU担当量が少なくて、余裕がある場合は、残りも全て任せる
+#ifdef TIMER
+							printf("GPU last +: src_off = %d, src_num = %d + %d\n", src_off, src_num - src_max, src_max);
+#endif
+						} else if (src_num < src_max / 4){
+							src_num = src_max / 4;	// src_num が小さくなり過ぎないようにする
+#ifdef TIMER
+							printf("GPU last ?: src_off = %d, src_num = %d\n", src_off, src_num);
+						} else {
+							printf("GPU last -: src_off = %d, src_num = %d\n", src_off, src_num);
+#endif
+						}
 #ifdef TIMER
-						printf("GPU last +: src_off = %d, src_num = %d + %d\n", src_off, src_num - src_max, src_max);
 					} else {
 						printf("GPU last 2: src_off = %d, src_num = %d\n", src_off, src_num);
 #endif
@@ -2318,10 +2329,31 @@ time_read += GetTickCount() - time_start;
 #endif
 				} else if (src_off + src_num + src_max > read_num){
 					src_num = read_num - src_off - src_max;
-					if ((src_num < src_max) && (src_num + src_max <= vram_max) && (gpu_end * 2 > cpu_end)){
-						src_num += src_max;	// GPU担当量が少なくて、余裕がある場合は、残りも全て任せる
+
+					if (src_num <= 0){
+						src_num = src_max / 4;	// src_num が 0にならないようにする
+#ifdef TIMER
+						printf("GPU last ?: src_off = %d, src_num = %d\n", src_off, src_num);
+#endif
+					} else 
+
+
+
+					if (src_num < src_max){
+						if ((src_num + src_max <= vram_max) && (gpu_end * 2 > cpu_end)){
+							src_num += src_max;	// GPU担当量が少なくて、余裕がある場合は、残りも全て任せる
+#ifdef TIMER
+							printf("GPU last +: src_off = %d, src_num = %d + %d\n", src_off, src_num - src_max, src_max);
+#endif
+						} else if (src_num < src_max / 4){
+							src_num = src_max / 4;	// src_num が小さくなり過ぎないようにする
+#ifdef TIMER
+							printf("GPU last ?: src_off = %d, src_num = %d\n", src_off, src_num);
+						} else {
+							printf("GPU last -: src_off = %d, src_num = %d\n", src_off, src_num);
+#endif
+						}
 #ifdef TIMER
-						printf("GPU last +: src_off = %d, src_num = %d + %d\n", src_off, src_num - src_max, src_max);
 					} else {
 						printf("GPU last 2: src_off = %d, src_num = %d\n", src_off, src_num);
 #endif
diff --git a/source/par2j/rs_encode.c b/source/par2j/rs_encode.c
index 98678e2..969a4a0 100644
--- a/source/par2j/rs_encode.c
+++ b/source/par2j/rs_encode.c
@@ -1,5 +1,5 @@
 ﻿// rs_encode.c
-// Copyright : 2023-10-29 Yutaka Sawada
+// Copyright : 2023-11-25 Yutaka Sawada
 // License : GPL
 
 #ifndef _UNICODE
@@ -1817,11 +1817,21 @@ skip_count++;
 #endif
 				} else if (src_off + src_num + src_max > source_num){
 					src_num = source_num - src_off - src_max;
-					// src_num が 0にならないように、src_num == src_max なら上の last1 にする
-					if ((src_num < src_max) && (src_num + src_max <= vram_max) && (gpu_end * 2 > cpu_end)){
-						src_num += src_max;	// GPU担当量が少なくて、余裕がある場合は、残りも全て任せる
+					if (src_num < src_max){
+						if ((src_num + src_max <= vram_max) && (gpu_end * 2 > cpu_end)){
+							src_num += src_max;	// GPU担当量が少なくて、余裕がある場合は、残りも全て任せる
+#ifdef TIMER
+							printf("GPU last +: src_off = %d, src_num = %d + %d\n", src_off, src_num - src_max, src_max);
+#endif
+						} else if (src_num < src_max / 4){
+							src_num = src_max / 4;	// src_num が小さくなり過ぎないようにする
+#ifdef TIMER
+							printf("GPU last ?: src_off = %d, src_num = %d\n", src_off, src_num);
+						} else {
+							printf("GPU last -: src_off = %d, src_num = %d\n", src_off, src_num);
+#endif
+						}
 #ifdef TIMER
-						printf("GPU last +: src_off = %d, src_num = %d + %d\n", src_off, src_num - src_max, src_max);
 					} else {
 						printf("GPU last 2: src_off = %d, src_num = %d\n", src_off, src_num);
 #endif
@@ -2462,10 +2472,21 @@ time_read += GetTickCount() - time_start;
 #endif
 				} else if (src_off + src_num + src_max > read_num){
 					src_num = read_num - src_off - src_max;
-					if ((src_num < src_max) && (src_num + src_max <= vram_max) && (gpu_end * 2 > cpu_end)){
-						src_num += src_max;	// GPU担当量が少なくて、余裕がある場合は、残りも全て任せる
+					if (src_num < src_max){
+						if ((src_num + src_max <= vram_max) && (gpu_end * 2 > cpu_end)){
+							src_num += src_max;	// GPU担当量が少なくて、余裕がある場合は、残りも全て任せる
+#ifdef TIMER
+							printf("GPU last +: src_off = %d, src_num = %d + %d\n", src_off, src_num - src_max, src_max);
+#endif
+						} else if (src_num < src_max / 4){
+							src_num = src_max / 4;	// src_num が小さくなり過ぎないようにする
+#ifdef TIMER
+							printf("GPU last ?: src_off = %d, src_num = %d\n", src_off, src_num);
+						} else {
+							printf("GPU last -: src_off = %d, src_num = %d\n", src_off, src_num);
+#endif
+						}
 #ifdef TIMER
-						printf("GPU last +: src_off = %d, src_num = %d + %d\n", src_off, src_num - src_max, src_max);
 					} else {
 						printf("GPU last 2: src_off = %d, src_num = %d\n", src_off, src_num);
 #endif
diff --git a/source/par2j/source.cl b/source/par2j/source.cl
index e5acc46..11279d3 100644
--- a/source/par2j/source.cl
+++ b/source/par2j/source.cl
@@ -86,6 +86,48 @@ __kernel void method2(
 	}
 }
 
+__kernel void method3(
+	__global uint4 *src,
+	__global uint4 *dst,
+	__global ushort *factors,
+	int blk_num)
+{
+	__local uint mtab[512];
+	int i, blk;
+	uchar4 r0, r1, r2, r3, r4, r5, r6, r7;
+	uchar16 lo, hi;
+	const int work_id = get_global_id(0) * 2;
+	const int work_size = get_global_size(0) * 2;
+	const int table_id = get_local_id(0);
+
+	for (i = work_id; i < BLK_SIZE / 4; i += work_size){
+		dst[i    ] = 0;
+		dst[i + 1] = 0;
+	}
+
+	for (blk = 0; blk < blk_num; blk++){
+		calc_table(mtab, table_id, factors[blk]);
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		for (i = work_id; i < BLK_SIZE / 4; i += work_size){
+			lo = as_uchar16(src[i    ]);
+			hi = as_uchar16(src[i + 1]);
+			r0 = (uchar4)(as_uchar2((ushort)(mtab[lo.s0] ^ mtab[256 + hi.s0])), as_uchar2((ushort)(mtab[lo.s1] ^ mtab[256 + hi.s1])));
+			r1 = (uchar4)(as_uchar2((ushort)(mtab[lo.s2] ^ mtab[256 + hi.s2])), as_uchar2((ushort)(mtab[lo.s3] ^ mtab[256 + hi.s3])));
+			r2 = (uchar4)(as_uchar2((ushort)(mtab[lo.s4] ^ mtab[256 + hi.s4])), as_uchar2((ushort)(mtab[lo.s5] ^ mtab[256 + hi.s5])));
+			r3 = (uchar4)(as_uchar2((ushort)(mtab[lo.s6] ^ mtab[256 + hi.s6])), as_uchar2((ushort)(mtab[lo.s7] ^ mtab[256 + hi.s7])));
+			r4 = (uchar4)(as_uchar2((ushort)(mtab[lo.s8] ^ mtab[256 + hi.s8])), as_uchar2((ushort)(mtab[lo.s9] ^ mtab[256 + hi.s9])));
+			r5 = (uchar4)(as_uchar2((ushort)(mtab[lo.sa] ^ mtab[256 + hi.sa])), as_uchar2((ushort)(mtab[lo.sb] ^ mtab[256 + hi.sb])));
+			r6 = (uchar4)(as_uchar2((ushort)(mtab[lo.sc] ^ mtab[256 + hi.sc])), as_uchar2((ushort)(mtab[lo.sd] ^ mtab[256 + hi.sd])));
+			r7 = (uchar4)(as_uchar2((ushort)(mtab[lo.se] ^ mtab[256 + hi.se])), as_uchar2((ushort)(mtab[lo.sf] ^ mtab[256 + hi.sf])));
+			dst[i    ] ^= as_uint4((uchar16)(r0.x, r0.z, r1.x, r1.z, r2.x, r2.z, r3.x, r3.z, r4.x, r4.z, r5.x, r5.z, r6.x, r6.z, r7.x, r7.z));
+			dst[i + 1] ^= as_uint4((uchar16)(r0.y, r0.w, r1.y, r1.w, r2.y, r2.w, r3.y, r3.w, r4.y, r4.w, r5.y, r5.w, r6.y, r6.w, r7.y, r7.w));
+		}
+		src += BLK_SIZE / 4;
+		barrier(CLK_LOCAL_MEM_FENCE);
+	}
+}
+
 __kernel void method4(
 	__global uint *src,
 	__global uint *dst,
diff --git a/source/par2j/version.h b/source/par2j/version.h
index 5558576..c5a6049 100644
--- a/source/par2j/version.h
+++ b/source/par2j/version.h
@@ -1,2 +1,2 @@
-﻿#define FILE_VERSION "1.3.3.1"	// ファイルのバージョン番号
+﻿#define FILE_VERSION "1.3.3.2"	// ファイルのバージョン番号
 #define PRODUCT_VERSION "1.3.3"	// 製品のバージョン番号