Improve GPU function

This commit is contained in:
Yutaka Sawada
2023-10-23 10:54:28 +09:00
committed by GitHub
parent 82197ac0d0
commit fb72e811d0
18 changed files with 1022 additions and 546 deletions

View File

@@ -18,9 +18,7 @@ __kernel void method1(
__global uint *src,
__global uint *dst,
__global ushort *factors,
-int blk_num,
-int offset,
-int length)
+int blk_num)
{
__local uint mtab[512];
int i, blk;
@@ -29,15 +27,14 @@ __kernel void method1(
const int work_size = get_global_size(0);
const int table_id = get_local_id(0);
-src += offset;
-for (i = work_id; i < length; i += work_size)
+for (i = work_id; i < BLK_SIZE; i += work_size)
dst[i] = 0;
for (blk = 0; blk < blk_num; blk++){
calc_table(mtab, table_id, factors[blk]);
barrier(CLK_LOCAL_MEM_FENCE);
-for (i = work_id; i < length; i += work_size){
+for (i = work_id; i < BLK_SIZE; i += work_size){
v = src[i];
sum = mtab[(uchar)(v >> 16)] ^ mtab[256 + (v >> 24)];
sum <<= 16;
@@ -53,9 +50,7 @@ __kernel void method2(
__global uint *src,
__global uint *dst,
__global ushort *factors,
-int blk_num,
-int offset,
-int length)
+int blk_num)
{
__local uint mtab[512];
int i, blk, pos;
@@ -64,8 +59,7 @@ __kernel void method2(
const int work_size = get_global_size(0) * 2;
const int table_id = get_local_id(0);
-src += offset;
-for (i = work_id; i < length; i += work_size){
+for (i = work_id; i < BLK_SIZE; i += work_size){
dst[i ] = 0;
dst[i + 1] = 0;
}
@@ -74,7 +68,7 @@ __kernel void method2(
calc_table(mtab, table_id, factors[blk]);
barrier(CLK_LOCAL_MEM_FENCE);
-for (i = work_id; i < length; i += work_size){
+for (i = work_id; i < BLK_SIZE; i += work_size){
pos = (i & ~7) + ((i & 7) >> 1);
lo = src[pos ];
hi = src[pos + 4];
@@ -96,9 +90,7 @@ __kernel void method4(
__global uint *src,
__global uint *dst,
__global ushort *factors,
-int blk_num,
-int offset,
-int length)
+int blk_num)
{
__local int table[16];
__local uint cache[256];
@@ -107,8 +99,7 @@ __kernel void method4(
const int work_id = get_global_id(0);
const int work_size = get_global_size(0);
-src += offset;
-for (i = work_id; i < length; i += work_size)
+for (i = work_id; i < BLK_SIZE; i += work_size)
dst[i] = 0;
for (blk = 0; blk < blk_num; blk++){
@@ -122,7 +113,7 @@ __kernel void method4(
}
barrier(CLK_LOCAL_MEM_FENCE);
-for (i = work_id; i < length; i += work_size){
+for (i = work_id; i < BLK_SIZE; i += work_size){
pos = i & 255;
cache[pos] = src[i];
barrier(CLK_LOCAL_MEM_FENCE);