Optimization for CPU cache

This commit is contained in:
Yutaka Sawada
2023-09-24 21:26:09 +09:00
committed by GitHub
parent 3024186aa6
commit 54931fc0e7
18 changed files with 2241 additions and 1296 deletions

View File

@@ -18,7 +18,9 @@ __kernel void method1(
__global uint *src,
__global uint *dst,
__global ushort *factors,
int blk_num)
int blk_num,
int offset,
int length)
{
__local uint mtab[512];
int i, blk;
@@ -27,14 +29,15 @@ __kernel void method1(
const int work_size = get_global_size(0);
const int table_id = get_local_id(0);
for (i = work_id; i < BLK_SIZE; i += work_size)
src += offset;
for (i = work_id; i < length; i += work_size)
dst[i] = 0;
for (blk = 0; blk < blk_num; blk++){
calc_table(mtab, table_id, factors[blk]);
barrier(CLK_LOCAL_MEM_FENCE);
for (i = work_id; i < BLK_SIZE; i += work_size){
for (i = work_id; i < length; i += work_size){
v = src[i];
sum = mtab[(uchar)(v >> 16)] ^ mtab[256 + (v >> 24)];
sum <<= 16;
@@ -50,7 +53,9 @@ __kernel void method2(
__global uint *src,
__global uint *dst,
__global ushort *factors,
int blk_num)
int blk_num,
int offset,
int length)
{
__local uint mtab[512];
int i, blk, pos;
@@ -59,7 +64,8 @@ __kernel void method2(
const int work_size = get_global_size(0) * 2;
const int table_id = get_local_id(0);
for (i = work_id; i < BLK_SIZE; i += work_size){
src += offset;
for (i = work_id; i < length; i += work_size){
dst[i ] = 0;
dst[i + 1] = 0;
}
@@ -68,7 +74,7 @@ __kernel void method2(
calc_table(mtab, table_id, factors[blk]);
barrier(CLK_LOCAL_MEM_FENCE);
for (i = work_id; i < BLK_SIZE; i += work_size){
for (i = work_id; i < length; i += work_size){
pos = (i & ~7) + ((i & 7) >> 1);
lo = src[pos ];
hi = src[pos + 4];
@@ -86,64 +92,13 @@ __kernel void method2(
}
}
__kernel void method3(
__global uint *src,
__global uint *dst,
__global ushort *factors,
int blk_num)
{
__global uint *blk_src;
__local uint mtab[512];
int i, blk, chk_size, remain, pos;
uint lo, hi, sum1, sum2;
const int work_id = get_global_id(0) * 2;
const int work_size = get_global_size(0) * 2;
const int table_id = get_local_id(0);
remain = BLK_SIZE;
chk_size = CHK_SIZE;
while (remain > 0){
if (chk_size > remain)
chk_size = remain;
for (i = work_id; i < chk_size; i += work_size){
dst[i ] = 0;
dst[i + 1] = 0;
}
blk_src = src;
for (blk = 0; blk < blk_num; blk++){
calc_table(mtab, table_id, factors[blk]);
barrier(CLK_LOCAL_MEM_FENCE);
for (i = work_id; i < chk_size; i += work_size){
pos = (i & ~7) + ((i & 7) >> 1);
lo = blk_src[pos ];
hi = blk_src[pos + 4];
sum1 = mtab[(uchar)(lo >> 16)] ^ mtab[256 + (uchar)(hi >> 16)];
sum2 = mtab[lo >> 24] ^ mtab[256 + (hi >> 24)];
sum1 <<= 16;
sum2 <<= 16;
sum1 ^= mtab[(uchar)lo] ^ mtab[256 + (uchar)hi];
sum2 ^= mtab[(uchar)(lo >> 8)] ^ mtab[256 + (uchar)(hi >> 8)];
dst[pos ] ^= (sum1 & 0x00FF00FF) | ((sum2 & 0x00FF00FF) << 8);
dst[pos + 4] ^= ((sum1 & 0xFF00FF00) >> 8) | (sum2 & 0xFF00FF00);
}
blk_src += BLK_SIZE;
barrier(CLK_LOCAL_MEM_FENCE);
}
src += CHK_SIZE;
dst += CHK_SIZE;
remain -= CHK_SIZE;
}
}
__kernel void method4(
__global uint *src,
__global uint *dst,
__global ushort *factors,
int blk_num)
int blk_num,
int offset,
int length)
{
__local int table[16];
__local uint cache[256];
@@ -152,7 +107,8 @@ __kernel void method4(
const int work_id = get_global_id(0);
const int work_size = get_global_size(0);
for (i = work_id; i < BLK_SIZE; i += work_size)
src += offset;
for (i = work_id; i < length; i += work_size)
dst[i] = 0;
for (blk = 0; blk < blk_num; blk++){
@@ -166,7 +122,7 @@ __kernel void method4(
}
barrier(CLK_LOCAL_MEM_FENCE);
for (i = work_id; i < BLK_SIZE; i += work_size){
for (i = work_id; i < length; i += work_size){
pos = i & 255;
cache[pos] = src[i];
barrier(CLK_LOCAL_MEM_FENCE);