Interesting... you decompiled or extracted my code, I see... so will we get an Nvidia update for your miner soon?
To be open about it: my OpenCL code uses the Blake2B routine from the older ZCash miner zogminer. You can find the code here:
There you will also find the comment snippet you cited. That miner is under the MIT license, so I think I am fine to use it. You are right that nheqminer contains the same comment, but it is CUDA and not OpenCL.
Since you should not be the only one to see it, here is the full blake2b code of lolMiner 0.33:
/*
 * round0 - hash one (tid, gId) pair and distribute the digest into buckets.
 *
 * Runs a 12-round BLAKE2b-style compression over blake_state with a single
 * non-zero message word (word1), then cuts the result into up to five uint4
 * records and appends each of them to a slot in `results`.
 *
 * blake_state: eight 64-bit words; loaded into v[0..7] and XORed back into
 *              the result after the rounds.
 * results:     output array, written at results[bucket*bSize + pos].
 * counter:     per-bucket fill counters in __local memory, advanced with
 *              atomic_inc.
 * tid:         hash index; values above 26214 do no work. Forms the high
 *              32 bits of word1 and the record indices 5*tid .. 5*tid+4.
 * gId:         forms the low 32 bits of word1.
 *
 * gFunc, getBucket, getRem, blake_iv and bSize are defined outside this
 * function. NOTE(review): `bucket` is only ever set through
 * getBucket(bucket, ...), so getBucket/getRem must write their first
 * argument and are presumably macros - confirm against their definitions.
 */
void round0(ulong8 blake_state, __global uint4 *results, __local uint *counter, uint tid, uint gId) {
// 5 * 26214 = 131070, so tid = 26214 is the last value whose first record
// index (5*tid) is still below the 131072 limit checked further down.
if (tid > 26214) return;
ulong v[16];
// shift "i" (tid) to occupy the high 32 bits of the second ulong word in the
// message block. The lower 32 bits (gId) will be used to modify the nonce
ulong word1 = ((ulong)tid << 32) | gId;
// init vector v: first half from the caller's state, second half from the
// blake_iv table
v[0] = blake_state.s0;
v[1] = blake_state.s1;
v[2] = blake_state.s2;
v[3] = blake_state.s3;
v[4] = blake_state.s4;
v[5] = blake_state.s5;
v[6] = blake_state.s6;
v[7] = blake_state.s7;
v[8] = blake_iv[0];
v[9] = blake_iv[1];
v[10] = blake_iv[2];
v[11] = blake_iv[3];
v[12] = blake_iv[4];
v[13] = blake_iv[5];
v[14] = blake_iv[6];
v[15] = blake_iv[7];
// mix in length of data (144 is XORed into v[12])
v[12] ^= 144 /* length of "i" */;
// last block: invert every bit of v[14]
v[14] ^= (ulong)-1;
// The 12 rounds below are fully unrolled. Each round is eight gFunc calls:
// four on the columns of v viewed as a 4x4 matrix (0,4,8,12 ...), then four
// on its diagonals (0,5,10,15 ...). Exactly one call per round receives
// word1; every other trailing argument is 0.
// NOTE(review): the trailing two arguments are presumably the message words
// for that call, and the slot word1 lands in presumably follows the BLAKE2b
// message permutation for message word 1 - confirm against the spec.
// round 1
gFunc(v[0], v[4], v[8], v[12], 0, word1);
gFunc(v[1], v[5], v[9], v[13], 0, 0);
gFunc(v[2], v[6], v[10], v[14], 0, 0);
gFunc(v[3], v[7], v[11], v[15], 0, 0);
gFunc(v[0], v[5], v[10], v[15], 0, 0);
gFunc(v[1], v[6], v[11], v[12], 0, 0);
gFunc(v[2], v[7], v[8], v[13], 0, 0);
gFunc(v[3], v[4], v[9], v[14], 0, 0);
// round 2
gFunc(v[0], v[4], v[8], v[12], 0, 0);
gFunc(v[1], v[5], v[9], v[13], 0, 0);
gFunc(v[2], v[6], v[10], v[14], 0, 0);
gFunc(v[3], v[7], v[11], v[15], 0, 0);
gFunc(v[0], v[5], v[10], v[15], word1, 0);
gFunc(v[1], v[6], v[11], v[12], 0, 0);
gFunc(v[2], v[7], v[8], v[13], 0, 0);
gFunc(v[3], v[4], v[9], v[14], 0, 0);
// round 3
gFunc(v[0], v[4], v[8], v[12], 0, 0);
gFunc(v[1], v[5], v[9], v[13], 0, 0);
gFunc(v[2], v[6], v[10], v[14], 0, 0);
gFunc(v[3], v[7], v[11], v[15], 0, 0);
gFunc(v[0], v[5], v[10], v[15], 0, 0);
gFunc(v[1], v[6], v[11], v[12], 0, 0);
gFunc(v[2], v[7], v[8], v[13], 0, word1);
gFunc(v[3], v[4], v[9], v[14], 0, 0);
// round 4
gFunc(v[0], v[4], v[8], v[12], 0, 0);
gFunc(v[1], v[5], v[9], v[13], 0, word1);
gFunc(v[2], v[6], v[10], v[14], 0, 0);
gFunc(v[3], v[7], v[11], v[15], 0, 0);
gFunc(v[0], v[5], v[10], v[15], 0, 0);
gFunc(v[1], v[6], v[11], v[12], 0, 0);
gFunc(v[2], v[7], v[8], v[13], 0, 0);
gFunc(v[3], v[4], v[9], v[14], 0, 0);
// round 5
gFunc(v[0], v[4], v[8], v[12], 0, 0);
gFunc(v[1], v[5], v[9], v[13], 0, 0);
gFunc(v[2], v[6], v[10], v[14], 0, 0);
gFunc(v[3], v[7], v[11], v[15], 0, 0);
gFunc(v[0], v[5], v[10], v[15], 0, word1);
gFunc(v[1], v[6], v[11], v[12], 0, 0);
gFunc(v[2], v[7], v[8], v[13], 0, 0);
gFunc(v[3], v[4], v[9], v[14], 0, 0);
// round 6
gFunc(v[0], v[4], v[8], v[12], 0, 0);
gFunc(v[1], v[5], v[9], v[13], 0, 0);
gFunc(v[2], v[6], v[10], v[14], 0, 0);
gFunc(v[3], v[7], v[11], v[15], 0, 0);
gFunc(v[0], v[5], v[10], v[15], 0, 0);
gFunc(v[1], v[6], v[11], v[12], 0, 0);
gFunc(v[2], v[7], v[8], v[13], 0, 0);
gFunc(v[3], v[4], v[9], v[14], word1, 0);
// round 7
gFunc(v[0], v[4], v[8], v[12], 0, 0);
gFunc(v[1], v[5], v[9], v[13], word1, 0);
gFunc(v[2], v[6], v[10], v[14], 0, 0);
gFunc(v[3], v[7], v[11], v[15], 0, 0);
gFunc(v[0], v[5], v[10], v[15], 0, 0);
gFunc(v[1], v[6], v[11], v[12], 0, 0);
gFunc(v[2], v[7], v[8], v[13], 0, 0);
gFunc(v[3], v[4], v[9], v[14], 0, 0);
// round 8
gFunc(v[0], v[4], v[8], v[12], 0, 0);
gFunc(v[1], v[5], v[9], v[13], 0, 0);
gFunc(v[2], v[6], v[10], v[14], 0, word1);
gFunc(v[3], v[7], v[11], v[15], 0, 0);
gFunc(v[0], v[5], v[10], v[15], 0, 0);
gFunc(v[1], v[6], v[11], v[12], 0, 0);
gFunc(v[2], v[7], v[8], v[13], 0, 0);
gFunc(v[3], v[4], v[9], v[14], 0, 0);
// round 9
gFunc(v[0], v[4], v[8], v[12], 0, 0);
gFunc(v[1], v[5], v[9], v[13], 0, 0);
gFunc(v[2], v[6], v[10], v[14], 0, 0);
gFunc(v[3], v[7], v[11], v[15], 0, 0);
gFunc(v[0], v[5], v[10], v[15], 0, 0);
gFunc(v[1], v[6], v[11], v[12], 0, 0);
gFunc(v[2], v[7], v[8], v[13], word1, 0);
gFunc(v[3], v[4], v[9], v[14], 0, 0);
// round 10
gFunc(v[0], v[4], v[8], v[12], 0, 0);
gFunc(v[1], v[5], v[9], v[13], 0, 0);
gFunc(v[2], v[6], v[10], v[14], 0, 0);
gFunc(v[3], v[7], v[11], v[15], word1, 0);
gFunc(v[0], v[5], v[10], v[15], 0, 0);
gFunc(v[1], v[6], v[11], v[12], 0, 0);
gFunc(v[2], v[7], v[8], v[13], 0, 0);
gFunc(v[3], v[4], v[9], v[14], 0, 0);
// round 11 (word1 sits in the same slot as in round 1)
gFunc(v[0], v[4], v[8], v[12], 0, word1);
gFunc(v[1], v[5], v[9], v[13], 0, 0);
gFunc(v[2], v[6], v[10], v[14], 0, 0);
gFunc(v[3], v[7], v[11], v[15], 0, 0);
gFunc(v[0], v[5], v[10], v[15], 0, 0);
gFunc(v[1], v[6], v[11], v[12], 0, 0);
gFunc(v[2], v[7], v[8], v[13], 0, 0);
gFunc(v[3], v[4], v[9], v[14], 0, 0);
// round 12 (word1 sits in the same slot as in round 2)
gFunc(v[0], v[4], v[8], v[12], 0, 0);
gFunc(v[1], v[5], v[9], v[13], 0, 0);
gFunc(v[2], v[6], v[10], v[14], 0, 0);
gFunc(v[3], v[7], v[11], v[15], 0, 0);
gFunc(v[0], v[5], v[10], v[15], word1, 0);
gFunc(v[1], v[6], v[11], v[12], 0, 0);
gFunc(v[2], v[7], v[8], v[13], 0, 0);
gFunc(v[3], v[4], v[9], v[14], 0, 0);
// finalize: fold the upper half of v and the input state into v[0..7]
v[0] = v[0] ^ blake_state.s0 ^ v[8];
v[1] = v[1] ^ blake_state.s1 ^ v[9];
v[2] = v[2] ^ blake_state.s2 ^ v[10];
v[3] = v[3] ^ blake_state.s3 ^ v[11];
v[4] = v[4] ^ blake_state.s4 ^ v[12];
v[5] = v[5] ^ blake_state.s5 ^ v[13];
v[6] = v[6] ^ blake_state.s6 ^ v[14];
v[7] = v[7] ^ blake_state.s7 ^ v[15];
// Emit up to five records. Each record is a uint4: s0 holds the record
// index 5*tid+k, s1..s3 hold three 32-bit pieces of the digest. The upper
// half of v[7] is not used.
uint4 output;
uint bucket; uint pos;
// record 0: v[0] low, v[0] high, v[1] low.
// No index check here: for tid <= 26214, 5*tid <= 131070.
output.s0 = 5*tid;
output.s1 = (v[0] & 0xFFFFFFFF);
output.s2 = (v[0] >> 32);
output.s3 = (v[1] & 0xFFFFFFFF);
// NOTE(review): presumably getBucket derives the bucket number from s1 and
// getRem replaces s1 with the remaining bits - confirm.
getBucket(bucket, output.s1);
getRem(output.s1, output.s1);
// atomic_inc returns the counter value before the increment; that value is
// the slot inside the bucket. Once pos reaches bSize the record is dropped,
// although the counter keeps counting.
pos = atomic_inc(&counter[bucket]);
if (pos < bSize) results[bucket*bSize + pos] = output;
// record 1: v[1] high, v[2] low, v[2] high.
// No index check here either: for tid <= 26214, 5*tid+1 <= 131071.
output.s0 = 5*tid+1;
output.s1 = (v[1] >> 32);
output.s2 = (v[2] & 0xFFFFFFFF);
output.s3 = (v[2] >> 32);
getBucket(bucket, output.s1);
getRem(output.s1, output.s1);
pos = atomic_inc(&counter[bucket]);
if (pos < bSize) results[bucket*bSize + pos] = output;
// record 2: v[3] low, v[3] high, v[4] low.
// From here on the index can reach 131072 or more (tid = 26214), so each
// remaining record is only stored when its index is below that limit.
output.s0 = 5*tid+2;
output.s1 = (v[3] & 0xFFFFFFFF);
output.s2 = (v[3] >> 32);
output.s3 = (v[4] & 0xFFFFFFFF);
if (output.s0 < 131072) {
getBucket(bucket, output.s1);
getRem(output.s1, output.s1);
pos = atomic_inc(&counter[bucket]);
if (pos < bSize) results[bucket*bSize + pos] = output;
}
// record 3: v[4] high, v[5] low, v[5] high
output.s0 = 5*tid+3;
output.s1 = (v[4] >> 32);
output.s2 = (v[5] & 0xFFFFFFFF);
output.s3 = (v[5] >> 32);
if (output.s0 < 131072) {
getBucket(bucket, output.s1);
getRem(output.s1, output.s1);
pos = atomic_inc(&counter[bucket]);
if (pos < bSize) results[bucket*bSize + pos] = output;
}
// record 4: v[6] low, v[6] high, v[7] low
output.s0 = 5*tid+4;
output.s1 = (v[6] & 0xFFFFFFFF);
output.s2 = (v[6] >> 32);
output.s3 = (v[7] & 0xFFFFFFFF);
if (output.s0 < 131072) {
getBucket(bucket, output.s1);
getRem(output.s1, output.s1);
pos = atomic_inc(&counter[bucket]);
if (pos < bSize) results[bucket*bSize + pos] = output;
}
}
I hope you can see that this is taken from Zogminer but modified for Minex. nheqminer, which is also under the MIT license, contains a similar phrase. I did not look at any other sources.
By the way, how do you like my round code? That part is my own work, and I like its simplicity.
The question now is what I should do with you. I do not like binary kernels such as the ones you use, because they make the miners incompatible, i.e. I could not support Vega with binary kernels. But if you have found a simple way of extracting my OpenCL code in text form, I must come up with some more hiding... Was this really necessary?