restless
Legendary
Offline
Activity: 1151
Merit: 1001
|
|
March 09, 2014, 10:21:16 AM |
|
I put the original scanhash function from blakecoin cpuminer (block.c) into cudaminer and even then the shares were rejected with the same reason.
This leads me to believe that there may be subtle differences in the stratum implementation that I need to track down.
Christian
Sorry, I was wrong — I had messed up the conf file. Actually, changing just the .cl file was not enough, I think; all the differences are in the .cl code. I tried sgminer 4.1 + a 6850 (version compiled 19-02); there is a blake.cl in there, 9K big, by author Thomas Pornin — all shares were rejected with the same error as above. Then I downloaded the custom cgminer for blake; its kernel file is blake256.cl, 6K big. Just putting blake256.cl into the kernel folder and setting it as the algo — bam, sgminer started sending shares which were accepted. So all the difference is in the OpenCL code of blake256.cl // BLAKE-256 hash algorithm in OpenCL, 8 rounds, second block for blakecoin
// The kernel only ever reads/writes whole 32-bit words, so byte-addressable
// stores are explicitly disabled where the extension exists.
#ifdef cl_khr_byte_addressable_store
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : disable
#endif

// `uint32_t` is the lane type for the whole kernel: a 4-wide, 2-wide or
// scalar vector of 32-bit words, chosen at build time, so one work-item can
// hash several candidate nonces at once.
#ifdef VECTORS4
typedef uint4 uint32_t;
#elif defined(VECTORS2)
typedef uint2 uint32_t;
#else
typedef uint uint32_t;
#endif
typedef unsigned char uint8_t;

// Byte-swap every 32-bit lane of n (endianness conversion).
#define SWAP32_V(n) \
    (((n) << 24) | (((n) & 0xff00) << 8) | \
    (((n) >> 8) & 0xff00) | ((n) >> 24))

// BLAKE-256 chaining state: eight hash words h[0..7] plus the message bit
// counter t (only a single 32-bit counter word is kept here).
typedef struct {
    uint32_t h[8];
    uint t;
} state256;

// Blakecoin uses the reduced 8-round BLAKE-256 variant.
// NOTE(review): this constant is not referenced by the visible code — the
// round loops in blake256_compress_block hard-code their bounds (7 + 1).
#define NB_ROUNDS32 8
// BLAKE message-word permutation sigma. Rows 10..15 repeat rows 0..5 (the
// standard schedule for the full-round variant); the 8-round compression
// below only ever indexes rows 0..7.
constant uint8_t sigma[16][16] = {
    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
    {14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
    {11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
    { 7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
    { 9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
    { 2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
    {12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
    {13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
    { 6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
    {10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
    {14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
    {11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
    { 7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
    { 9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
    { 2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 }
};

// BLAKE-256 round constants c[0..15].
constant uint u256[16] = {
    0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344,
    0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89,
    0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c,
    0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917
};

// (a, b, c, d, i) argument tuples for the eight G calls of one round:
// the first four rows mix the columns of the 4x4 working state, the last
// four mix its diagonals.
constant uint8_t steps[8][5] = {
    /* column step */
    { 0, 4,  8, 12,  0 },
    { 1, 5,  9, 13,  2 },
    { 2, 6, 10, 14,  4 },
    { 3, 7, 11, 15,  6 },
    /* diagonal step */
    { 0, 5, 10, 15,  8 },
    { 1, 6, 11, 12, 10 },
    { 2, 7,  8, 13, 12 },
    { 3, 4,  9, 14, 14 }
};
// Shift-based rotate kept for reference:
//#define ROT32(x,n) (((x)<<(32-n))|( (x)>>(n)))
// Rotate right by n, expressed through OpenCL's built-in left-rotate:
// rotate(x, 32-n) == rotr(x, n). Works element-wise on vector lanes.
#define ROT32(x,n) (rotate((uint32_t)x, (uint32_t)32-n))
#define ADD32(x,y) ((uint32_t)((x) + (y)))
#define XOR32(x,y) ((uint32_t)((x) ^ (y)))

// BLAKE G mixing function on state words v[a], v[b], v[c], v[d], using
// message words m[] selected by the sigma permutation for round r.
// NOTE: the macro captures `v`, `m` and the round index `r` from the
// expansion site — it is only valid inside blake256_compress_block.
#define G(a,b,c,d,i) \
do {\
    v[a] += XOR32(m[sigma[r][i]], u256[sigma[r][i+1]]) + v[b];\
    v[d] = ROT32(XOR32(v[d],v[a]),16);\
    v[c] += v[d];\
    v[b] = ROT32(XOR32(v[b],v[c]),12);\
    v[a] += XOR32(m[sigma[r][i+1]], u256[sigma[r][i]]) + v[b]; \
    v[d] = ROT32(XOR32(v[d],v[a]), 8);\
    v[c] += v[d];\
    v[b] = ROT32(XOR32(v[b],v[c]), 7);\
} while (0)
// Compress one 64-byte message block m[0..15] into state S (8-round,
// reduced BLAKE-256 as used by blakecoin). Only h[7] is finalized at the
// end — the search kernel below tests h[7] alone, so the other seven chain
// words are deliberately left untouched.
void blake256_compress_block( private state256 *S, private uint32_t *m)
{
    private uint32_t v[16];
    // Working state: v[0..7] from the chain value, v[8..15] from constants.
    #pragma unroll 8
    for( int i = 0; i < 8; ++i ) { v[i] = S->h[i]; v[i+8] = u256[i]; };
    // Fold the bit counter into v[12]/v[13]; no salt is used.
    v[12] ^= S->t;
    v[13] ^= S->t;

    // Rounds 0..6: full rounds, eight G calls each, driven by the steps table.
    #pragma unroll 7
    for(int r = 0; r < 7; r++ )
    {
        #pragma unroll 8
        for(int j = 0; j < 8; j++)
            G( steps[j][0], steps[j][1], steps[j][2], steps[j][3], steps[j][4] );
        /*
        // column step
        G( 0, 4, 8, 12, 0 );
        G( 1, 5, 9, 13, 2 );
        G( 2, 6, 10, 14, 4 );
        G( 3, 7, 11, 15, 6 );
        // diagonal step
        G( 0, 5, 10, 15, 8 );
        G( 1, 6, 11, 12, 10 );
        G( 2, 7, 8, 13, 12 );
        G( 3, 4, 9, 14, 14 );
        */
    }
    // Final round (r = 7): the eighth G call, G(3,4,9,14,14), only writes
    // v[3], v[4], v[9] and v[14] — none of which feed the h[7] finalization
    // below — so it is skipped and only 7 of 8 G calls are made.
    int r = 7;
    #pragma unroll 7
    for(int j = 0; j < 7; j++)
        G( steps[j][0], steps[j][1], steps[j][2], steps[j][3], steps[j][4] );

    // Partial finalization: h[7] only (full BLAKE would do h[i] ^= v[i] ^ v[i+8]).
    S->h[7] ^= v[7] ^ v[15];
}
// Output-buffer protocol: slot 0xFF of `output` holds the running count of
// found nonces; SETFOUND appends a winning nonce at the current count and
// post-increments it.
// NOTE(review): presumably the host zeroes output[FOUND] before each batch;
// the read-modify-write is not atomic, so simultaneous finds from different
// work-items could race — confirm against the host-side miner code.
#define FOUND (0xFF)
#define SETFOUND(Xnonce) output[output[FOUND]++] = Xnonce

// Default work-group size; overridable at build time (e.g. -DWORKSIZE=256).
#ifndef WORKSIZE
#define WORKSIZE 64
#endif
// Scan kernel: each work-item finishes the second BLAKE-256 block of an
// 80-byte blakecoin header for 1/2/4 candidate nonces (scalar / VECTORS2 /
// VECTORS4 builds) and records nonces whose hash word h[7] is zero.
__attribute__((vec_type_hint(uint32_t)))
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(
    volatile __global uint * restrict output,
    // precalculated midstate: chain value after compressing the first
    // 64 bytes of the message on the host
    const uint h0, const uint h1, const uint h2, const uint h3,
    const uint h4, const uint h5, const uint h6, const uint h7,
    // last 12 bytes of the original message (header words 16..18)
    const uint in16, const uint in17, const uint in18 )
{
    private state256 S;
    S.h[0] = h0; S.h[1] = h1; S.h[2] = h2; S.h[3] = h3;
    S.h[4] = h4; S.h[5] = h5; S.h[6] = h6; S.h[7] = h7;

    // gx holds the candidate nonce(s); vector builds give each lane a
    // distinct nonce, offset from the scalar global id by the global size.
#if defined(VECTORS4)
    uint32_t gx = get_global_id(0);
    const uint gs = get_global_size(0);
    gx.y += gs; gx.z += gs*2; gx.w += gs*3;
#elif defined(VECTORS2)
    uint32_t gx = get_global_id(0);
    gx.y += get_global_size(0);
#else
    const uint32_t gx = get_global_id(0);
#endif
    // Build the second (final) 64-byte block: 12 header bytes + nonce,
    // then BLAKE-256 padding and the 640-bit (80-byte) message length.
    private uint32_t m[16];
    S.t = 640;            // bit counter for the final block
    m[0] = in16;
    m[1] = in17;
    m[2] = in18;
    m[3] = SWAP32_V(gx);  // nonce, byte-swapped into the header's word order
    m[4] = 0x80000000;    // padding: leading 1 bit
    #pragma unroll 8
    for (int i = 5;i<13;i++) m[i] = 0;
    m[13] = 1;            // padding: the '1' bit preceding the length field
    m[14] = 0;            // high word of the 64-bit length
    m[15] = 640;          // low word of the 64-bit length (80 bytes = 640 bits)
    blake256_compress_block( &S, &m );

    // A nonce "wins" when the finalized hash word h[7] is zero in its lane.
#if defined(VECTORS4)
    bool result = any(S.h[7] == 0);
    if (result) {
        if (S.h[7].x == 0) SETFOUND(gx.x);
        if (S.h[7].y == 0) SETFOUND(gx.y);
        if (S.h[7].z == 0) SETFOUND(gx.z);
        if (S.h[7].w == 0) SETFOUND(gx.w);
    }
#elif defined(VECTORS2)
    bool result = any(S.h[7] == 0);
    if (result) {
        if (S.h[7].x == 0) SETFOUND(gx.x);
        if (S.h[7].y == 0) SETFOUND(gx.y);
    }
#else
    // if (S.h[7] <= 0x000000FF) // from 0 to 255 low difficulty shares and above, maybe need to swap h[7] before, do not remember
    if (S.h[7] == 0) SETFOUND(gx);
#endif
}
|