4) I'm guessing that primorial search refers to finding header hashes divisible by a primorial. The CPU implementation searches for hashes that are divisible by 7# (= 2 * 3 * 5 * 7). On the CPU this takes only a tiny fraction of time. If you have a fast GPU implementation of it, you can search for hashes divisible by much larger primorials. This might get him a minor speedup.

Note that his faster primorial search will be obsolete once mining protocol v0.2 is enforced. Link:

http://www.peercointalk.org/index.php?topic=453.0Yes, that was to be expected. Forcing the hash to be a prime number makes the proof-of-work reusability infeasible. Also, searching for a block header hash that is divisible by a large primorial no longer applies. However, this does not prevent someone who is mathematically gifted to go after double SHA-256 and look for a divisibility weakness against existing prime chain origins. That is what I would do if I were interested in the financial gains from mining.

For the (Nvidia GPU, sm_20 and above) modular exponentiation, I am using the square-and-multiply (binary exponentiation) method. You cannot eliminate branching but it can be drastically reduced. The penalty for branching in this case is a 2x reduction in performance, but you can fix the exponent to reduce the overall effects of warp divergence. You can see from the partial code below (Montgomery Reduction for multiplying two 320-bit numbers) that it is almost branch free. And a vast amount of work went into memory management instead.

So how do I fix the exponent, it is easy to do on a chain-by-chain basis, for example:

We have the following chain, 19, 37, 73

Take the largest exponent which is (73-1) = 72

2^72 modulo 19 = 1

2^72 modulo 37 = 1

2^72 modulo 73 = 1

I also have another trick that I run on the CPU from left-to-right on my virtual array to deal with the partial remainders, because the exponent is fixed for the entire array, is has to do with the modular square roots of prime numbers. It is much more complicated but is runs very fast on the CPU and almost always returns the correct result for prime numbers, but not for composite numbers.

Even with all of this work, the Nvidia GTX 580 is still only 6.86x faster than the AMD Phenom II X6 1100T because of the added overhead to reduce warp divergence. Plus I am also working in the Montgomery domain from end-to-end.

__device__ void

nvidia_gfn_multiply(nvidia_word_t *rop, const int rop_interleaved,

const nvidia_word_t *op1, const int op1_interleaved,

const nvidia_word_t *op2, const int op2_interleaved,

nvidia_word_t nvidia_gfn_n)

{

nvidia_gfn_t r;

nvidia_word_t q;

nvidia_word_t c0=0, c1;

nvidia_word_t tasm=0;

r[0]=0;

r[1]=0;

r[2]=0;

r[3]=0;

r[4]=0;

r[5]=0;

r[6]=0;

r[7]=0;

r[8]=0;

r[9]=0;

tasm=0;

asm( "add.cc.u32 %0, %0, %1;" : "+r"(r[0]) : "r"(tasm));

asm( "madc.hi.u32 %0, %1, %2, 0;" : "=r"(tasm) : "r"(op1[0*op1_interleaved]), "r"(op2[0*op2_interleaved]));

asm( "mad.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r[0]) : "r"(op1[0*op1_interleaved]), "r"(op2[0*op2_interleaved]));

asm( "addc.u32 %0, %0, 0;" : "+r"(tasm));

asm( "add.cc.u32 %0, %0, %1;" : "+r"(r[1]) : "r"(tasm));

asm( "madc.hi.u32 %0, %1, %2, 0;" : "=r"(tasm) : "r"(op1[1*op1_interleaved]), "r"(op2[0*op2_interleaved]));

asm( "mad.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r[1]) : "r"(op1[1*op1_interleaved]), "r"(op2[0*op2_interleaved]));

asm( "addc.u32 %0, %0, 0;" : "+r"(tasm));

asm( "add.cc.u32 %0, %0, %1;" : "+r"(r[2]) : "r"(tasm));

asm( "madc.hi.u32 %0, %1, %2, 0;" : "=r"(tasm) : "r"(op1[2*op1_interleaved]), "r"(op2[0*op2_interleaved]));

asm( "mad.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r[2]) : "r"(op1[2*op1_interleaved]), "r"(op2[0*op2_interleaved]));

asm( "addc.u32 %0, %0, 0;" : "+r"(tasm));

asm( "add.cc.u32 %0, %0, %1;" : "+r"(r[3]) : "r"(tasm));

asm( "madc.hi.u32 %0, %1, %2, 0;" : "=r"(tasm) : "r"(op1[3*op1_interleaved]), "r"(op2[0*op2_interleaved]));

asm( "mad.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r[3]) : "r"(op1[3*op1_interleaved]), "r"(op2[0*op2_interleaved]));

asm( "addc.u32 %0, %0, 0;" : "+r"(tasm));

asm( "add.cc.u32 %0, %0, %1;" : "+r"(r[4]) : "r"(tasm));

asm( "madc.hi.u32 %0, %1, %2, 0;" : "=r"(tasm) : "r"(op1[4*op1_interleaved]), "r"(op2[0*op2_interleaved]));

asm( "mad.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r[4]) : "r"(op1[4*op1_interleaved]), "r"(op2[0*op2_interleaved]));

asm( "addc.u32 %0, %0, 0;" : "+r"(tasm));

asm( "add.cc.u32 %0, %0, %1;" : "+r"(r[5]) : "r"(tasm));

asm( "madc.hi.u32 %0, %1, %2, 0;" : "=r"(tasm) : "r"(op1[5*op1_interleaved]), "r"(op2[0*op2_interleaved]));

asm( "mad.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r[5]) : "r"(op1[5*op1_interleaved]), "r"(op2[0*op2_interleaved]));

asm( "addc.u32 %0, %0, 0;" : "+r"(tasm));

asm( "add.cc.u32 %0, %0, %1;" : "+r"(r[6]) : "r"(tasm));

asm( "madc.hi.u32 %0, %1, %2, 0;" : "=r"(tasm) : "r"(op1[6*op1_interleaved]), "r"(op2[0*op2_interleaved]));

asm( "mad.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r[6]) : "r"(op1[6*op1_interleaved]), "r"(op2[0*op2_interleaved]));

asm( "addc.u32 %0, %0, 0;" : "+r"(tasm));

asm( "add.cc.u32 %0, %0, %1;" : "+r"(r[7]) : "r"(tasm));

asm( "madc.hi.u32 %0, %1, %2, 0;" : "=r"(tasm) : "r"(op1[7*op1_interleaved]), "r"(op2[0*op2_interleaved]));

asm( "mad.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r[7]) : "r"(op1[7*op1_interleaved]), "r"(op2[0*op2_interleaved]));

asm( "addc.u32 %0, %0, 0;" : "+r"(tasm));

asm( "add.cc.u32 %0, %0, %1;" : "+r"(r[8]) : "r"(tasm));

asm( "madc.hi.u32 %0, %1, %2, 0;" : "=r"(tasm) : "r"(op1[8*op1_interleaved]), "r"(op2[0*op2_interleaved]));

asm( "mad.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r[8]) : "r"(op1[8*op1_interleaved]), "r"(op2[0*op2_interleaved]));

asm( "addc.u32 %0, %0, 0;" : "+r"(tasm));

asm( "add.cc.u32 %0, %0, %1;" : "+r"(r[9]) : "r"(tasm));

asm( "madc.hi.u32 %0, %1, %2, 0;" : "=r"(tasm) : "r"(op1[9*op1_interleaved]), "r"(op2[0*op2_interleaved]));

asm( "mad.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r[9]) : "r"(op1[9*op1_interleaved]), "r"(op2[0*op2_interleaved]));

asm( "addc.u32 %0, %0, 0;" : "+r"(tasm));

asm( "add.cc.u32 %0, %0, %1;" : "+r"(c0) : "r"(tasm) );

asm( "addc.u32 %0, 0, 0;" : "=r"(c1));

...

nvidia_word_t overflow;

asm ( "sub.cc.u32 %0, %0, %1;" : "+r"(r[0]) : "r"(nvidia_gfn_n[0]) );

asm ( "subc.cc.u32 %0, %0, %1;" : "+r"(r[1]) : "r"(nvidia_gfn_n[1]) );

asm ( "subc.cc.u32 %0, %0, %1;" : "+r"(r[2]) : "r"(nvidia_gfn_n[2]) );

asm ( "subc.cc.u32 %0, %0, %1;" : "+r"(r[3]) : "r"(nvidia_gfn_n[3]) );

asm ( "subc.cc.u32 %0, %0, %1;" : "+r"(r[4]) : "r"(nvidia_gfn_n[4]) );

asm ( "subc.cc.u32 %0, %0, %1;" : "+r"(r[5]) : "r"(nvidia_gfn_n[5]) );

asm ( "subc.cc.u32 %0, %0, %1;" : "+r"(r[6]) : "r"(nvidia_gfn_n[6]) );

asm ( "subc.cc.u32 %0, %0, %1;" : "+r"(r[7]) : "r"(nvidia_gfn_n[7]) );

asm ( "subc.cc.u32 %0, %0, %1;" : "+r"(r[8]) : "r"(nvidia_gfn_n[8]) );

asm ( "subc.cc.u32 %0, %0, %1;" : "+r"(r[9]) : "r"(nvidia_gfn_n[9]) );

asm ( "subc.u32 %0, %1, 0;" : "=r"(overflow) : "r"(c0) );

if (overflow!=0)

{

asm ( "add.cc.u32 %0, %0, %1;" : "+r"(r[0]) : "r"(nvidia_gfn_n[0]) );

asm ( "addc.cc.u32 %0, %0, %1;" : "+r"(r[1]) : "r"(nvidia_gfn_n[1]) );

asm ( "addc.cc.u32 %0, %0, %1;" : "+r"(r[2]) : "r"(nvidia_gfn_n[2]) );

asm ( "addc.cc.u32 %0, %0, %1;" : "+r"(r[3]) : "r"(nvidia_gfn_n[3]) );

asm ( "addc.cc.u32 %0, %0, %1;" : "+r"(r[4]) : "r"(nvidia_gfn_n[4]) );

asm ( "addc.cc.u32 %0, %0, %1;" : "+r"(r[5]) : "r"(nvidia_gfn_n[5]) );

asm ( "addc.cc.u32 %0, %0, %1;" : "+r"(r[6]) : "r"(nvidia_gfn_n[6]) );

asm ( "addc.cc.u32 %0, %0, %1;" : "+r"(r[7]) : "r"(nvidia_gfn_n[7]) );

asm ( "addc.cc.u32 %0, %0, %1;" : "+r"(r[8]) : "r"(nvidia_gfn_n[8]) );

asm ( "addc.u32 %0, %0, %1;" : "+r"(r[9]) : "r"(nvidia_gfn_n[9]));

}

rop[0*rop_interleaved]=r[0];

rop[1*rop_interleaved]=r[1];

rop[2*rop_interleaved]=r[2];

rop[3*rop_interleaved]=r[3];

rop[4*rop_interleaved]=r[4];

rop[5*rop_interleaved]=r[5];

rop[6*rop_interleaved]=r[6];

rop[7*rop_interleaved]=r[7];

rop[8*rop_interleaved]=r[8];

rop[9*rop_interleaved]=r[9];

}