I had another idea.

First we take a look at your code, then my proposal.

***********************************************************************************************************

***********************************************************************************************************

**FROM JACOBIAN TO AFFINE COORDINATES**Let's start from

static void hrdec_ge_set_all_gej_new(secp256k1_ge *r, const secp256k1_gej *a) Your code:

....

for (i = 1; i < 4096; i++) {

secp256k1_fe_mul(&az[i], &az[i - 1], &a[i].z);

}

secp256k1_fe_inv_var(&u, &az[--i]); // sets i to 4095 here <---- You perform only 1 inverse

while (--i) {

secp256k1_fe_mul(&az[i + 1], &az[i], &u);

secp256k1_fe_mul(&u, &u, &a[i + 1].z);

}

To avoid to compute 4096 inverse field elements, you are using the "Simultaneous inversion" algorithm

in this way you perform only 1 inverse + 3*4095M =

**1I + 12285M**. You trade each inverse for 3 multiplication.

My proposal: only 1,5 multiplication for each inverse --> 1 inverse + 1,5*4095M =

** 1I + 6144M ** (the details later)

Your code:

for (i = 0; i < 4096; i++) {

r[i].infinity = a[i].infinity;

secp256k1_ge_set_gej_zinv(&r[i], &a[i], &az[i]);

}

From secp256k1 (

https://github.com/bitcoin-core/secp256k1/blob/master/src/group_impl.h ):

static void secp256k1_ge_set_gej_zinv(secp256k1_ge *r, const secp256k1_gej *a, const secp256k1_fe *zi) {

75 secp256k1_fe zi2;

76 secp256k1_fe zi3;

77 secp256k1_fe_sqr(&zi2, zi); --> you can compute this only 2048 instead of 4096

78 secp256k1_fe_mul(&zi3, &zi2, zi) --> you can compute this only 2048 instead of 4096

79 secp256k1_fe_mul(&r->x, &a->x, &zi2);

80 secp256k1_fe_mul(&r->y, &a->y, &zi3);

81 r->infinity = a->infinity;

82 }

So you perform additional operations: (3M + 1S) * 4096 =

**11228M + 4096S** to pass from jacobian to affine coordinates

(X,Y,Z) --> (X/Z^2,Y/Z^3)

I think there is a way to perform only (3M + 1S)*2048 + 2M*2048 =

**10240M + 2048S****Total:** **1I + 23513M + 4096S** <->

**1I + 16384M + 2048S ** **(about -33%)**

***********************************************************************************************************

***********************************************************************************************************

Then let's look at the generation of uncompressed public keys:

Your code:

hrdec_mult_gen2(&batchpj[0], start);

for (i = 1; i < 4096; ++i) {

hrdec_gej_add_ge(&batchpj[i], &batchpj[i-1], &incr_a); // increment public key

}

.....

static void hrdec_mult_gen2(secp256k1_gej *r,

const uchar *seckey) {

secp256k1_gej o1;

secp256k1_gej s1, s2, s3, s4, s5, s6, s7, s8,

s9,s10,s11,s12,s13,s14,s15,s16;

secp256k1_gej_set_ge(&o1, &prec[ seckey[31]]);

secp256k1_gej_add_ge_var(&s1, &o1, &prec[ 256 + seckey[30]], NULL);

secp256k1_gej_set_ge(&o1, &prec[512 + seckey[29]]);

secp256k1_gej_add_ge_var(&s2, &o1, &prec[ 768 + seckey[28]], NULL);

secp256k1_gej_set_ge(&o1, &prec[1024 + seckey[27]]);

secp256k1_gej_add_ge_var(&s3, &o1, &prec[1280 + seckey[26]], NULL);

secp256k1_gej_set_ge(&o1, &prec[1536 + seckey[25]]);

secp256k1_gej_add_ge_var(&s4, &o1, &prec[1792 + seckey[24]], NULL);

secp256k1_gej_set_ge(&o1, &prec[2048 + seckey[23]]);

secp256k1_gej_add_ge_var(&s5, &o1, &prec[2304 + seckey[22]], NULL);

....

secp256k1_gej t1;

secp256k1_gej_add_var(&t1, &s1, &s2, NULL);

secp256k1_gej_add_var(&s1, &s3, &s4, NULL);

secp256k1_gej_add_var(&s2, &s5, &s6, NULL);

secp256k1_gej_add_var(&s3, &s7, &s8, NULL);

secp256k1_gej_add_var(&s4, &s9, &s10, NULL);

secp256k1_gej_add_var(&s5, &s11, &s12, NULL);

secp256k1_gej_add_var(&s6, &s13, &s14, NULL);

secp256k1_gej_add_var(&s7, &s15, &s16, NULL);

secp256k1_gej_add_var(&s8, &t1, &s1, NULL);

secp256k1_gej_add_var(&s1, &s2, &s3, NULL);

secp256k1_gej_add_var(&s2, &s4, &s5, NULL);

secp256k1_gej_add_var(&s3, &s6, &s7, NULL);

secp256k1_gej x1,x2;

secp256k1_gej_add_var(&x1, &s1, &s2, NULL);

secp256k1_gej_add_var(&x2, &s3, &s8, NULL);

secp256k1_gej_add_var(r, &x1, &x2, NULL);

}

From secp256k1 (

https://github.com/bitcoin-core/secp256k1/blob/master/src/group_impl.h )

static void secp256k1_gej_add_var(secp256k1_gej *r, const secp256k1_gej *a, const secp256k1_gej *b, secp256k1_fe *rzr) {

362 /* Operations: 12 mul, 4 sqr, 2 normalize, 12 mul_int/add/negate */

363 secp256k1_fe z22, z12, u1, u2, s1, s2, h, i, i2, h2, h3, t;

364

.....

So you are using the Point Addition (12M + 4S) (J+J --> J)

https://en.wikibooks.org/wiki/Cryptography/Prime_Curve/Jacobian_Coordinates#Point_Addition_.2812M_.2B_4S.29 instead of the more efficient : Mixed Addition (with affine point) (8M + 3S) (J+A --> J) (same link)

414 static void secp256k1_gej_add_ge_var(secp256k1_gej *r, const secp256k1_gej *a, const secp256k1_ge *b, secp256k1_fe *rzr) {

415 /* 8 mul, 3 sqr, 4 normalize, 12 mul_int/add/negate */

416 secp256k1_fe z12, u1, u2, s1, s2, h, i, i2, h2, h3, t;

Maybe there is a reason (if you compute kG, kG + G, (k+1)G + G, why G is not in affine coordinates?), I must admite I don't understand well these piece of code.

Do you use the same function (secp256k1_gej_add_var) in hrdec_gej_add_ge too?

for (i = 1; i < 4096; ++i) {

hrdec_gej_add_ge(&batchpj[i], &batchpj[i-1], &incr_a); // increment public key

}

In any case, I understand that you perform at least (8M +3S)*4096 op (or not?)

And it seems to me that you don't exploit at all the fact that the keys are consecutive (I'm sorry in advance if I'm wrong, I really can't read code in general, not only yours

)

***********************************************************************************

***********************************************************************************

My proposal: instead of generating k*G, k*G + G, (k+1)*G + G, ....

you could

1) precompute G, 2*G, 3*G, ...., 2048*G

2) then generate as first point of the batch: (k+2048)*G = k'*G (

~~jacobian coordinates -> (X1, Y1, Z1))~~ better (X1,Y1,1)

3) then generate the following couples:

k'*G+1*G , k'*G-1*G

k'*G+2*G, k'*G-2*G

.......................................

k'*G+m*G, k'*G-m*G

.......................................

k'*G+2048*G, k'*G-2048*G

In this way:

a) k'G is always equals to (X1,Y1,

~~Z1~~,1) (

~~jacobian~~ affine coordinates)

b) m*G = (X2,Y2,1) is always in affine coordinates!

c) k'*G - m*G = k'*G+(-m*G) and +m*G / -m*G have the same X2, and Y2 opposite

then you can use the same X2 in 2 operations (and the same inverse!)

**Mixed Addition (with affine point) in your case --> (**~~4M~~ 3M + 1,5S):

U1 = X1 (Z2=1)

U2 = X2*Z1^2 --> 1/2M (you have to compute Z1^2 only once in the batch, X2 is the same in +mG and -mG)

S1 = Y1

S2 = Y2*Z1^3 --> 1/2M (you have to compute Z1^3 only once in the batch, and remember that mG ->Y2, -mG -> -Y2)

H = U2 - U1 = X2*Z1^2 - X1 --> 0 M

R = S2 - S1 = Y2*Z1^3 - Y1 --> 0 M

X3 = R^2 - H^3 - 2*X1*H^2 --> (1 + 1/2)S + 2*1/2 M = 1,5S + 1M (H is the same each 2 addition)

Y3 = R*(X1*H^2 - X3) - Y1*H^3. --> 1M + 1/2M = 1,5M (Y1 is the same, H is the same each 2 addition )

Z3 = H*Z1 --> 1/2 M --> (Z3 is the same each 2 addition, then the inverse is the same too!)

return (X3, Y3, Z3)

Total:

~~4M~~ 3M + 1,5SWhat do you think about?