Bitcoin Forum
May 05, 2024, 02:01:49 PM *
News: Latest Bitcoin Core release: 27.0 [Torrent]
 
   Home   Help Search Login Register More  
Pages: « 1 [2] 3 »  All
  Print  
Author Topic: secp256k1 library in pure assembly  (Read 1054 times)
ecdsa123 (OP)
Full Member
***
Offline Offline

Activity: 211
Merit: 105

Dr WHO on disney+


View Profile
December 03, 2022, 07:33:02 PM
Last edit: December 03, 2022, 09:10:10 PM by ecdsa123
 #21

why you will not use xmm and xmmword? it is 128 bit? less code, less operation, fastest

when I finish i will upload.

the problem is with modulo for my self:)


try something like:

Code:
    %define arg1f XMM0
    %define arg2f XMM1
    %define arg3f XMM2
    %define arg4f XMM3
    %define arg1 RDI
    %define arg2 RSI
    %define arg3 RDX
    %define arg4 RC
    %define arg5 R8
    %define arg6 R9


%macro MULT 1,2,3
;Multi XMM0:XMM1:XMM4:XMM5 by XMM2:XMM3:XMM6:XMM7
        movups arg3f,[%1+%3]
        movaps xmm7,arg3f

        mulps  arg3f,arg1f
        movshdup    arg4f, arg3f
        addps       arg3f, arg4f
        movaps xmm6,xmm7
        movhlps     arg4f, arg3f
        addss       arg3f, arg4f
        movss  [%2+%3], arg3f;

        mulps  xmm6,arg2f
        movshdup    arg4f, xmm6
        addps       xmm6, arg4f
        movaps arg3f,xmm7
        movhlps     arg4f, xmm6
        addss       xmm6, arg4f
        movss  [%2+4+%3], xmm6

        mulps  arg3f,xmm4
        movshdup    arg4f, arg3f
        addps       arg3f, arg4f
        movaps xmm6,xmm7
        movhlps     arg4f, arg3f
        addss       arg3f, arg4f
        movss  [%2+8+%3], arg3f

        mulps  xmm6,xmm5
        movshdup    arg4f, xmm6
        addps       xmm6, arg4f
        movhlps     arg4f, xmm6
        addss       xmm6, arg4f
        movss  [%2+8+4+%3], xmm6

%endmacro

Donate: bc1q0sezldfgm7rf2r78p5scasrrcfkpzxnrfcvdc6

Subscribe : http://www.youtube.com/@Ecdsa_Solutions
1714917709
Hero Member
*
Offline Offline

Posts: 1714917709

View Profile Personal Message (Offline)

Ignore
1714917709
Reply with quote  #2

1714917709
Report to moderator
1714917709
Hero Member
*
Offline Offline

Posts: 1714917709

View Profile Personal Message (Offline)

Ignore
1714917709
Reply with quote  #2

1714917709
Report to moderator
Advertised sites are not endorsed by the Bitcoin Forum. They may be unsafe, untrustworthy, or illegal in your jurisdiction.
1714917709
Hero Member
*
Offline Offline

Posts: 1714917709

View Profile Personal Message (Offline)

Ignore
1714917709
Reply with quote  #2

1714917709
Report to moderator
1714917709
Hero Member
*
Offline Offline

Posts: 1714917709

View Profile Personal Message (Offline)

Ignore
1714917709
Reply with quote  #2

1714917709
Report to moderator
1714917709
Hero Member
*
Offline Offline

Posts: 1714917709

View Profile Personal Message (Offline)

Ignore
1714917709
Reply with quote  #2

1714917709
Report to moderator
NotATether
Legendary
*
Offline Offline

Activity: 1596
Merit: 6728


bitcoincleanup.com / bitmixlist.org


View Profile WWW
December 04, 2022, 04:57:42 PM
 #22

why you will not use xmm and xmmword? it is 128 bit? less code, less operation, fastest

when I finish i will upload.

the problem is with modulo for my self:)


try something like:

Code:
    %define arg1f XMM0
    %define arg2f XMM1
    %define arg3f XMM2
    %define arg4f XMM3
    %define arg1 RDI
    %define arg2 RSI
    %define arg3 RDX
    %define arg4 RC
    %define arg5 R8
    %define arg6 R9


%macro MULT 1,2,3
;Multi XMM0:XMM1:XMM4:XMM5 by XMM2:XMM3:XMM6:XMM7
        movups arg3f,[%1+%3]
        movaps xmm7,arg3f

        mulps  arg3f,arg1f
        movshdup    arg4f, arg3f
        addps       arg3f, arg4f
        movaps xmm6,xmm7
        movhlps     arg4f, arg3f
        addss       arg3f, arg4f
        movss  [%2+%3], arg3f;

        mulps  xmm6,arg2f
        movshdup    arg4f, xmm6
        addps       xmm6, arg4f
        movaps arg3f,xmm7
        movhlps     arg4f, xmm6
        addss       xmm6, arg4f
        movss  [%2+4+%3], xmm6

        mulps  arg3f,xmm4
        movshdup    arg4f, arg3f
        addps       arg3f, arg4f
        movaps xmm6,xmm7
        movhlps     arg4f, arg3f
        addss       arg3f, arg4f
        movss  [%2+8+%3], arg3f

        mulps  xmm6,xmm5
        movshdup    arg4f, xmm6
        addps       xmm6, arg4f
        movhlps     arg4f, xmm6
        addss       xmm6, arg4f
        movss  [%2+8+4+%3], xmm6

%endmacro

Are you saving some bits at the end of each dword for the carry? Otherwise you're going to lose accuracy in the final answer, because SIMD stuff will just overflow instead of carry.

I was thinking of applying the excellent technique of 5x52-bit numbers used in libsecp256k1, where we use 5 quadwords that each hold 52 bits of the real number each (except for the most significant qword which holds less). There's also a 10x26-bit variant for 32-bit machines.

If that technique is applied, we can defer the carry and add the numbers hundreds of times without corruption.

We would only need 3 SSE adds, or 2 AVX adds in that case. But I have heard that using the AVX instructions incurs some kind of speed penalty like AVX-512?

An alternative is just stuffing them in the R8-R15 registers and RAX/RBX for the 5 quadwords for both operands and then clobber one of the operands with the sum.

.
.BLACKJACK ♠ FUN.
█████████
██████████████
████████████
█████████████████
████████████████▄▄
░█████████████▀░▀▀
██████████████████
░██████████████
████████████████
░██████████████
████████████
███████████████░██
██████████
CRYPTO CASINO &
SPORTS BETTING
▄▄███████▄▄
▄███████████████▄
███████████████████
█████████████████████
███████████████████████
█████████████████████████
█████████████████████████
█████████████████████████
███████████████████████
█████████████████████
███████████████████
▀███████████████▀
█████████
.
j2002ba2
Full Member
***
Offline Offline

Activity: 204
Merit: 437


View Profile
December 05, 2022, 05:48:07 PM
 #23

Ok, lets to to write it by parts using openAI

Code:
...
    addq num2(,%rax,8), %rdx
...
As usual, the so called AI gave wrong result. The carry flag is completely ignored, so instead it adds two vectors, each having four 64bit numbers.
NotATether
Legendary
*
Offline Offline

Activity: 1596
Merit: 6728


bitcoincleanup.com / bitmixlist.org


View Profile WWW
December 06, 2022, 03:39:53 AM
 #24

Ok, lets to to write it by parts using openAI

Code:
...
    addq num2(,%rax,8), %rdx
...
As usual, the so called AI gave wrong result. The carry flag is completely ignored, so instead it adds two vectors, each having four 64bit numbers.

This is a recurring pattern I've seen in the AI-generated results. Nearly all of them have comments, so I know that the ones that don't talk about the point additions expressions are wrong, and there are also samples where I saw arithmetic using just "eax" instead of the necessary 4/5 64-bit registers.

There were instances where I got merely 256-bit number addition instead of point addition.

.
.BLACKJACK ♠ FUN.
█████████
██████████████
████████████
█████████████████
████████████████▄▄
░█████████████▀░▀▀
██████████████████
░██████████████
████████████████
░██████████████
████████████
███████████████░██
██████████
CRYPTO CASINO &
SPORTS BETTING
▄▄███████▄▄
▄███████████████▄
███████████████████
█████████████████████
███████████████████████
█████████████████████████
█████████████████████████
█████████████████████████
███████████████████████
█████████████████████
███████████████████
▀███████████████▀
█████████
.
ecdsa123 (OP)
Full Member
***
Offline Offline

Activity: 211
Merit: 105

Dr WHO on disney+


View Profile
December 21, 2022, 07:42:25 PM
 #25

I have done implement parts of secp256k1 in pure asm

I would like to inform that perofrmence is fucking fast (for me):

100 000 000 performs modulo n on secp256k1 vals:

ubuntu@ubuntu2004:~/Downloads/ma$ nasm -f elf64 main.asm
ubuntu@ubuntu2004:~/Downloads/ma$ ld -s -o main main.o
ubuntu@ubuntu2004:~/Downloads/ma$ time ./main

real   0m4.856s
user   0m4.850s
sys   0m0.005s
ubuntu@ubuntu2004:~/Downloads/ma$

Donate: bc1q0sezldfgm7rf2r78p5scasrrcfkpzxnrfcvdc6

Subscribe : http://www.youtube.com/@Ecdsa_Solutions
albert0bsd
Hero Member
*****
Offline Offline

Activity: 853
Merit: 662



View Profile WWW
December 22, 2022, 02:30:38 AM
Merited by PowerGlove (1)
 #26

I have done implement parts of secp256k1 in pure asm

I would like to inform that perofrmence is fucking fast (for me):

100 000 000 performs modulo n on secp256k1 vals:

Can you explain what kind of numbers are those 100 Million inputs. What is your hardware and against what other implementation are you comparing your results?

NotATether
Legendary
*
Offline Offline

Activity: 1596
Merit: 6728


bitcoincleanup.com / bitmixlist.org


View Profile WWW
December 22, 2022, 09:51:35 AM
 #27

And moreover VanitySeacrh is only fast at "Point pub = secp256k1->ComputePublicKey(&privKey);" since it generates a "Point GTable[256*32];"
to be used in "Q = Add2(Q, GTable[256 * i + (b-1)])" having Jacobian Coordinates representation as calculation scheme.
If you take Add2 to add one million points in a sequence it is faster than classic point_addition with inversion by far.
But to use any Jacobian point after calculation further you need to use "Q.Reduce();" beforehand the same thing as inversion.
And in that situation classic scheme using GMP becomes faster since you just add point and can use it further immediately without any Reduce().

That's a sort of tradeoff you'll have when using data structures which can hold additional overflow at the cost of requiring a "rebuild" later. But most applications that do many hetrogeneous arithmetic ops should look into writing a function to sort of do it all at once with some optimization if possible. Just so that you can retain the benefit of using overflow structures.

.
.BLACKJACK ♠ FUN.
█████████
██████████████
████████████
█████████████████
████████████████▄▄
░█████████████▀░▀▀
██████████████████
░██████████████
████████████████
░██████████████
████████████
███████████████░██
██████████
CRYPTO CASINO &
SPORTS BETTING
▄▄███████▄▄
▄███████████████▄
███████████████████
█████████████████████
███████████████████████
█████████████████████████
█████████████████████████
█████████████████████████
███████████████████████
█████████████████████
███████████████████
▀███████████████▀
█████████
.
NotATether
Legendary
*
Offline Offline

Activity: 1596
Merit: 6728


bitcoincleanup.com / bitmixlist.org


View Profile WWW
December 22, 2022, 12:59:05 PM
 #28

And moreover VanitySeacrh is only fast at "Point pub = secp256k1->ComputePublicKey(&privKey);" since it generates a "Point GTable[256*32];"
to be used in "Q = Add2(Q, GTable[256 * i + (b-1)])" having Jacobian Coordinates representation as calculation scheme.
If you take Add2 to add one million points in a sequence it is faster than classic point_addition with inversion by far.
But to use any Jacobian point after calculation further you need to use "Q.Reduce();" beforehand the same thing as inversion.
And in that situation classic scheme using GMP becomes faster since you just add point and can use it further immediately without any Reduce().

That's a sort of tradeoff you'll have when using data structures which can hold additional overflow at the cost of requiring a "rebuild" later. But most applications that do many hetrogeneous arithmetic ops should look into writing a function to sort of do it all at once with some optimization if possible. Just so that you can retain the benefit of using overflow structures.


Different projective points representations boost scalar multiplication only but not points addition.
That is all very interesting. But can you point to some source code to see that in action.
In case with GMP performance boost comes when you put local variables of functions to global scope.
So that you initialize them at the start and clear in the end. And functions use them directly for reading and writing throughout the program run.


Libsecp256k1 that is used in Bitcoin Core has two special-purpose structs for adding & multiplying a point many times before requiring a rebuild. https://github.com/bitcoin-core/secp256k1/blob/master/src/field_5x52.h and https://github.com/bitcoin-core/secp256k1/blob/master/src/field_10x26.h

These are special because there is no intermediate "modulus" during any of the operations, so they are quite fast. Unfortunately these functions are only correct for public keys (as the modulus is hardcoded), and you'd need to write a different set of functions to operate with private keys.

What I'm thinking is: Imagine if you need to perform the following (arbitrary) sequence:

E = a*G + b*G + c*G + d*G

I am multiplying 4 times and adding three times.

Now we could do (a+b+c+d)*G, and since these functions can last for a number of ops without a rebuild, I should be safe doing a Montgomery ladder for the multiplication by G, with this sum. Instead of multiplying 4 times I only multiply one time.

Now let's look at something I just pasted from JeanLucPons Kangaroo:

Code:
      dy.ModSub(p2y,p1y);
      _s.ModMulK1(&dy,&dx[g]);
      _p.ModSquareK1(&_s);

      rx.ModSub(&_p,p1x);
      rx.ModSub(p2x);

      ry.ModSub(p2x,&rx);
      ry.ModMulK1(&_s);
      ry.ModSub(p2y);

It roughly translates to (((p2y-p1y) * dx)^2 - p1x - p2x) = rx
and (p2x - (((p2y-p1y) * dx)^2 - p1x - p2x)) * ((p2y-p1y) * dx) - p2y = ry

That's 6 subtractions, 3 multiplications, and one squaring in total.

What if there were a way to optimize this calculation through bit shifts and the like, just like how libsecp256k1 multiplies two numbers with not only muls and adds but also bitwise arithmetic?

We have a pattern: (a - b) * c which occurs twice in this calculation in total.

Rather than consuming a subtraction and multiplication, I wonder if there was a way to calculate them at once (not just concatenate them together).

That would be incredible.

.
.BLACKJACK ♠ FUN.
█████████
██████████████
████████████
█████████████████
████████████████▄▄
░█████████████▀░▀▀
██████████████████
░██████████████
████████████████
░██████████████
████████████
███████████████░██
██████████
CRYPTO CASINO &
SPORTS BETTING
▄▄███████▄▄
▄███████████████▄
███████████████████
█████████████████████
███████████████████████
█████████████████████████
█████████████████████████
█████████████████████████
███████████████████████
█████████████████████
███████████████████
▀███████████████▀
█████████
.
arulbero
Legendary
*
Offline Offline

Activity: 1915
Merit: 2074


View Profile
December 22, 2022, 05:33:57 PM
Last edit: December 22, 2022, 05:45:19 PM by arulbero
 #29


Now let's look at something I just pasted from JeanLucPons Kangaroo:

Code:
      dy.ModSub(p2y,p1y);
      _s.ModMulK1(&dy,&dx[g]);
      _p.ModSquareK1(&_s);

      rx.ModSub(&_p,p1x);
      rx.ModSub(p2x);

      ry.ModSub(p2x,&rx);
      ry.ModMulK1(&_s);
      ry.ModSub(p2y);

It roughly translates to (((p2y-p1y) * dx)^2 - p1x - p2x) = rx
and (p2x - (((p2y-p1y) * dx)^2 - p1x - p2x)) * ((p2y-p1y) * dx) - p2y = ry

That's 6 subtractions, 3 multiplications, and one squaring in total.

What if there were a way to optimize this calculation through bit shifts and the like, just like how libsecp256k1 multiplies two numbers with not only muls and adds but also bitwise arithmetic?

We have a pattern: (a - b) * c which occurs twice in this calculation in total.

Rather than consuming a subtraction and multiplication, I wonder if there was a way to calculate them at once (not just concatenate them together).


The point addition formula is:

Code:
Point Addition for X₁ ≠ X₂             #


            s = (y2 - y1) / (x2 - x1)
            x3 = s ** 2 - x1 - x2
            y3 = s * (x1 - x3) - y1

If you have to generate a batch of public keys with same distance, for example B =  10*G, 15*G, 20*G, 25*G ...

you can generate only x coordinates (y coordinates are not necesssary) and compute only 1 inversion for the entire batch B; you don't need Montgomery ladder, projective coordinates, 5x52 limbs, ...

To generate 1000 public keys you need to compute:

1000 squares   -->  s**2
1000 multiplications   (y2-y1) * 1/(x2-x1)
1 inversion
1500 multiplications --> to get 1/(x2-x1)

tot:   2500 mult + 1000 sqr + 1 inv to compute 1000 public keys ->  2.5 mul + 1 sqr + 4 subtractions for each key.
ecdsa123 (OP)
Full Member
***
Offline Offline

Activity: 211
Merit: 105

Dr WHO on disney+


View Profile
December 22, 2022, 05:39:17 PM
 #30

I have done implement parts of secp256k1 in pure asm

I would like to inform that perofrmence is fucking fast (for me):

100 000 000 performs modulo n on secp256k1 vals:

Can you explain what kind of numbers are those 100 Million inputs. What is your hardware and against what other implementation are you comparing your results?


so: test was performed on:
Intel Core i5-1240P  - without multithreading - on just one core.
Power set up on balance (not maximum performance)

test has been done on:

xor rcx,rcx
_loop:
x*x mod n
so first multiply x by x ( x is value Point of generator Secp256k1 ( i mean the x value of G(x,y)) then mod n
so 256 bit *256 bit value then mod n

inc rcx
cmp rcx,100 000 000
jne _loop
 

Donate: bc1q0sezldfgm7rf2r78p5scasrrcfkpzxnrfcvdc6

Subscribe : http://www.youtube.com/@Ecdsa_Solutions
ecdsa123 (OP)
Full Member
***
Offline Offline

Activity: 211
Merit: 105

Dr WHO on disney+


View Profile
December 22, 2022, 07:20:42 PM
 #31


Now let's look at something I just pasted from JeanLucPons Kangaroo:

Code:
      dy.ModSub(p2y,p1y);
      _s.ModMulK1(&dy,&dx[g]);
      _p.ModSquareK1(&_s);

      rx.ModSub(&_p,p1x);
      rx.ModSub(p2x);

      ry.ModSub(p2x,&rx);
      ry.ModMulK1(&_s);
      ry.ModSub(p2y);

It roughly translates to (((p2y-p1y) * dx)^2 - p1x - p2x) = rx
and (p2x - (((p2y-p1y) * dx)^2 - p1x - p2x)) * ((p2y-p1y) * dx) - p2y = ry

That's 6 subtractions, 3 multiplications, and one squaring in total.



What if there were a way to optimize this calculation through bit shifts and the like, just like how libsecp256k1 multiplies two numbers with not only muls and adds but also bitwise arithmetic?

We have a pattern: (a - b) * c which occurs twice in this calculation in total.

Rather than consuming a subtraction and multiplication, I wonder if there was a way to calculate them at once (not just concatenate them together).


The point addition formula is:

Code:
Point Addition for X₁ ≠ X₂             #


            s = (y2 - y1) / (x2 - x1)
            x3 = s ** 2 - x1 - x2
            y3 = s * (x1 - x3) - y1

If you have to generate a batch of public keys with same distance, for example B =  10*G, 15*G, 20*G, 25*G ...

you can generate only x coordinates (y coordinates are not necesssary) and compute only 1 inversion for the entire batch B; you don't need Montgomery ladder, projective coordinates, 5x52 limbs, ...

To generate 1000 public keys you need to compute:

1000 squares   -->  s**2
1000 multiplications   (y2-y1) * 1/(x2-x1)
1 inversion
1500 multiplications --> to get 1/(x2-x1)

tot:   2500 mult + 1000 sqr + 1 inv to compute 1000 public keys ->  2.5 mul + 1 sqr + 4 subtractions for each key.

this is intresting

Donate: bc1q0sezldfgm7rf2r78p5scasrrcfkpzxnrfcvdc6

Subscribe : http://www.youtube.com/@Ecdsa_Solutions
NotATether
Legendary
*
Offline Offline

Activity: 1596
Merit: 6728


bitcoincleanup.com / bitmixlist.org


View Profile WWW
December 23, 2022, 04:05:06 AM
 #32

The point addition formula is:

Code:
Point Addition for X_1 != X_2             #


            s = (y2 - y1) / (x2 - x1)
            x3 = s ** 2 - x1 - x2
            y3 = s * (x1 - x3) - y1

If you have to generate a batch of public keys with same distance, for example B =  10*G, 15*G, 20*G, 25*G ...

you can generate only x coordinates (y coordinates are not necesssary) and compute only 1 inversion for the entire batch B; you don't need Montgomery ladder, projective coordinates, 5x52 limbs, ...

To generate 1000 public keys you need to compute:

1000 squares   -->  s**2
1000 multiplications   (y2-y1) * 1/(x2-x1)
1 inversion
1500 multiplications --> to get 1/(x2-x1)

tot:   2500 mult + 1000 sqr + 1 inv to compute 1000 public keys ->  2.5 mul + 1 sqr + 4 subtractions for each key.

It is not clear to me to what this 1 inversion would be applied to. At a first glance, I only see the (x2 - x1) being inverted, but it's not clear how you'd do the inversion last, or only once, because x1 && x2 will usually not both be constant.

.
.BLACKJACK ♠ FUN.
█████████
██████████████
████████████
█████████████████
████████████████▄▄
░█████████████▀░▀▀
██████████████████
░██████████████
████████████████
░██████████████
████████████
███████████████░██
██████████
CRYPTO CASINO &
SPORTS BETTING
▄▄███████▄▄
▄███████████████▄
███████████████████
█████████████████████
███████████████████████
█████████████████████████
█████████████████████████
█████████████████████████
███████████████████████
█████████████████████
███████████████████
▀███████████████▀
█████████
.
arulbero
Legendary
*
Offline Offline

Activity: 1915
Merit: 2074


View Profile
December 23, 2022, 07:56:04 AM
Merited by NotATether (5)
 #33

It is not clear to me to what this 1 inversion would be applied to. At a first glance, I only see the (x2 - x1) being inverted, but it's not clear how you'd do the inversion last, or only once, because x1 && x2 will usually not both be constant.

Of course you have to get many different inversions, 1/(x2-x1), 1/(x2'-x1') and so on, but you can get these values by computing only one inversion for the entire group.

The idea is:

imagine you need 1/a and 1/b

you don't want to compute 2 inversions, because inversion is a very expensive operation.

Then:

1) compute a*b        -> 1 multiplication
2) compute 1/(a*b)   -> 1 inversion, you invert only a*b
3) compute 1((a*b) * b = 1/a  -> 1 multiplication
4) compute 1/(a*b) * a = 1/b  -> 1 multiplication

now you got 2 inversions (1/a and 1/b) computing only 3 multiplications and 1 'real' inversion.

If you have more than 2 elements, for example a,b,c,d,e

1) compute a*b                  -> 1 mul  (you have to store this result)
2) compute (a*b)*c            -> 1 mul  (you have to store this result)
3) compute (a*b*c)*d        -> 1 mul  (you have to store this result)
4) compute (a*b*c*d)*e    ->  1mul   (you have to store this result)
5) compute 1/(a*b*c*d*e) -> 1 inv

--> about 1 mul for each element + 1 inversion  -> (n-1)mul + 1 inv


6) compute 1/(a*b*c*d*e) * (a*b*c*d) = 1/e  -> 1 mul
7) compute 1/(a*b*c*d*e) * e = 1/(a*b*c*d)  -> 1 mul

8 ) compute 1/(a*b*c*d) * (a*b*c)  = 1/d  -> 1 mul
9) compute 1/(a*b*c*d) * d = 1/(a*b*c)  -> 1 mul

10) compute 1/(a*b*c) * (a*b)  = 1/c  -> 1 mul
11) compute 1/(a*b*c) * c = 1/(a*b)   -> 1 mul

12) compute 1/(a*b) * (a)  = 1/b  -> 1 mul
13) compute 1/(a*b) * (b) = 1/a   -> 1 mul

->   2*(n-2) + 2 = 2*(n-1) mul


If you do the same with a very long batch,

you need then 3*(n-1) mul + only 1 real inversion, in this way you minimize the impact of the real inversion.

On average you can consider then 3 mul for each element to invert.

-------------------------------------------------------------------------------------------------------------------------------------------------------


The function that implements this idea in Vanitysearch is 'ModInv' :

https://github.com/JeanLucPons/VanitySearch/blob/master/IntGroup.cpp/#L35

first n-1 mul -> https://github.com/JeanLucPons/VanitySearch/blob/master/IntGroup.cpp/#L42-L44

the only one real inversion -> https://github.com/JeanLucPons/VanitySearch/blob/master/IntGroup.cpp/#L48

second and third n-1 mul -> https://github.com/JeanLucPons/VanitySearch/blob/master/IntGroup.cpp/#L50-L54


that function belongs to the class IntGroup:

https://github.com/JeanLucPons/VanitySearch/blob/master/IntGroup.h#L24


This trick works only if you know many 'x1' and 'x2' before starting the computation.
ecdsa123 (OP)
Full Member
***
Offline Offline

Activity: 211
Merit: 105

Dr WHO on disney+


View Profile
December 24, 2022, 09:59:45 AM
 #34


intresting things:

I'm plying in pure asm:


during multiply x by x , but combination of size of registers I got:

x is our X of Generator point.

Code:
SECTION .text
GLOBAL _start

_start:

call def_Point_Addition

    ; Terminate program
mov rax,1            ; 'exit' system call
mov rbx,0            ; exit with error code 0
int 80h


def_Point_Addition:           ;Point Addition for P1= P2   (tangent with slope)
mov512 num_1_512,x_256
mov512 num_2_512,x_256
mov rdi,num_1_512
mov rsi,num_2_512
call mul_u256s

call print_u512     ; print n1 after addition
call print_nl
ret

I get output:


buntu@ubuntu2004:/mnt/hgfs/plik/asm$ nasm -f elf64 proba.asm
ubuntu@ubuntu2004:/mnt/hgfs/plik/asm$ ld -s -o proba proba.o
ubuntu@ubuntu2004:/mnt/hgfs/plik/asm$ time ./proba
483ada7726a3c4655da4fbfc0e1108a8fd17b448a68554199c47d08ffb10d4b85dbbf79179d08fe 08f9a320029fccfe5423baf1bc4ca9debda56d7f7609edd60

real   0m0.004s
user   0m0.001s
sys   0m0.001s
ubuntu@ubuntu2004:/mnt/hgfs/plik/asm$

the first of 64 bytes are y of x!

I do not understand:)



Donate: bc1q0sezldfgm7rf2r78p5scasrrcfkpzxnrfcvdc6

Subscribe : http://www.youtube.com/@Ecdsa_Solutions
arulbero
Legendary
*
Offline Offline

Activity: 1915
Merit: 2074


View Profile
December 24, 2022, 11:23:28 AM
 #35

I have done implement parts of secp256k1 in pure asm

I would like to inform that performance is fucking fast (for me):

100 000 000 performs modulo n on secp256k1 vals:

ubuntu@ubuntu2004:~/Downloads/ma$ nasm -f elf64 main.asm
ubuntu@ubuntu2004:~/Downloads/ma$ ld -s -o main main.o
ubuntu@ubuntu2004:~/Downloads/ma$ time ./main

real   0m4.856s
user   0m4.850s
sys   0m0.005s
ubuntu@ubuntu2004:~/Downloads/ma$


Can you explain what kind of numbers are those 100 Million inputs. What is your hardware and against what other implementation are you comparing your results?


so: test was performed on:
Intel Core i5-1240P  - without multithreading - on just one core.
Power set up on balance (not maximum performance)

test has been done on:

xor rcx,rcx
_loop:
x*x mod n
so first multiply x by x ( x is value Point of generator Secp256k1 ( i mean the x value of G(x,y)) then mod n
so 256 bit *256 bit value then mod n

inc rcx
cmp rcx,100 000 000
jne _loop
 


I tried a similar test with my mul function:

Code:
int main(){

 uint64_t  x[4] =  {0x59f2815b16f81798, 0x029bfcdb2dce28d9, 0x55a06295ce870b07, 0x79be667ef9dcbbac};
 uint64_t  a[4] =  {0x59f2815b16f81798, 0x029bfcdb2dce28d9, 0x55a06295ce870b07, 0x79be667ef9dcbbac};


 uint64_t* ptrx = &x[0];
 uint64_t* ptra = &a[0];

 uint32_t i;

 for(i=0; i < 100000000; i++){
     mul(ptrx,ptra,ptrx);  //a*x -> x
 }
 

}


12th Gen Intel(R) Core(TM) i7-12700H  - without multithreading - on just one core.
Power set up on balance (not maximum performance)

$ gcc   -O2  main.c

$ time ./a.out

real   0m0,886s
user   0m0,882s
sys   0m0,005s
ecdsa123 (OP)
Full Member
***
Offline Offline

Activity: 211
Merit: 105

Dr WHO on disney+


View Profile
December 24, 2022, 12:10:55 PM
 #36

could you add mod n after mult?

i'm intresting to see the benchmark.
in your example we have only multiply without mod n

Donate: bc1q0sezldfgm7rf2r78p5scasrrcfkpzxnrfcvdc6

Subscribe : http://www.youtube.com/@Ecdsa_Solutions
arulbero
Legendary
*
Offline Offline

Activity: 1915
Merit: 2074


View Profile
December 24, 2022, 12:34:40 PM
Last edit: December 24, 2022, 01:29:53 PM by arulbero
Merited by vapourminer (1), n0nce (1), citb0in (1)
 #37

could you add mod n after mult?

i'm intresting to see the benchmark.
in your example we have only multiply without mod n

My mul function has mod too.

Code:
// compute a * b = (low. high)
#define MultiplyWordsLoHi(low, high, a, b) asm  ( "mulx  %2, %0, %1;" : "=r"(low), "=r"(high) :  "gr" (a), "d" (b) : "cc");

#define AccAdd4WordsBy4_wc(a0, a1, a2, a3, b0, b1, b2)   asm  ("addq %4, %0; adcx %5, %1; adcx %6, %2; adcq $0, %3;" : "+r"(a0), "+r"(a1), "+r"(a2), "+r"(a3) : "r"(b0), "r"(b1), "r"(b2) : "cc");
// (a0, a1, a2, a3) = (a0, a1, a2, a3) + (b0, b1, b2, 0) without carry

#define MulAcc(c, a0, a1, a, b) asm      ("mulx %3, %3, %4; addq %3, %1; adcq %4, %2; adcq $0, %0;" : "+r"(c), "+r"(a0), "+r"(a1), "=a"(a), "=d"(b) : "a"(a), "d"(b) : "cc");

#define MulAcc_11(a0, a1, c0, a, b)      asm ("mulx %3, %0, %1; addq %2, %0; adcq $0, %1;" : "+r"(a0), "+r"(a1): "r"(c0), "r"(a), "d"(b) : "cc");



// compute u*v mod p
inline void
mul(uint64_t *r, uint64_t *u, uint64_t *v) {
  
  uint64_t u0 = u[0];
  uint64_t u1 = u[1];
  uint64_t u2 = u[2];
  uint64_t u3 = u[3];


  uint64_t v0 = v[0];
  uint64_t v1 = v[1];
  uint64_t v2 = v[2];
  uint64_t v3 = v[3];

  uint64_t r0, r1, r2, r3, r4, r5, r6, r7;
  uint64_t z1, z2, z3, z4, z5, z6, z7, z8, z44, z66;

  z2 = z3 = z4 = z5 = z6 = z7 = z8 = r1 = r2 = r3 = r4 = r5 = r6 = r7 = 0;


  MultiplyWordsLoHi(r0, z1, u0, v0) //x1 --> r0 ok
  MultiplyWordsLoHi(z2, z3, u1, v0)
  MultiplyWordsLoHi(z4, z5, u2, v0)
  MultiplyWordsLoHi(z6, z7, u3, v0)
  AccAdd4WordsBy4_wc(z2, z4, z6, z7, z1, z3, z5)


  MulAcc_11(r1, z1, z2, u0, v1) //x1 --> r1 ok
  MultiplyWordsLoHi(z2, z3, u1, v1)
  MultiplyWordsLoHi(z44, z5, u2, v1)
  MultiplyWordsLoHi(z66, z8, u3, v1)
  AccAdd4WordsBy4_wc(z1, z3, z5, z8, z4, z6, z7)
  AccAdd4WordsBy4_wc(z2, z44, z66, z8, z1, z3, z5)

  
  MulAcc_11(r2, z1, z2, u0, v2) //x1 --> r2 ok
  MultiplyWordsLoHi(z2, z3, u1, v2)
  MultiplyWordsLoHi(z4, z5, u2, v2)
  MultiplyWordsLoHi(z6, z7, u3, v2)
  AccAdd4WordsBy4_wc(z1, z3, z5, z7, z44, z66, z8)
  AccAdd4WordsBy4_wc(z2, z4, z6, z7, z1, z3, z5)

  MulAcc_11(r3, z1, z2, u0, v3) //x1 --> r3 ok
  MultiplyWordsLoHi(r4, z3, u1, v3)
  MultiplyWordsLoHi(r5, z5, u2, v3)
  MultiplyWordsLoHi(r6, r7, u3, v3)
  AccAdd4WordsBy4_wc(z1, z3, z5, r7, z4, z6, z7)
  AccAdd4WordsBy4_wc(r4, r5, r6, r7, z1, z3, z5) //r4, r5, r6, r7 ok
  

  //Reduction, mod 2^256 - 0x1000003d1
  
  uint64_t p = 0x1000003d1;
  MulAcc_11(z1, z2, r0, r4, p)
  MultiplyWordsLoHi(z3, z4, r5, p)
  MultiplyWordsLoHi(z5, z6, r6, p)
  MultiplyWordsLoHi(z7, z8, r7, p)
  

  //MulAcc_11(z1, z2, r0, r4, p)
  AccAdd4WordsBy4_wc(z2, z4, z6, z8, r1, r2, r3)

  
  uint64_t c = 0;
  AccAdd4WordsBy4_wc(z3, z5, z7, z8, z2, z4, z6)
  MulAcc(c, z1, z3, p, z8)
  

  r[0] = z1;
  r[1] = z3;
 
  
  if(c == 1){

  asm (
     "addq $1, %0; adcq $0, %1; \n"
       : "=r" (z5), "=r" (z7)
       : : "cc");

  }
  
  r[2] = z5;
  r[3] = z7;
  
}

EDIT:


With 1 000 000 000 iterations:

mul without mod:

real   0m5,771s
user   0m5,770s
sys   0m0,000s



mul with mod:

real   0m8,438s
user   0m8,434s
sys   0m0,004s

Pieter Wuille
Legendary
*
qt
Offline Offline

Activity: 1072
Merit: 1174


View Profile WWW
December 26, 2022, 02:52:18 PM
Merited by vapourminer (1)
 #38

But since I use secp256k1 curve only for testing and research I do no care much for any of  possible vulnerabilities and attacks.

The safest (not necessary the fastest) secp256k1 is the one used in Bitcoin Core. But I don't use it because I keep getting wrong answers when I do arithmetic. Maybe the privkey bytes are not being filled correctly or something.

Perhaps you should open an issue or discussion topic (https://github.com/bitcoin-core/secp256k1/issues or https://github.com/bitcoin-core/secp256k1/discussions/categories/q-a) on the library, because that's definitely not supposed to happen.

I do Bitcoin stuff.
NotATether
Legendary
*
Offline Offline

Activity: 1596
Merit: 6728


bitcoincleanup.com / bitmixlist.org


View Profile WWW
December 26, 2022, 04:56:50 PM
 #39

But since I use secp256k1 curve only for testing and research I do no care much for any of  possible vulnerabilities and attacks.

The safest (not necessary the fastest) secp256k1 is the one used in Bitcoin Core. But I don't use it because I keep getting wrong answers when I do arithmetic. Maybe the privkey bytes are not being filled correctly or something.

Perhaps you should open an issue or discussion topic (https://github.com/bitcoin-core/secp256k1/issues or https://github.com/bitcoin-core/secp256k1/discussions/categories/q-a) on the library, because that's definitely not supposed to happen.

To update this thread with info I posted in the meantime, it appears to be that the multiplication has the secp256k1 characteristic modulus hardcoded into the algorithm, making it suitable only for public key multiplication.

I was thinking about making an adapted version of the multiplication algorithm that uses the curve order (subtracted from 2^256) in its place, so that private key multiplication is covered as well.

I haven't explored the 10x26 legs imply too deep, but I'm assuming it's a similar case.

Since you're here though, let me ask: Was the multiplication assembly (and possibly the C version of it in another file using int128) only intended for public keys?

.
.BLACKJACK ♠ FUN.
█████████
██████████████
████████████
█████████████████
████████████████▄▄
░█████████████▀░▀▀
██████████████████
░██████████████
████████████████
░██████████████
████████████
███████████████░██
██████████
CRYPTO CASINO &
SPORTS BETTING
▄▄███████▄▄
▄███████████████▄
███████████████████
█████████████████████
███████████████████████
█████████████████████████
█████████████████████████
█████████████████████████
███████████████████████
█████████████████████
███████████████████
▀███████████████▀
█████████
.
arulbero
Legendary
*
Offline Offline

Activity: 1915
Merit: 2074


View Profile
December 26, 2022, 06:48:47 PM
Last edit: December 26, 2022, 07:45:43 PM by arulbero
Merited by vapourminer (1)
 #40

it appears to be that the multiplication has the secp256k1 characteristic modulus hardcoded into the algorithm, making it suitable only for public key multiplication.

I was thinking about making an adapted version of the multiplication algorithm that uses the curve order (subtracted from 2^256) in its place, so that private key multiplication is covered as well.

I haven't explored the 10x26 legs imply too deep, but I'm assuming it's a similar case.

Since you're here though, let me ask: Was the multiplication assembly (and possibly the C version of it in another file using int128) only intended for public keys?

You have to distinguish between:

1) field operations (mod p, space of coordinates x and y):

https://github.com/bitcoin-core/secp256k1/tree/master/src/   all files with name : field*

and

2) scalar operations (mod n, space of private keys):

https://github.com/bitcoin-core/secp256k1/tree/master/src/  all files with name : scalar*


If you want to multiply 2 private keys (mod n):

you could use secp256k1_scalar_mul_512 (256bit * 256 bit -> 512 bit)  (assembly multiplication)

https://github.com/bitcoin-core/secp256k1/blob/master/src/scalar_4x64_impl.h#L587
Code:
inline void secp256k1_scalar_mul_512(uint64_t* l, const uint64_t *a,  const uint64_t *b) {
    
    //uint64_t l[8];
    
    __asm__ __volatile__ (
    /* Preload */
    "movq 0(%%rdi), %%r15\n"
    "movq 8(%%rdi), %%rbx\n"
    "movq 16(%%rdi), %%rcx\n"
    "movq 0(%%rdx), %%r11\n"
    "movq 8(%%rdx), %%r12\n"
    "movq 16(%%rdx), %%r13\n"
    "movq 24(%%rdx), %%r14\n"
    /* (rax,rdx) = a0 * b0 */
    "movq %%r15, %%rax\n"
    "mulq %%r11\n"
    /* Extract l0 */
    "movq %%rax, 0(%%rsi)\n"
    /* (r8,r9,r10) = (rdx) */
    "movq %%rdx, %%r8\n"
    "xorq %%r9, %%r9\n"
    "xorq %%r10, %%r10\n"
    /* (r8,r9,r10) += a0 * b1 */
    "movq %%r15, %%rax\n"
    "mulq %%r12\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    /* (r8,r9,r10) += a1 * b0 */
    "movq %%rbx, %%rax\n"
    "mulq %%r11\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    /* Extract l1 */
    "movq %%r8, 8(%%rsi)\n"
    "xorq %%r8, %%r8\n"
    /* (r9,r10,r8) += a0 * b2 */
    "movq %%r15, %%rax\n"
    "mulq %%r13\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* (r9,r10,r8) += a1 * b1 */
    "movq %%rbx, %%rax\n"
    "mulq %%r12\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* (r9,r10,r8) += a2 * b0 */
    "movq %%rcx, %%rax\n"
    "mulq %%r11\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* Extract l2 */
    "movq %%r9, 16(%%rsi)\n"
    "xorq %%r9, %%r9\n"
    /* (r10,r8,r9) += a0 * b3 */
    "movq %%r15, %%rax\n"
    "mulq %%r14\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    /* Preload a3 */
    "movq 24(%%rdi), %%r15\n"
    /* (r10,r8,r9) += a1 * b2 */
    "movq %%rbx, %%rax\n"
    "mulq %%r13\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    /* (r10,r8,r9) += a2 * b1 */
    "movq %%rcx, %%rax\n"
    "mulq %%r12\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    /* (r10,r8,r9) += a3 * b0 */
    "movq %%r15, %%rax\n"
    "mulq %%r11\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    /* Extract l3 */
    "movq %%r10, 24(%%rsi)\n"
    "xorq %%r10, %%r10\n"
    /* (r8,r9,r10) += a1 * b3 */
    "movq %%rbx, %%rax\n"
    "mulq %%r14\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    /* (r8,r9,r10) += a2 * b2 */
    "movq %%rcx, %%rax\n"
    "mulq %%r13\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    /* (r8,r9,r10) += a3 * b1 */
    "movq %%r15, %%rax\n"
    "mulq %%r12\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    /* Extract l4 */
    "movq %%r8, 32(%%rsi)\n"
    "xorq %%r8, %%r8\n"
    /* (r9,r10,r8) += a2 * b3 */
    "movq %%rcx, %%rax\n"
    "mulq %%r14\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* (r9,r10,r8) += a3 * b2 */
    "movq %%r15, %%rax\n"
    "mulq %%r13\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* Extract l5 */
    "movq %%r9, 40(%%rsi)\n"
    /* (r10,r8) += a3 * b3 */
    "movq %%r15, %%rax\n"
    "mulq %%r14\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    /* Extract l6 */
    "movq %%r10, 48(%%rsi)\n"
    /* Extract l7 */
    "movq %%r8, 56(%%rsi)\n"
    : "+d"(b)
    : "S"(l), "D"(a)
    : "rax", "rbx", "rcx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "cc", "memory");
}  

where for example

Code:
const uint64_t  a[4] =  {0x59f2815b16f81798, 0x029bfcdb2dce28d9, 0x55a06295ce870b07, 0x79be667ef9dcbbac};
const uint64_t  b[4] =  {0x9c47d08ffb10d4b8, 0xfd17b448a6855419, 0x5da4fbfc0e1108a8, 0x483ada7726a3c465};

uint64_t c[8] = {0};

secp256k1_scalar_mul_512(c, a, b);

printf("%016lx%016lx%016lx%016lx%016lx%016lx%016lx%016lx\n", c[7], c[6], c[5], c[4], c[3], c[2], c[1], c[0]);

Result:
225989dbbc349b6f319ca3eed777a46f55b1dc22e97af11261167d213c1f060d29520a21508989b06ed1194129efb1517cee385a708abe44718bc509775ad540

and then apply the function secp256k1_scalar_reduce_512 (from 512 bit to 256 bit mod n)

https://github.com/bitcoin-core/secp256k1/blob/master/src/scalar_4x64_impl.h#L274

to get the result mod n

https://github.com/bitcoin-core/secp256k1/blob/master/src/scalar_4x64_impl.h#L761-L765

Code:
805714a252d0c0b58910907e85b5b801fff610a36bdf46847a4bf5d9ae2d10ed

I got the same results with my function, with libsecp256k1 functions and with python3 too:

Code:
>>> n=0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141
>>> a=0x79be667ef9dcbbac55a06295ce870b07029bfcdb2dce28d959f2815b16f81798
>>> b=0x483ada7726a3c4655da4fbfc0e1108a8fd17b448a68554199c47d08ffb10d4b8
>>> hex(a*b)
'0x225989dbbc349b6f319ca3eed777a46f55b1dc22e97af11261167d213c1f060d29520a21508989b06ed1194129efb1517cee385a708abe44718bc509775ad540'
>>> hex(a*b % n)
'0x805714a252d0c0b58910907e85b5b801fff610a36bdf46847a4bf5d9ae2d10ed'
Pages: « 1 [2] 3 »  All
  Print  
 
Jump to:  

Powered by MySQL Powered by PHP Powered by SMF 1.1.19 | SMF © 2006-2009, Simple Machines Valid XHTML 1.0! Valid CSS!