secp256k1 library in pure assembly

ecdsa123 (OP)

Full Member

Offline

Activity: 211
Merit: 105

Dr WHO on disney+

Re: secp256k1 library in pure assembly

December 03, 2022, 07:33:02 PM
Last edit: December 03, 2022, 09:10:10 PM by ecdsa123

#21

why you will not use xmm and xmmword? it is 128 bit? less code, less operation, fastest

when I finish i will upload.

the problem is with modulo for my self:)

try something like:

Code:

    %define arg1f XMM0
    %define arg2f XMM1
    %define arg3f XMM2
    %define arg4f XMM3
    %define arg1 RDI
    %define arg2 RSI
    %define arg3 RDX
    %define arg4 RC
    %define arg5 R8
    %define arg6 R9 


%macro MULT 1,2,3
;Multi XMM0:XMM1:XMM4:XMM5 by XMM2:XMM3:XMM6:XMM7
        movups arg3f,[%1+%3]
        movaps xmm7,arg3f

        mulps  arg3f,arg1f
        movshdup    arg4f, arg3f
        addps       arg3f, arg4f
        movaps xmm6,xmm7
        movhlps     arg4f, arg3f
        addss       arg3f, arg4f
        movss  [%2+%3], arg3f;

        mulps  xmm6,arg2f
        movshdup    arg4f, xmm6
        addps       xmm6, arg4f
        movaps arg3f,xmm7
        movhlps     arg4f, xmm6
        addss       xmm6, arg4f
        movss  [%2+4+%3], xmm6 

        mulps  arg3f,xmm4
        movshdup    arg4f, arg3f
        addps       arg3f, arg4f
        movaps xmm6,xmm7
        movhlps     arg4f, arg3f
        addss       arg3f, arg4f
        movss  [%2+8+%3], arg3f

        mulps  xmm6,xmm5
        movshdup    arg4f, xmm6
        addps       xmm6, arg4f
        movhlps     arg4f, xmm6
        addss       xmm6, arg4f
        movss  [%2+8+4+%3], xmm6

%endmacro

Donate: bc1q0sezldfgm7rf2r78p5scasrrcfkpzxnrfcvdc6

Subscribe : http://www.youtube.com/@Ecdsa_Solutions

NotATether

Legendary

Offline

Activity: 1596
Merit: 6728

bitcoincleanup.com / bitmixlist.org

Re: secp256k1 library in pure assembly

December 04, 2022, 04:57:42 PM

#22

Quote from: ecdsa123 on December 03, 2022, 07:33:02 PM

why you will not use xmm and xmmword? it is 128 bit? less code, less operation, fastest

when I finish i will upload.

the problem is with modulo for my self:)

try something like:

Code:

    %define arg1f XMM0
    %define arg2f XMM1
    %define arg3f XMM2
    %define arg4f XMM3
    %define arg1 RDI
    %define arg2 RSI
    %define arg3 RDX
    %define arg4 RC
    %define arg5 R8
    %define arg6 R9 


%macro MULT 1,2,3
;Multi XMM0:XMM1:XMM4:XMM5 by XMM2:XMM3:XMM6:XMM7
        movups arg3f,[%1+%3]
        movaps xmm7,arg3f

        mulps  arg3f,arg1f
        movshdup    arg4f, arg3f
        addps       arg3f, arg4f
        movaps xmm6,xmm7
        movhlps     arg4f, arg3f
        addss       arg3f, arg4f
        movss  [%2+%3], arg3f;

        mulps  xmm6,arg2f
        movshdup    arg4f, xmm6
        addps       xmm6, arg4f
        movaps arg3f,xmm7
        movhlps     arg4f, xmm6
        addss       xmm6, arg4f
        movss  [%2+4+%3], xmm6 

        mulps  arg3f,xmm4
        movshdup    arg4f, arg3f
        addps       arg3f, arg4f
        movaps xmm6,xmm7
        movhlps     arg4f, arg3f
        addss       arg3f, arg4f
        movss  [%2+8+%3], arg3f

        mulps  xmm6,xmm5
        movshdup    arg4f, xmm6
        addps       xmm6, arg4f
        movhlps     arg4f, xmm6
        addss       xmm6, arg4f
        movss  [%2+8+4+%3], xmm6

%endmacro

Are you saving some bits at the end of each dword for the carry? Otherwise you're going to lose accuracy in the final answer, because SIMD stuff will just overflow instead of carry.

I was thinking of applying the excellent technique of 5x52-bit numbers used in libsecp256k1, where we use 5 quadwords that each hold 52 bits of the real number each (except for the most significant qword which holds less). There's also a 10x26-bit variant for 32-bit machines.

If that technique is applied, we can defer the carry and add the numbers hundreds of times without corruption.

We would only need 3 SSE adds, or 2 AVX adds in that case. But I have heard that using the AVX instructions incurs some kind of speed penalty like AVX-512?

An alternative is just stuffing them in the R8-R15 registers and RAX/RBX for the 5 quadwords for both operands and then clobber one of the operands with the sum.

.
.^{BLACKJACK^♠FUN.}

███▄██████
██████████████▀
████████████
█████████████████
████████████████▄▄
░█████████████▀░▀▀
██████████████████
░██████████████
█████████████████▄
░██████████████▀
████████████
███████████████░██
██████████

CRYPTO CASINO &
SPORTS BETTING

│

.
REWARD SYSTEM
.ELITE SEAT..

│

▄▄███████▄▄
▄███████████████▄
███████████████████
█████████████████████
███████████████████████
█████████████████████████
█████████████████████████
█████████████████████████
███████████████████████
█████████████████████
███████████████████
▀███████████████▀
███████████████████

.

Whitepaper
JACK

j2002ba2

Full Member

Offline

Activity: 204
Merit: 437

Re: secp256k1 library in pure assembly

December 05, 2022, 05:48:07 PM

#23

Quote from: albert0bsd on December 03, 2022, 02:38:24 PM

Ok, lets to to write it by parts using openAI

Code:

...
    addq num2(,%rax,8), %rdx
...

As usual, the so called AI gave wrong result. The carry flag is completely ignored, so instead it adds two vectors, each having four 64bit numbers.

NotATether

Legendary

Offline

Activity: 1596
Merit: 6728

bitcoincleanup.com / bitmixlist.org

Re: secp256k1 library in pure assembly

December 06, 2022, 03:39:53 AM

#24

Quote from: j2002ba2 on December 05, 2022, 05:48:07 PM

Quote from: albert0bsd on December 03, 2022, 02:38:24 PM

Ok, lets to to write it by parts using openAI

Code:

...
    addq num2(,%rax,8), %rdx
...

As usual, the so called AI gave wrong result. The carry flag is completely ignored, so instead it adds two vectors, each having four 64bit numbers.

This is a recurring pattern I've seen in the AI-generated results. Nearly all of them have comments, so I know that the ones that don't talk about the point additions expressions are wrong, and there are also samples where I saw arithmetic using just "eax" instead of the necessary 4/5 64-bit registers.

There were instances where I got merely 256-bit number addition instead of point addition.

.
.^{BLACKJACK^♠FUN.}

███▄██████
██████████████▀
████████████
█████████████████
████████████████▄▄
░█████████████▀░▀▀
██████████████████
░██████████████
█████████████████▄
░██████████████▀
████████████
███████████████░██
██████████

CRYPTO CASINO &
SPORTS BETTING

│

.
REWARD SYSTEM
.ELITE SEAT..

│

▄▄███████▄▄
▄███████████████▄
███████████████████
█████████████████████
███████████████████████
█████████████████████████
█████████████████████████
█████████████████████████
███████████████████████
█████████████████████
███████████████████
▀███████████████▀
███████████████████

.

Whitepaper
JACK

ecdsa123 (OP)

Full Member

Offline

Activity: 211
Merit: 105

Dr WHO on disney+

Re: secp256k1 library in pure assembly

December 21, 2022, 07:42:25 PM

#25

I have done implement parts of secp256k1 in pure asm

I would like to inform that perofrmence is fucking fast (for me):

100 000 000 performs modulo n on secp256k1 vals:

ubuntu@ubuntu2004:~/Downloads/ma$ nasm -f elf64 main.asm
ubuntu@ubuntu2004:~/Downloads/ma$ ld -s -o main main.o
ubuntu@ubuntu2004:~/Downloads/ma$ time ./main

real   0m4.856s
user   0m4.850s
sys   0m0.005s
ubuntu@ubuntu2004:~/Downloads/ma$

Donate: bc1q0sezldfgm7rf2r78p5scasrrcfkpzxnrfcvdc6

Subscribe : http://www.youtube.com/@Ecdsa_Solutions

albert0bsd

Hero Member

Offline

Activity: 853
Merit: 662

Re: secp256k1 library in pure assembly

December 22, 2022, 02:30:38 AM

Merited by PowerGlove (1)

#26

Quote from: ecdsa123 on December 21, 2022, 07:42:25 PM

I have done implement parts of secp256k1 in pure asm

I would like to inform that perofrmence is fucking fast (for me):

100 000 000 performs modulo n on secp256k1 vals:

Can you explain what kind of numbers are those 100 Million inputs. What is your hardware and against what other implementation are you comparing your results?

If you value your BTC privacy just use Mixers
Universal Declaration of Human Rights; Article 12: No one shall be subjected to arbitrary interference with his privacy.

NotATether

Legendary

Offline

Activity: 1596
Merit: 6728

bitcoincleanup.com / bitmixlist.org

Re: secp256k1 library in pure assembly

December 22, 2022, 09:51:35 AM

#27

Quote from: AlexanderCurl on December 22, 2022, 05:17:29 AM

And moreover VanitySeacrh is only fast at "Point pub = secp256k1->ComputePublicKey(&privKey);" since it generates a "Point GTable[256*32];"
to be used in "Q = Add2(Q, GTable[256 * i + (b-1)])" having Jacobian Coordinates representation as calculation scheme.
If you take Add2 to add one million points in a sequence it is faster than classic point_addition with inversion by far.
But to use any Jacobian point after calculation further you need to use "Q.Reduce();" beforehand the same thing as inversion.
And in that situation classic scheme using GMP becomes faster since you just add point and can use it further immediately without any Reduce().

That's a sort of tradeoff you'll have when using data structures which can hold additional overflow at the cost of requiring a "rebuild" later. But most applications that do many hetrogeneous arithmetic ops should look into writing a function to sort of do it all at once with some optimization if possible. Just so that you can retain the benefit of using overflow structures.

.
.^{BLACKJACK^♠FUN.}

███▄██████
██████████████▀
████████████
█████████████████
████████████████▄▄
░█████████████▀░▀▀
██████████████████
░██████████████
█████████████████▄
░██████████████▀
████████████
███████████████░██
██████████

CRYPTO CASINO &
SPORTS BETTING

│

.
REWARD SYSTEM
.ELITE SEAT..

│

▄▄███████▄▄
▄███████████████▄
███████████████████
█████████████████████
███████████████████████
█████████████████████████
█████████████████████████
█████████████████████████
███████████████████████
█████████████████████
███████████████████
▀███████████████▀
███████████████████

.

Whitepaper
JACK

NotATether

Legendary

Offline

Activity: 1596
Merit: 6728

bitcoincleanup.com / bitmixlist.org

Re: secp256k1 library in pure assembly

December 22, 2022, 12:59:05 PM

#28

Quote from: AlexanderCurl on December 22, 2022, 10:47:07 AM

Quote from: NotATether on December 22, 2022, 09:51:35 AM

Quote from: AlexanderCurl on December 22, 2022, 05:17:29 AM

And moreover VanitySeacrh is only fast at "Point pub = secp256k1->ComputePublicKey(&privKey);" since it generates a "Point GTable[256*32];"
to be used in "Q = Add2(Q, GTable[256 * i + (b-1)])" having Jacobian Coordinates representation as calculation scheme.
If you take Add2 to add one million points in a sequence it is faster than classic point_addition with inversion by far.
But to use any Jacobian point after calculation further you need to use "Q.Reduce();" beforehand the same thing as inversion.
And in that situation classic scheme using GMP becomes faster since you just add point and can use it further immediately without any Reduce().

That's a sort of tradeoff you'll have when using data structures which can hold additional overflow at the cost of requiring a "rebuild" later. But most applications that do many hetrogeneous arithmetic ops should look into writing a function to sort of do it all at once with some optimization if possible. Just so that you can retain the benefit of using overflow structures.

Different projective points representations boost scalar multiplication only but not points addition.
That is all very interesting. But can you point to some source code to see that in action.
In case with GMP performance boost comes when you put local variables of functions to global scope.
So that you initialize them at the start and clear in the end. And functions use them directly for reading and writing throughout the program run.

Libsecp256k1 that is used in Bitcoin Core has two special-purpose structs for adding & multiplying a point many times before requiring a rebuild. https://github.com/bitcoin-core/secp256k1/blob/master/src/field_5x52.h and https://github.com/bitcoin-core/secp256k1/blob/master/src/field_10x26.h

These are special because there is no intermediate "modulus" during any of the operations, so they are quite fast. Unfortunately these functions are only correct for public keys (as the modulus is hardcoded), and you'd need to write a different set of functions to operate with private keys.

What I'm thinking is: Imagine if you need to perform the following (arbitrary) sequence:

E = a*G + b*G + c*G + d*G

I am multiplying 4 times and adding three times.

Now we could do (a+b+c+d)*G, and since these functions can last for a number of ops without a rebuild, I should be safe doing a Montgomery ladder for the multiplication by G, with this sum. Instead of multiplying 4 times I only multiply one time.

Now let's look at something I just pasted from JeanLucPons Kangaroo:

Code:

      dy.ModSub(p2y,p1y);
      _s.ModMulK1(&dy,&dx[g]);
      _p.ModSquareK1(&_s);

      rx.ModSub(&_p,p1x);
      rx.ModSub(p2x);

      ry.ModSub(p2x,&rx);
      ry.ModMulK1(&_s);
      ry.ModSub(p2y);

It roughly translates to (((p2y-p1y) * dx)^2 - p1x - p2x) = rx
and (p2x - (((p2y-p1y) * dx)^2 - p1x - p2x)) * ((p2y-p1y) * dx) - p2y = ry

That's 6 subtractions, 3 multiplications, and one squaring in total.

What if there were a way to optimize this calculation through bit shifts and the like, just like how libsecp256k1 multiplies two numbers with not only muls and adds but also bitwise arithmetic?

We have a pattern: (a - b) * c which occurs twice in this calculation in total.

Rather than consuming a subtraction and multiplication, I wonder if there was a way to calculate them at once (not just concatenate them together).

That would be incredible.

.
.^{BLACKJACK^♠FUN.}

███▄██████
██████████████▀
████████████
█████████████████
████████████████▄▄
░█████████████▀░▀▀
██████████████████
░██████████████
█████████████████▄
░██████████████▀
████████████
███████████████░██
██████████

CRYPTO CASINO &
SPORTS BETTING

│

.
REWARD SYSTEM
.ELITE SEAT..

│

▄▄███████▄▄
▄███████████████▄
███████████████████
█████████████████████
███████████████████████
█████████████████████████
█████████████████████████
█████████████████████████
███████████████████████
█████████████████████
███████████████████
▀███████████████▀
███████████████████

.

Whitepaper
JACK

arulbero

Legendary

Offline

Activity: 1915
Merit: 2074

Re: secp256k1 library in pure assembly

December 22, 2022, 05:33:57 PM
Last edit: December 22, 2022, 05:45:19 PM by arulbero

#29

Quote from: NotATether on December 22, 2022, 12:59:05 PM

Now let's look at something I just pasted from JeanLucPons Kangaroo:

Code:

      dy.ModSub(p2y,p1y);
      _s.ModMulK1(&dy,&dx[g]);
      _p.ModSquareK1(&_s);

      rx.ModSub(&_p,p1x);
      rx.ModSub(p2x);

      ry.ModSub(p2x,&rx);
      ry.ModMulK1(&_s);
      ry.ModSub(p2y);

It roughly translates to (((p2y-p1y) * dx)^2 - p1x - p2x) = rx
and (p2x - (((p2y-p1y) * dx)^2 - p1x - p2x)) * ((p2y-p1y) * dx) - p2y = ry

That's 6 subtractions, 3 multiplications, and one squaring in total.

What if there were a way to optimize this calculation through bit shifts and the like, just like how libsecp256k1 multiplies two numbers with not only muls and adds but also bitwise arithmetic?

We have a pattern: (a - b) * c which occurs twice in this calculation in total.

Rather than consuming a subtraction and multiplication, I wonder if there was a way to calculate them at once (not just concatenate them together).

The point addition formula is:

Code:

Point Addition for X₁ ≠ X₂             #


            s = (y2 - y1) / (x2 - x1)
            x3 = s ** 2 - x1 - x2
            y3 = s * (x1 - x3) - y1

If you have to generate a batch of public keys with same distance, for example B = 10*G, 15*G, 20*G, 25*G ...

you can generate only x coordinates (y coordinates are not necesssary) and compute only 1 inversion for the entire batch B; you don't need Montgomery ladder, projective coordinates, 5x52 limbs, ...

To generate 1000 public keys you need to compute:

1000 squares --> s**2
1000 multiplications (y2-y1) * 1/(x2-x1)
1 inversion
1500 multiplications --> to get 1/(x2-x1)

tot: 2500 mult + 1000 sqr + 1 inv to compute 1000 public keys -> 2.5 mul + 1 sqr + 4 subtractions for each key.

ecdsa123 (OP)

Full Member

Offline

Activity: 211
Merit: 105

Dr WHO on disney+

Re: secp256k1 library in pure assembly

December 22, 2022, 05:39:17 PM

#30

Quote from: albert0bsd on December 22, 2022, 02:30:38 AM

Quote from: ecdsa123 on December 21, 2022, 07:42:25 PM

I have done implement parts of secp256k1 in pure asm

I would like to inform that perofrmence is fucking fast (for me):

100 000 000 performs modulo n on secp256k1 vals:

Can you explain what kind of numbers are those 100 Million inputs. What is your hardware and against what other implementation are you comparing your results?

so: test was performed on:
Intel Core i5-1240P - without multithreading - on just one core.
Power set up on balance (not maximum performance)

test has been done on:

xor rcx,rcx
_loop:
x*x mod n
so first multiply x by x ( x is value Point of generator Secp256k1 ( i mean the x value of G(x,y)) then mod n
so 256 bit *256 bit value then mod n

inc rcx
cmp rcx,100 000 000
jne _loop

Donate: bc1q0sezldfgm7rf2r78p5scasrrcfkpzxnrfcvdc6

Subscribe : http://www.youtube.com/@Ecdsa_Solutions

ecdsa123 (OP)

Full Member

Offline

Activity: 211
Merit: 105

Dr WHO on disney+

Re: secp256k1 library in pure assembly

December 22, 2022, 07:20:42 PM

#31

Quote from: arulbero on December 22, 2022, 05:33:57 PM

Quote from: NotATether on December 22, 2022, 12:59:05 PM

Now let's look at something I just pasted from JeanLucPons Kangaroo:

Code:

      dy.ModSub(p2y,p1y);
      _s.ModMulK1(&dy,&dx[g]);
      _p.ModSquareK1(&_s);

      rx.ModSub(&_p,p1x);
      rx.ModSub(p2x);

      ry.ModSub(p2x,&rx);
      ry.ModMulK1(&_s);
      ry.ModSub(p2y);

It roughly translates to (((p2y-p1y) * dx)^2 - p1x - p2x) = rx
and (p2x - (((p2y-p1y) * dx)^2 - p1x - p2x)) * ((p2y-p1y) * dx) - p2y = ry

That's 6 subtractions, 3 multiplications, and one squaring in total.

What if there were a way to optimize this calculation through bit shifts and the like, just like how libsecp256k1 multiplies two numbers with not only muls and adds but also bitwise arithmetic?

We have a pattern: (a - b) * c which occurs twice in this calculation in total.

Rather than consuming a subtraction and multiplication, I wonder if there was a way to calculate them at once (not just concatenate them together).

The point addition formula is:

Code:

Point Addition for X₁ ≠ X₂             #


            s = (y2 - y1) / (x2 - x1)
            x3 = s ** 2 - x1 - x2
            y3 = s * (x1 - x3) - y1

If you have to generate a batch of public keys with same distance, for example B = 10*G, 15*G, 20*G, 25*G ...

you can generate only x coordinates (y coordinates are not necesssary) and compute only 1 inversion for the entire batch B; you don't need Montgomery ladder, projective coordinates, 5x52 limbs, ...

To generate 1000 public keys you need to compute:

1000 squares --> s**2
1000 multiplications (y2-y1) * 1/(x2-x1)
1 inversion
1500 multiplications --> to get 1/(x2-x1)

tot: 2500 mult + 1000 sqr + 1 inv to compute 1000 public keys -> 2.5 mul + 1 sqr + 4 subtractions for each key.

this is intresting

Donate: bc1q0sezldfgm7rf2r78p5scasrrcfkpzxnrfcvdc6

Subscribe : http://www.youtube.com/@Ecdsa_Solutions

NotATether

Legendary

Offline

Activity: 1596
Merit: 6728

bitcoincleanup.com / bitmixlist.org

Re: secp256k1 library in pure assembly

December 23, 2022, 04:05:06 AM

#32

Quote from: arulbero on December 22, 2022, 05:33:57 PM

The point addition formula is:

Code:

Point Addition for X_1 != X_2             #


            s = (y2 - y1) / (x2 - x1)
            x3 = s ** 2 - x1 - x2
            y3 = s * (x1 - x3) - y1

If you have to generate a batch of public keys with same distance, for example B = 10*G, 15*G, 20*G, 25*G ...

you can generate only x coordinates (y coordinates are not necesssary) and compute only 1 inversion for the entire batch B; you don't need Montgomery ladder, projective coordinates, 5x52 limbs, ...

To generate 1000 public keys you need to compute:

1000 squares --> s**2
1000 multiplications (y2-y1) * 1/(x2-x1)
1 inversion
1500 multiplications --> to get 1/(x2-x1)

tot: 2500 mult + 1000 sqr + 1 inv to compute 1000 public keys -> 2.5 mul + 1 sqr + 4 subtractions for each key.

It is not clear to me to what this 1 inversion would be applied to. At a first glance, I only see the (x2 - x1) being inverted, but it's not clear how you'd do the inversion last, or only once, because x1 && x2 will usually not both be constant.

.
.^{BLACKJACK^♠FUN.}

███▄██████
██████████████▀
████████████
█████████████████
████████████████▄▄
░█████████████▀░▀▀
██████████████████
░██████████████
█████████████████▄
░██████████████▀
████████████
███████████████░██
██████████

CRYPTO CASINO &
SPORTS BETTING

│

.
REWARD SYSTEM
.ELITE SEAT..

│

▄▄███████▄▄
▄███████████████▄
███████████████████
█████████████████████
███████████████████████
█████████████████████████
█████████████████████████
█████████████████████████
███████████████████████
█████████████████████
███████████████████
▀███████████████▀
███████████████████

.

Whitepaper
JACK

arulbero

Legendary

Offline

Activity: 1915
Merit: 2074

Re: secp256k1 library in pure assembly

December 23, 2022, 07:56:04 AM

Merited by NotATether (5)

#33

Quote from: NotATether on December 23, 2022, 04:05:06 AM

It is not clear to me to what this 1 inversion would be applied to. At a first glance, I only see the (x2 - x1) being inverted, but it's not clear how you'd do the inversion last, or only once, because x1 && x2 will usually not both be constant.

Of course you have to get many different inversions, 1/(x2-x1), 1/(x2'-x1') and so on, but you can get these values by computing only one inversion for the entire group.

The idea is:

imagine you need 1/a and 1/b

you don't want to compute 2 inversions, because inversion is a very expensive operation.

Then:

1) compute a*b -> 1 multiplication
2) compute 1/(a*b) -> 1 inversion, you invert only a*b
3) compute 1((a*b) * b = 1/a -> 1 multiplication
4) compute 1/(a*b) * a = 1/b -> 1 multiplication

now you got 2 inversions (1/a and 1/b) computing only 3 multiplications and 1 'real' inversion.

If you have more than 2 elements, for example a,b,c,d,e

1) compute a*b -> 1 mul (you have to store this result)
2) compute (a*b)*c -> 1 mul (you have to store this result)
3) compute (a*b*c)*d -> 1 mul (you have to store this result)
4) compute (a*b*c*d)*e -> 1mul (you have to store this result)
5) compute 1/(a*b*c*d*e) -> 1 inv

--> about 1 mul for each element + 1 inversion -> (n-1)mul + 1 inv

6) compute 1/(a*b*c*d*e) * (a*b*c*d) = 1/e -> 1 mul
7) compute 1/(a*b*c*d*e) * e = 1/(a*b*c*d) -> 1 mul

8 ) compute 1/(a*b*c*d) * (a*b*c) = 1/d -> 1 mul
9) compute 1/(a*b*c*d) * d = 1/(a*b*c) -> 1 mul

10) compute 1/(a*b*c) * (a*b) = 1/c -> 1 mul
11) compute 1/(a*b*c) * c = 1/(a*b) -> 1 mul

12) compute 1/(a*b) * (a) = 1/b -> 1 mul
13) compute 1/(a*b) * (b) = 1/a -> 1 mul

-> 2*(n-2) + 2 = 2*(n-1) mul

If you do the same with a very long batch,

you need then 3*(n-1) mul + only 1 real inversion, in this way you minimize the impact of the real inversion.

On average you can consider then 3 mul for each element to invert.

-------------------------------------------------------------------------------------------------------------------------------------------------------

The function that implements this idea in Vanitysearch is 'ModInv' :

https://github.com/JeanLucPons/VanitySearch/blob/master/IntGroup.cpp/#L35

first n-1 mul -> https://github.com/JeanLucPons/VanitySearch/blob/master/IntGroup.cpp/#L42-L44

the only one real inversion -> https://github.com/JeanLucPons/VanitySearch/blob/master/IntGroup.cpp/#L48

second and third n-1 mul -> https://github.com/JeanLucPons/VanitySearch/blob/master/IntGroup.cpp/#L50-L54

that function belongs to the class IntGroup:

https://github.com/JeanLucPons/VanitySearch/blob/master/IntGroup.h#L24

This trick works only if you know many 'x1' and 'x2' before starting the computation.

ecdsa123 (OP)

Full Member

Offline

Activity: 211
Merit: 105

Dr WHO on disney+

Re: secp256k1 library in pure assembly

December 24, 2022, 09:59:45 AM

#34

intresting things:

I'm plying in pure asm:

during multiply x by x , but combination of size of registers I got:

x is our X of Generator point.

Code:

SECTION .text
	GLOBAL _start
	
_start:
	
	call def_Point_Addition 
	
    ; Terminate program
	mov rax,1            ; 'exit' system call
	mov rbx,0            ; exit with error code 0
	int 80h
	

def_Point_Addition:           ;Point Addition for P1= P2   (tangent with slope) 
	mov512 num_1_512,x_256
	mov512 num_2_512,x_256
	mov rdi,num_1_512
	mov rsi,num_2_512
	call mul_u256s
	 
	call print_u512     ; print n1 after addition
	call print_nl
	ret

I get output:

buntu@ubuntu2004:/mnt/hgfs/plik/asm$ nasm -f elf64 proba.asm
ubuntu@ubuntu2004:/mnt/hgfs/plik/asm$ ld -s -o proba proba.o
ubuntu@ubuntu2004:/mnt/hgfs/plik/asm$ time ./proba
483ada7726a3c4655da4fbfc0e1108a8fd17b448a68554199c47d08ffb10d4b85dbbf79179d08fe 08f9a320029fccfe5423baf1bc4ca9debda56d7f7609edd60

real   0m0.004s
user   0m0.001s
sys   0m0.001s
ubuntu@ubuntu2004:/mnt/hgfs/plik/asm$

the first of 64 bytes are y of x!

I do not understand:)

Donate: bc1q0sezldfgm7rf2r78p5scasrrcfkpzxnrfcvdc6

Subscribe : http://www.youtube.com/@Ecdsa_Solutions

arulbero

Legendary

Offline

Activity: 1915
Merit: 2074

Re: secp256k1 library in pure assembly

December 24, 2022, 11:23:28 AM

#35

Quote from: ecdsa123 on December 22, 2022, 05:39:17 PM

Quote from: albert0bsd on December 22, 2022, 02:30:38 AM

Quote from: ecdsa123 on December 21, 2022, 07:42:25 PM

I have done implement parts of secp256k1 in pure asm

I would like to inform that performance is fucking fast (for me):

100 000 000 performs modulo n on secp256k1 vals:

ubuntu@ubuntu2004:~/Downloads/ma$ nasm -f elf64 main.asm
ubuntu@ubuntu2004:~/Downloads/ma$ ld -s -o main main.o
ubuntu@ubuntu2004:~/Downloads/ma$ time ./main

real   0m4.856s
user   0m4.850s
sys   0m0.005s
ubuntu@ubuntu2004:~/Downloads/ma$

Can you explain what kind of numbers are those 100 Million inputs. What is your hardware and against what other implementation are you comparing your results?

so: test was performed on:
Intel Core i5-1240P - without multithreading - on just one core.
Power set up on balance (not maximum performance)

test has been done on:

xor rcx,rcx
_loop:
x*x mod n
so first multiply x by x ( x is value Point of generator Secp256k1 ( i mean the x value of G(x,y)) then mod n
so 256 bit *256 bit value then mod n

inc rcx
cmp rcx,100 000 000
jne _loop

I tried a similar test with my mul function:

Code:

int main(){

 uint64_t  x[4] =  {0x59f2815b16f81798, 0x029bfcdb2dce28d9, 0x55a06295ce870b07, 0x79be667ef9dcbbac};
 uint64_t  a[4] =  {0x59f2815b16f81798, 0x029bfcdb2dce28d9, 0x55a06295ce870b07, 0x79be667ef9dcbbac};


 uint64_t* ptrx = &x[0];
 uint64_t* ptra = &a[0];

 uint32_t i;

 for(i=0; i < 100000000; i++){
     mul(ptrx,ptra,ptrx);  //a*x -> x
 }
 

}

12th Gen Intel(R) Core(TM) i7-12700H - without multithreading - on just one core.
Power set up on balance (not maximum performance)

$ gcc -O2 main.c

$ time ./a.out

real   0m0,886s
user   0m0,882s
sys   0m0,005s

ecdsa123 (OP)

Full Member

Offline

Activity: 211
Merit: 105

Dr WHO on disney+

Re: secp256k1 library in pure assembly

December 24, 2022, 12:10:55 PM

#36

could you add mod n after mult?

i'm intresting to see the benchmark.
in your example we have only multiply without mod n

Donate: bc1q0sezldfgm7rf2r78p5scasrrcfkpzxnrfcvdc6

Subscribe : http://www.youtube.com/@Ecdsa_Solutions

arulbero

Legendary

Offline

Activity: 1915
Merit: 2074

Re: secp256k1 library in pure assembly

December 24, 2022, 12:34:40 PM
Last edit: December 24, 2022, 01:29:53 PM by arulbero

Merited by vapourminer (1), n0nce (1), citb0in (1)

#37

Quote from: ecdsa123 on December 24, 2022, 12:10:55 PM

could you add mod n after mult?

i'm intresting to see the benchmark.
in your example we have only multiply without mod n

My mul function has mod too.

Code:

// compute a * b = (low. high)
#define MultiplyWordsLoHi(low, high, a, b) asm  ( "mulx  %2, %0, %1;" : "=r"(low), "=r"(high) :  "gr" (a), "d" (b) : "cc");

#define AccAdd4WordsBy4_wc(a0, a1, a2, a3, b0, b1, b2)   asm  ("addq %4, %0; adcx %5, %1; adcx %6, %2; adcq $0, %3;" : "+r"(a0), "+r"(a1), "+r"(a2), "+r"(a3) : "r"(b0), "r"(b1), "r"(b2) : "cc");
// (a0, a1, a2, a3) = (a0, a1, a2, a3) + (b0, b1, b2, 0) without carry 

#define MulAcc(c, a0, a1, a, b) asm      ("mulx %3, %3, %4; addq %3, %1; adcq %4, %2; adcq $0, %0;" : "+r"(c), "+r"(a0), "+r"(a1), "=a"(a), "=d"(b) : "a"(a), "d"(b) : "cc");

#define MulAcc_11(a0, a1, c0, a, b)      asm ("mulx %3, %0, %1; addq %2, %0; adcq $0, %1;" : "+r"(a0), "+r"(a1): "r"(c0), "r"(a), "d"(b) : "cc");



// compute u*v mod p
inline void
mul(uint64_t *r, uint64_t *u, uint64_t *v) {
  
  uint64_t u0 = u[0];
  uint64_t u1 = u[1];
  uint64_t u2 = u[2];
  uint64_t u3 = u[3];


  uint64_t v0 = v[0];
  uint64_t v1 = v[1];
  uint64_t v2 = v[2];
  uint64_t v3 = v[3];

  uint64_t r0, r1, r2, r3, r4, r5, r6, r7;
  uint64_t z1, z2, z3, z4, z5, z6, z7, z8, z44, z66;

  z2 = z3 = z4 = z5 = z6 = z7 = z8 = r1 = r2 = r3 = r4 = r5 = r6 = r7 = 0;


  MultiplyWordsLoHi(r0, z1, u0, v0) //x1 --> r0 ok
  MultiplyWordsLoHi(z2, z3, u1, v0)
  MultiplyWordsLoHi(z4, z5, u2, v0)
  MultiplyWordsLoHi(z6, z7, u3, v0)
  AccAdd4WordsBy4_wc(z2, z4, z6, z7, z1, z3, z5)


  MulAcc_11(r1, z1, z2, u0, v1) //x1 --> r1 ok
  MultiplyWordsLoHi(z2, z3, u1, v1)
  MultiplyWordsLoHi(z44, z5, u2, v1)
  MultiplyWordsLoHi(z66, z8, u3, v1)
  AccAdd4WordsBy4_wc(z1, z3, z5, z8, z4, z6, z7)
  AccAdd4WordsBy4_wc(z2, z44, z66, z8, z1, z3, z5)

  
  MulAcc_11(r2, z1, z2, u0, v2) //x1 --> r2 ok
  MultiplyWordsLoHi(z2, z3, u1, v2)
  MultiplyWordsLoHi(z4, z5, u2, v2)
  MultiplyWordsLoHi(z6, z7, u3, v2)
  AccAdd4WordsBy4_wc(z1, z3, z5, z7, z44, z66, z8)
  AccAdd4WordsBy4_wc(z2, z4, z6, z7, z1, z3, z5)

  MulAcc_11(r3, z1, z2, u0, v3) //x1 --> r3 ok
  MultiplyWordsLoHi(r4, z3, u1, v3)
  MultiplyWordsLoHi(r5, z5, u2, v3)
  MultiplyWordsLoHi(r6, r7, u3, v3)
  AccAdd4WordsBy4_wc(z1, z3, z5, r7, z4, z6, z7)
  AccAdd4WordsBy4_wc(r4, r5, r6, r7, z1, z3, z5) //r4, r5, r6, r7 ok
  

  //Reduction, mod 2^256 - 0x1000003d1
  
  uint64_t p = 0x1000003d1;
  MulAcc_11(z1, z2, r0, r4, p) 
  MultiplyWordsLoHi(z3, z4, r5, p)
  MultiplyWordsLoHi(z5, z6, r6, p)
  MultiplyWordsLoHi(z7, z8, r7, p)
  

  //MulAcc_11(z1, z2, r0, r4, p) 
  AccAdd4WordsBy4_wc(z2, z4, z6, z8, r1, r2, r3)

  
  uint64_t c = 0;
  AccAdd4WordsBy4_wc(z3, z5, z7, z8, z2, z4, z6)
  MulAcc(c, z1, z3, p, z8)
  

  r[0] = z1;
  r[1] = z3;
 
  
  if(c == 1){

  asm (
     "addq $1, %0; adcq $0, %1; \n"
       : "=r" (z5), "=r" (z7) 
       : : "cc");

  }
  
  r[2] = z5;
  r[3] = z7;
   
}

EDIT:

With 1 000 000 000 iterations:

mul without mod:

real   0m5,771s
user   0m5,770s
sys   0m0,000s

mul with mod:

real   0m8,438s
user   0m8,434s
sys   0m0,004s

Pieter Wuille

Legendary

Offline

Activity: 1072
Merit: 1174

Re: secp256k1 library in pure assembly

December 26, 2022, 02:52:18 PM

Merited by vapourminer (1)

#38

Quote from: NotATether on December 02, 2022, 05:42:43 PM

Quote from: AlexanderCurl on December 02, 2022, 05:39:30 PM

But since I use secp256k1 curve only for testing and research I do no care much for any of possible vulnerabilities and attacks.

The safest (not necessary the fastest) secp256k1 is the one used in Bitcoin Core. But I don't use it because I keep getting wrong answers when I do arithmetic. Maybe the privkey bytes are not being filled correctly or something.

Perhaps you should open an issue or discussion topic (https://github.com/bitcoin-core/secp256k1/issues or https://github.com/bitcoin-core/secp256k1/discussions/categories/q-a) on the library, because that's definitely not supposed to happen.

I do Bitcoin stuff.

NotATether

Legendary

Offline

Activity: 1596
Merit: 6728

bitcoincleanup.com / bitmixlist.org

Re: secp256k1 library in pure assembly

December 26, 2022, 04:56:50 PM

#39

Quote from: Pieter Wuille on December 26, 2022, 02:52:18 PM

Quote from: NotATether on December 02, 2022, 05:42:43 PM

Quote from: AlexanderCurl on December 02, 2022, 05:39:30 PM

But since I use secp256k1 curve only for testing and research I do no care much for any of possible vulnerabilities and attacks.

The safest (not necessary the fastest) secp256k1 is the one used in Bitcoin Core. But I don't use it because I keep getting wrong answers when I do arithmetic. Maybe the privkey bytes are not being filled correctly or something.

Perhaps you should open an issue or discussion topic (https://github.com/bitcoin-core/secp256k1/issues or https://github.com/bitcoin-core/secp256k1/discussions/categories/q-a) on the library, because that's definitely not supposed to happen.

To update this thread with info I posted in the meantime, it appears to be that the multiplication has the secp256k1 characteristic modulus hardcoded into the algorithm, making it suitable only for public key multiplication.

I was thinking about making an adapted version of the multiplication algorithm that uses the curve order (subtracted from 2^256) in its place, so that private key multiplication is covered as well.

I haven't explored the 10x26 legs imply too deep, but I'm assuming it's a similar case.

Since you're here though, let me ask: Was the multiplication assembly (and possibly the C version of it in another file using int128) only intended for public keys?

.
.^{BLACKJACK^♠FUN.}

███▄██████
██████████████▀
████████████
█████████████████
████████████████▄▄
░█████████████▀░▀▀
██████████████████
░██████████████
█████████████████▄
░██████████████▀
████████████
███████████████░██
██████████

CRYPTO CASINO &
SPORTS BETTING

│

.
REWARD SYSTEM
.ELITE SEAT..

│

▄▄███████▄▄
▄███████████████▄
███████████████████
█████████████████████
███████████████████████
█████████████████████████
█████████████████████████
█████████████████████████
███████████████████████
█████████████████████
███████████████████
▀███████████████▀
███████████████████

.

Whitepaper
JACK

arulbero

Legendary

Offline

Activity: 1915
Merit: 2074

Re: secp256k1 library in pure assembly

December 26, 2022, 06:48:47 PM
Last edit: December 26, 2022, 07:45:43 PM by arulbero

Merited by vapourminer (1)

#40

Quote from: NotATether on December 26, 2022, 04:56:50 PM

it appears to be that the multiplication has the secp256k1 characteristic modulus hardcoded into the algorithm, making it suitable only for public key multiplication.

I was thinking about making an adapted version of the multiplication algorithm that uses the curve order (subtracted from 2^256) in its place, so that private key multiplication is covered as well.

I haven't explored the 10x26 legs imply too deep, but I'm assuming it's a similar case.

Since you're here though, let me ask: Was the multiplication assembly (and possibly the C version of it in another file using int128) only intended for public keys?

You have to distinguish between:

1) field operations (mod p, space of coordinates x and y):

https://github.com/bitcoin-core/secp256k1/tree/master/src/ all files with name : field*

and

2) scalar operations (mod n, space of private keys):

https://github.com/bitcoin-core/secp256k1/tree/master/src/ all files with name : scalar*

If you want to multiply 2 private keys (mod n):

you could use secp256k1_scalar_mul_512 (256bit * 256 bit -> 512 bit) (assembly multiplication)

https://github.com/bitcoin-core/secp256k1/blob/master/src/scalar_4x64_impl.h#L587

Code:

inline void secp256k1_scalar_mul_512(uint64_t* l, const uint64_t *a,  const uint64_t *b) {
    
    //uint64_t l[8];
    
    __asm__ __volatile__ (
    /* Preload */
    "movq 0(%%rdi), %%r15\n"
    "movq 8(%%rdi), %%rbx\n"
    "movq 16(%%rdi), %%rcx\n"
    "movq 0(%%rdx), %%r11\n"
    "movq 8(%%rdx), %%r12\n"
    "movq 16(%%rdx), %%r13\n"
    "movq 24(%%rdx), %%r14\n"
    /* (rax,rdx) = a0 * b0 */
    "movq %%r15, %%rax\n"
    "mulq %%r11\n"
    /* Extract l0 */
    "movq %%rax, 0(%%rsi)\n"
    /* (r8,r9,r10) = (rdx) */
    "movq %%rdx, %%r8\n"
    "xorq %%r9, %%r9\n"
    "xorq %%r10, %%r10\n"
    /* (r8,r9,r10) += a0 * b1 */
    "movq %%r15, %%rax\n"
    "mulq %%r12\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    /* (r8,r9,r10) += a1 * b0 */
    "movq %%rbx, %%rax\n"
    "mulq %%r11\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    /* Extract l1 */
    "movq %%r8, 8(%%rsi)\n"
    "xorq %%r8, %%r8\n"
    /* (r9,r10,r8) += a0 * b2 */
    "movq %%r15, %%rax\n"
    "mulq %%r13\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* (r9,r10,r8) += a1 * b1 */
    "movq %%rbx, %%rax\n"
    "mulq %%r12\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* (r9,r10,r8) += a2 * b0 */
    "movq %%rcx, %%rax\n"
    "mulq %%r11\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* Extract l2 */
    "movq %%r9, 16(%%rsi)\n"
    "xorq %%r9, %%r9\n"
    /* (r10,r8,r9) += a0 * b3 */
    "movq %%r15, %%rax\n"
    "mulq %%r14\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    /* Preload a3 */
    "movq 24(%%rdi), %%r15\n"
    /* (r10,r8,r9) += a1 * b2 */
    "movq %%rbx, %%rax\n"
    "mulq %%r13\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    /* (r10,r8,r9) += a2 * b1 */
    "movq %%rcx, %%rax\n"
    "mulq %%r12\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    /* (r10,r8,r9) += a3 * b0 */
    "movq %%r15, %%rax\n"
    "mulq %%r11\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    "adcq $0, %%r9\n"
    /* Extract l3 */
    "movq %%r10, 24(%%rsi)\n"
    "xorq %%r10, %%r10\n"
    /* (r8,r9,r10) += a1 * b3 */
    "movq %%rbx, %%rax\n"
    "mulq %%r14\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    /* (r8,r9,r10) += a2 * b2 */
    "movq %%rcx, %%rax\n"
    "mulq %%r13\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    /* (r8,r9,r10) += a3 * b1 */
    "movq %%r15, %%rax\n"
    "mulq %%r12\n"
    "addq %%rax, %%r8\n"
    "adcq %%rdx, %%r9\n"
    "adcq $0, %%r10\n"
    /* Extract l4 */
    "movq %%r8, 32(%%rsi)\n"
    "xorq %%r8, %%r8\n"
    /* (r9,r10,r8) += a2 * b3 */
    "movq %%rcx, %%rax\n"
    "mulq %%r14\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* (r9,r10,r8) += a3 * b2 */
    "movq %%r15, %%rax\n"
    "mulq %%r13\n"
    "addq %%rax, %%r9\n"
    "adcq %%rdx, %%r10\n"
    "adcq $0, %%r8\n"
    /* Extract l5 */
    "movq %%r9, 40(%%rsi)\n"
    /* (r10,r8) += a3 * b3 */
    "movq %%r15, %%rax\n"
    "mulq %%r14\n"
    "addq %%rax, %%r10\n"
    "adcq %%rdx, %%r8\n"
    /* Extract l6 */
    "movq %%r10, 48(%%rsi)\n"
    /* Extract l7 */
    "movq %%r8, 56(%%rsi)\n"
    : "+d"(b)
    : "S"(l), "D"(a)
    : "rax", "rbx", "rcx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "cc", "memory");
}

where for example

Code:

const uint64_t  a[4] =  {0x59f2815b16f81798, 0x029bfcdb2dce28d9, 0x55a06295ce870b07, 0x79be667ef9dcbbac};
const uint64_t  b[4] =  {0x9c47d08ffb10d4b8, 0xfd17b448a6855419, 0x5da4fbfc0e1108a8, 0x483ada7726a3c465};

uint64_t c[8] = {0};

secp256k1_scalar_mul_512(c, a, b);

printf("%016lx%016lx%016lx%016lx%016lx%016lx%016lx%016lx\n", c[7], c[6], c[5], c[4], c[3], c[2], c[1], c[0]);

Result: 
225989dbbc349b6f319ca3eed777a46f55b1dc22e97af11261167d213c1f060d29520a21508989b06ed1194129efb1517cee385a708abe44718bc509775ad540

and then apply the function secp256k1_scalar_reduce_512 (from 512 bit to 256 bit mod n)

https://github.com/bitcoin-core/secp256k1/blob/master/src/scalar_4x64_impl.h#L274

to get the result mod n

https://github.com/bitcoin-core/secp256k1/blob/master/src/scalar_4x64_impl.h#L761-L765

Code:

805714a252d0c0b58910907e85b5b801fff610a36bdf46847a4bf5d9ae2d10ed

I got the same results with my function, with libsecp256k1 functions and with python3 too:

Code:

>>> n=0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141
>>> a=0x79be667ef9dcbbac55a06295ce870b07029bfcdb2dce28d959f2815b16f81798
>>> b=0x483ada7726a3c4655da4fbfc0e1108a8fd17b448a68554199c47d08ffb10d4b8
>>> hex(a*b)
'0x225989dbbc349b6f319ca3eed777a46f55b1dc22e97af11261167d213c1f060d29520a21508989b06ed1194129efb1517cee385a708abe44718bc509775ad540'
>>> hex(a*b % n)
'0x805714a252d0c0b58910907e85b5b801fff610a36bdf46847a4bf5d9ae2d10ed'