XMieLgNRvyra45MMZg
Newbie
Offline
Activity: 14
Merit: 0
|
 |
Today at 07:08:05 PM Last edit: Today at 07:36:22 PM by XMieLgNRvyra45MMZg |
|
Sorry guys — this isn't for Kangaroo (Kangaroo needs a public key). This is for a totally different thing, but maybe it is useful for some of you: SECP256K1_GPU.h
/*
 * secp256k1 GPU implementation for a PRNG attack.
 * Optimized scalar multiplication for CUDA.
 * Field arithmetic uses fast reduction exploiting the special prime
 * p = 2^256 - 2^32 - 977 (not Montgomery representation).
 */
#ifndef SECP256K1_GPU_H #define SECP256K1_GPU_H
#include <cuda_runtime.h> #include <stdint.h>
// secp256k1 field prime: p = 2^256 - 2^32 - 977.
// Limbs are little-endian: index 0 is the least-significant 32-bit word
// (the carry chains in fe_add/fe_sub run from index 0 upward).
__constant__ uint32_t SECP256K1_P[8] = {
    0xFFFFFC2F, 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
};

// secp256k1 group order n (same little-endian limb layout).
// Not referenced by the code visible in this header.
__constant__ uint32_t SECP256K1_N[8] = {
    0xD0364141, 0xBFD25E8C, 0xAF48A03B, 0xBAAEDCE6,
    0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
};

// Generator point G, x coordinate (little-endian limbs).
__constant__ uint32_t SECP256K1_GX[8] = {
    0x16F81798, 0x59F2815B, 0x2DCE28D9, 0x029BFCDB,
    0xCE870B07, 0x55A06295, 0xF9DCBBAC, 0x79BE667E
};

// Generator point G, y coordinate.
__constant__ uint32_t SECP256K1_GY[8] = {
    0xFB10D4B8, 0x9C47D08F, 0xA6855419, 0xFD17B448,
    0x0E1108A8, 0x5DA4FBFC, 0x26A3C465, 0x483ADA77
};

// Field element: 256-bit value as eight 32-bit little-endian limbs.
typedef struct { uint32_t d[8]; } fe_t;

// Point in Jacobian coordinates; affine (x, y) = (X/Z^2, Y/Z^3),
// with Z == 0 marking the point at infinity (see jpoint_is_infinity).
typedef struct { fe_t x; fe_t y; fe_t z; } jpoint_t;

// Point in affine coordinates.
typedef struct { fe_t x; fe_t y; } apoint_t;
// ============================================================================ // 256-bit Modular Arithmetic (mod p) // ============================================================================
// Set r to the additive identity (all eight limbs cleared).
__device__ __forceinline__ void fe_zero(fe_t *r) {
    #pragma unroll
    for (int i = 0; i < 8; i++) r->d[i] = 0u;
}

// Set r to the multiplicative identity 1.
__device__ __forceinline__ void fe_one(fe_t *r) {
    r->d[0] = 1;
    #pragma unroll
    for (int i = 1; i < 8; i++) r->d[i] = 0u;
}

// Limb-wise copy: r <- a.
__device__ __forceinline__ void fe_copy(fe_t *r, const fe_t *a) {
    #pragma unroll
    for (int i = 0; i < 8; i++) r->d[i] = a->d[i];
}

// Returns 1 iff every limb of a is zero (branch-free OR-accumulate).
__device__ __forceinline__ int fe_is_zero(const fe_t *a) {
    uint32_t acc = 0;
    #pragma unroll
    for (int i = 0; i < 8; i++) acc |= a->d[i];
    return acc == 0;
}

// Lexicographic compare from the most-significant limb down:
// returns -1 if a < b, 0 if equal, 1 if a > b.
__device__ __forceinline__ int fe_cmp(const fe_t *a, const fe_t *b) {
    for (int i = 7; i >= 0; i--) {
        if (a->d[i] != b->d[i]) return (a->d[i] < b->d[i]) ? -1 : 1;
    }
    return 0;
}
// r = (a + b) mod p.  Assumes a, b < p, so a + b < 2p and at most one
// conditional subtraction of p is needed.
//
// Fix vs. original: the old "needs reduction" test only compared limbs 7
// and 0 against p.  It subtracted p from some sums that were still < p
// (wrapping below zero -> garbage) and missed some sums >= p whose middle
// limbs differed (leaving a non-canonical result).  We now do a full
// 256-bit comparison.
__device__ void fe_add(fe_t *r, const fe_t *a, const fe_t *b) {
    uint64_t c = 0;
    #pragma unroll
    for (int i = 0; i < 8; i++) {
        c += (uint64_t)a->d[i] + b->d[i];
        r->d[i] = (uint32_t)c;
        c >>= 32;
    }
    // Reduce if the sum overflowed 2^256 or if r >= p (full compare,
    // most-significant limb first).
    int ge = (c != 0);
    if (!ge) {
        ge = 1;                       // r == p also requires reduction
        for (int i = 7; i >= 0; i--) {
            if (r->d[i] > SECP256K1_P[i]) { ge = 1; break; }
            if (r->d[i] < SECP256K1_P[i]) { ge = 0; break; }
        }
    }
    if (ge) {
        uint64_t borrow = 0;
        #pragma unroll
        for (int i = 0; i < 8; i++) {
            uint64_t t = (uint64_t)r->d[i] - SECP256K1_P[i] - borrow;
            r->d[i] = (uint32_t)t;
            borrow = (t >> 32) & 1;   // 1 iff the limb subtraction wrapped
        }
    }
}
// r = (a - b) mod p.  Computed as a plain 256-bit subtraction; if it
// borrows (a < b), p is added back to wrap the result into the field.
__device__ void fe_sub(fe_t *r, const fe_t *a, const fe_t *b) {
    uint64_t borrow = 0;
    #pragma unroll
    for (int i = 0; i < 8; i++) {
        uint64_t t = (uint64_t)a->d[i] - b->d[i] - borrow;
        r->d[i] = (uint32_t)t;
        borrow = (t >> 32) & 1;       // wrapped => borrow out of this limb
    }
    if (borrow) {
        uint64_t carry = 0;
        #pragma unroll
        for (int i = 0; i < 8; i++) {
            carry += (uint64_t)r->d[i] + SECP256K1_P[i];
            r->d[i] = (uint32_t)carry;
            carry >>= 32;
        }
    }
}
// r = a * b (mod p), using the secp256k1 special-form reduction.
// p = 2^256 - 2^32 - 977, so 2^256 ≡ 2^32 + 977 (mod p).  Writing
// a*b = H*2^256 + L (H, L each 256 bits):  r ≡ L + H*977 + H*2^32 (mod p).
//
// Fix vs. original: the reduction loop added `product[i + 8 + 1] << 32`
// at limb i, which credits the high half with weight 1 (it lands high[i+1]
// at limb i+1) instead of weight 2^32 (high[i-1] at limb i) -- every
// product was reduced incorrectly.  The 2^32 contribution is now folded
// into the carry after limb i is stored, which also keeps every partial
// sum far below 2^64.
__device__ void fe_mul(fe_t *r, const fe_t *a, const fe_t *b) {
    // Schoolbook 256x256 -> 512-bit product; limbs kept 32 bits wide.
    uint64_t product[16] = {0};
    #pragma unroll
    for (int i = 0; i < 8; i++) {
        uint64_t carry = 0;
        #pragma unroll
        for (int j = 0; j < 8; j++) {
            uint64_t t = product[i + j] + (uint64_t)a->d[i] * b->d[j] + carry;
            product[i + j] = t & 0xFFFFFFFF;
            carry = t >> 32;
        }
        product[i + 8] += carry;
    }
    // First folding pass: r = L + H*977 + H*2^32.
    uint64_t c = 0;
    #pragma unroll
    for (int i = 0; i < 8; i++) {
        c += product[i];
        c += product[i + 8] * 977ULL;
        r->d[i] = (uint32_t)c;
        c >>= 32;
        c += product[i + 8];   // H*2^32: high limb i belongs at limb i+1
    }
    // c now holds the overflow beyond 2^256 (c < 2^33, includes high[7]'s
    // 2^32 term).  Fold it back with the same identity: c*2^256 ≡ c*(2^32+977).
    while (c) {
        uint64_t k = c;
        uint64_t d = (uint64_t)r->d[0] + k * 977ULL;   // k*977 < 2^43: safe
        r->d[0] = (uint32_t)d; d >>= 32;
        d += (uint64_t)r->d[1] + (k & 0xFFFFFFFF);     // k*2^32, low part
        r->d[1] = (uint32_t)d; d >>= 32;
        d += (uint64_t)r->d[2] + (k >> 32);            // k*2^32, high part
        r->d[2] = (uint32_t)d; d >>= 32;
        for (int i = 3; i < 8 && d; i++) {
            d += r->d[i];
            r->d[i] = (uint32_t)d;
            d >>= 32;
        }
        c = d;
    }
    // r < 2^256 < 2p, so a single conditional subtraction finishes.
    fe_t p_val;
    #pragma unroll
    for (int i = 0; i < 8; i++) p_val.d[i] = SECP256K1_P[i];
    if (fe_cmp(r, &p_val) >= 0) {
        fe_sub(r, r, &p_val);
    }
}
// r = a^2 (mod p).  Delegates to fe_mul; a dedicated squaring that skips
// the symmetric cross terms would be a further optimization.
__device__ void fe_sqr(fe_t *r, const fe_t *a) {
    fe_mul(r, a, a);
}
// r = a^(-1) mod p via Fermat's little theorem: r = a^(p-2),
// p-2 = 2^256 - 2^32 - 979 (binary: 223 ones, 0, 22 ones, 0000101101).
// Standard secp256k1 addition chain; xN = a^(2^N - 1).
//
// Fix vs. original: the final window was "sqr x6, mul a^3, sqr x2, mul a",
// producing exponent tail ...00001101 instead of the required
// ...0000101101 (low bits of 0xFFFFFC2D) -- so every inverse, and hence
// every Jacobian->affine conversion, was wrong.  Correct tail:
// 5 squarings * a, 3 squarings * a^3, 2 squarings * a.
__device__ void fe_inv(fe_t *r, const fe_t *a) {
    fe_t x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t;
    fe_sqr(&x2, a);  fe_mul(&x2, &x2, a);   // x2 = a^3  (2 one-bits)
    fe_sqr(&x3, &x2); fe_mul(&x3, &x3, a);  // x3 = a^7  (3 one-bits)
    fe_copy(&x6, &x3);
    for (int i = 0; i < 3; i++) fe_sqr(&x6, &x6);
    fe_mul(&x6, &x6, &x3);                  // 6 one-bits
    fe_copy(&x9, &x6);
    for (int i = 0; i < 3; i++) fe_sqr(&x9, &x9);
    fe_mul(&x9, &x9, &x3);                  // 9
    fe_copy(&x11, &x9);
    for (int i = 0; i < 2; i++) fe_sqr(&x11, &x11);
    fe_mul(&x11, &x11, &x2);                // 11
    fe_copy(&x22, &x11);
    for (int i = 0; i < 11; i++) fe_sqr(&x22, &x22);
    fe_mul(&x22, &x22, &x11);               // 22
    fe_copy(&x44, &x22);
    for (int i = 0; i < 22; i++) fe_sqr(&x44, &x44);
    fe_mul(&x44, &x44, &x22);               // 44
    fe_copy(&x88, &x44);
    for (int i = 0; i < 44; i++) fe_sqr(&x88, &x88);
    fe_mul(&x88, &x88, &x44);               // 88
    fe_copy(&x176, &x88);
    for (int i = 0; i < 88; i++) fe_sqr(&x176, &x176);
    fe_mul(&x176, &x176, &x88);             // 176
    fe_copy(&x220, &x176);
    for (int i = 0; i < 44; i++) fe_sqr(&x220, &x220);
    fe_mul(&x220, &x220, &x44);             // 220
    fe_copy(&x223, &x220);
    for (int i = 0; i < 3; i++) fe_sqr(&x223, &x223);
    fe_mul(&x223, &x223, &x3);              // 223 leading one-bits
    // Tail of the exponent: "0" + 22 ones + "0000101101".
    fe_copy(&t, &x223);
    for (int i = 0; i < 23; i++) fe_sqr(&t, &t);
    fe_mul(&t, &t, &x22);
    for (int i = 0; i < 5; i++) fe_sqr(&t, &t);
    fe_mul(&t, &t, a);
    for (int i = 0; i < 3; i++) fe_sqr(&t, &t);
    fe_mul(&t, &t, &x2);
    fe_sqr(&t, &t);
    fe_sqr(&t, &t);
    fe_mul(r, &t, a);
}
// ============================================================================ // Jacobian Point Operations // ============================================================================
// Mark p as the point at infinity.  Convention in this header: Z == 0
// (X and Y are cleared too, though only Z is ever tested).
__device__ void jpoint_set_infinity(jpoint_t *p) {
    fe_zero(&p->x);
    fe_zero(&p->y);
    fe_zero(&p->z);
}

// A Jacobian point is the identity iff its Z coordinate is zero.
__device__ int jpoint_is_infinity(const jpoint_t *p) {
    return fe_is_zero(&p->z);
}

// Lift an affine point (x, y) to Jacobian form (x, y, 1).
__device__ void jpoint_from_affine(jpoint_t *r, const apoint_t *a) {
    fe_copy(&r->x, &a->x);
    fe_copy(&r->y, &a->y);
    fe_one(&r->z);
}
// Project (X, Y, Z) back to affine: x = X/Z^2, y = Y/Z^3, at the cost of
// one field inversion.  The identity maps to (0, 0), which is not a curve
// point and serves as a sentinel for callers.
__device__ void jpoint_to_affine(apoint_t *r, const jpoint_t *p) {
    if (jpoint_is_infinity(p)) {
        fe_zero(&r->x);
        fe_zero(&r->y);
        return;
    }
    fe_t zi, zi2, zi3;
    fe_inv(&zi, &p->z);          // 1/Z
    fe_sqr(&zi2, &zi);           // 1/Z^2
    fe_mul(&zi3, &zi2, &zi);     // 1/Z^3
    fe_mul(&r->x, &p->x, &zi2);
    fe_mul(&r->y, &p->y, &zi3);
}
// Point doubling in Jacobian coordinates: r = 2 * p.
// Standard a = 0 doubling: S = 4*X*Y^2, M = 3*X^2,
// X3 = M^2 - 2S, Y3 = M*(S - X3) - 8*Y^4, Z3 = 2*Y*Z.
// Results are staged in temporaries and copied into r at the end, so r may
// alias p (scalar_mult_base calls jpoint_double(&R, &R)).
__device__ void jpoint_double(jpoint_t *r, const jpoint_t *p) {
    // 2*O = O; Y == 0 means a vertical tangent, which also yields infinity.
    if (jpoint_is_infinity(p) || fe_is_zero(&p->y)) {
        jpoint_set_infinity(r);
        return;
    }
    fe_t s, m, x3, y3, z3, t1, t2;
    // S = 4 * X * Y^2
    fe_sqr(&t1, &p->y);      // t1 = Y^2
    fe_mul(&s, &p->x, &t1);  // s = X * Y^2
    fe_add(&s, &s, &s);      // s = 2 * X * Y^2
    fe_add(&s, &s, &s);      // s = 4 * X * Y^2
    // M = 3 * X^2 (curve coefficient a = 0 for secp256k1)
    fe_sqr(&m, &p->x);       // m = X^2
    fe_add(&t2, &m, &m);     // t2 = 2 * X^2
    fe_add(&m, &t2, &m);     // m = 3 * X^2
    // X3 = M^2 - 2*S
    fe_sqr(&x3, &m);         // x3 = M^2
    fe_sub(&x3, &x3, &s);    // x3 = M^2 - S
    fe_sub(&x3, &x3, &s);    // x3 = M^2 - 2*S
    // Y3 = M * (S - X3) - 8 * Y^4
    fe_sub(&t2, &s, &x3);    // t2 = S - X3
    fe_mul(&y3, &m, &t2);    // y3 = M * (S - X3)
    fe_sqr(&t2, &t1);        // t2 = Y^4
    fe_add(&t2, &t2, &t2);   // t2 = 2 * Y^4
    fe_add(&t2, &t2, &t2);   // t2 = 4 * Y^4
    fe_add(&t2, &t2, &t2);   // t2 = 8 * Y^4
    fe_sub(&y3, &y3, &t2);
    // Z3 = 2 * Y * Z
    fe_mul(&z3, &p->y, &p->z);
    fe_add(&z3, &z3, &z3);
    fe_copy(&r->x, &x3);
    fe_copy(&r->y, &y3);
    fe_copy(&r->z, &z3);
}
// Mixed point addition: r = p (Jacobian) + q (affine, implicit Z2 = 1).
// Handles p at infinity, p == q (falls back to doubling) and p == -q
// (returns infinity).
//
// Fix vs. original: the old code wrote r->x and r->y in place BEFORE the
// 2*Y1*J term had read p->y.  scalar_mult_base calls this with r == p, so
// the aliased call read a clobbered Y1 and produced wrong sums.  All
// outputs are now staged in x3/y3/z3 and copied at the end, making the
// function fully alias-safe.
__device__ void jpoint_add_mixed(jpoint_t *r, const jpoint_t *p, const apoint_t *q) {
    if (jpoint_is_infinity(p)) {
        jpoint_from_affine(r, q);   // O + q = q
        return;
    }
    fe_t z2, u2, s2, h, hh, i, j, rr, v, t1, x3, y3, z3;
    fe_sqr(&z2, &p->z);             // Z1^2
    fe_mul(&u2, &q->x, &z2);        // U2 = X2 * Z1^2
    fe_mul(&s2, &z2, &p->z);        // Z1^3
    fe_mul(&s2, &s2, &q->y);        // S2 = Y2 * Z1^3
    fe_sub(&h, &u2, &p->x);         // H = U2 - X1
    // H == 0: equal X coordinates -> same point or additive inverses.
    if (fe_is_zero(&h)) {
        fe_sub(&t1, &s2, &p->y);
        if (fe_is_zero(&t1)) {
            jpoint_double(r, p);    // same point: double instead
            return;
        }
        jpoint_set_infinity(r);     // p + (-p) = O
        return;
    }
    fe_sqr(&hh, &h);                // HH = H^2
    fe_add(&i, &hh, &hh);
    fe_add(&i, &i, &i);             // I = 4 * HH
    fe_mul(&j, &h, &i);             // J = H * I
    fe_sub(&rr, &s2, &p->y);
    fe_add(&rr, &rr, &rr);          // rr = 2 * (S2 - Y1)
    fe_mul(&v, &p->x, &i);          // V = X1 * I
    // X3 = rr^2 - J - 2*V
    fe_sqr(&x3, &rr);
    fe_sub(&x3, &x3, &j);
    fe_sub(&x3, &x3, &v);
    fe_sub(&x3, &x3, &v);
    // Y3 = rr * (V - X3) - 2 * Y1 * J
    fe_sub(&t1, &v, &x3);
    fe_mul(&y3, &rr, &t1);
    fe_mul(&t1, &p->y, &j);         // p->y still intact: nothing written yet
    fe_add(&t1, &t1, &t1);
    fe_sub(&y3, &y3, &t1);
    // Z3 = 2 * Z1 * H
    fe_mul(&z3, &p->z, &h);
    fe_add(&z3, &z3, &z3);
    fe_copy(&r->x, &x3);
    fe_copy(&r->y, &y3);
    fe_copy(&r->z, &z3);
}
// ============================================================================ // Scalar Multiplication // ============================================================================
// Load the affine generator point G out of constant memory.
__device__ void get_generator(apoint_t *g) {
    #pragma unroll
    for (int k = 0; k < 8; k++) {
        g->x.d[k] = SECP256K1_GX[k];
        g->y.d[k] = SECP256K1_GY[k];
    }
}
// r = k * G for a 32-byte big-endian scalar k.
// Plain MSB-first double-and-add over all 256 bits; a windowed method with
// a precomputed table would be faster.
__device__ void scalar_mult_base(apoint_t *r, const uint8_t *k) {
    apoint_t G;
    jpoint_t acc;
    get_generator(&G);
    jpoint_set_infinity(&acc);
    for (int bit = 255; bit >= 0; bit--) {
        jpoint_double(&acc, &acc);
        // Bit `bit` of the scalar lives in byte 31 - bit/8 (big-endian input).
        uint8_t byte = k[31 - (bit >> 3)];
        if ((byte >> (bit & 7)) & 1) {
            jpoint_add_mixed(&acc, &acc, &G);
        }
    }
    jpoint_to_affine(r, &acc);
}
// ============================================================================ // Public Key Computation and Comparison // ============================================================================
// Derive the public key for priv_key and compare its X coordinate with
// target_x, word by word.
// NOTE(review): target_x is interpreted as a LITTLE-endian byte array
// (byte 0 = least-significant byte of limb 0).  Serialized secp256k1
// coordinates are conventionally big-endian -- confirm the caller supplies
// this layout.
__device__ bool compute_pubkey_and_check(const uint8_t *priv_key, const uint8_t *target_x) {
    apoint_t pub;
    scalar_mult_base(&pub, priv_key);
    for (int i = 0; i < 8; i++) {
        const uint8_t *w = target_x + i * 4;
        uint32_t expect = (uint32_t)w[0]
                        | ((uint32_t)w[1] << 8)
                        | ((uint32_t)w[2] << 16)
                        | ((uint32_t)w[3] << 24);
        if (pub.x.d[i] != expect) return false;
    }
    return true;
}
#endif // SECP256K1_GPU_H
SECP256K1_FAST.CUH

// secp256k1_fast.cuh - Highly optimized GPU secp256k1 implementation
// Features:
// - 8x32-bit limb field arithmetic
// - Jacobian coordinates (no inversions during computation)
// - Precomputed table for generator G (16 points, 4-bit windows)
// - Windowed scalar multiplication
// - Fast reduction using the secp256k1 special form: p = 2^256 - 2^32 - 977
#pragma once

#include <stdio.h>          // fprintf in secp256k1_init_tables
#include <stdint.h>
#include <cuda_runtime.h>
// ============================================================================ // FIELD ELEMENT (256-bit mod p) // p = 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F // ============================================================================
// Field element: 256-bit value as eight 32-bit limbs.
typedef struct {
    uint32_t d[8]; // Little-endian: d[0] is LSW
} fe_t;

// Curve order n (for reference; not used by the code in this header)
// n = 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141

// ============================================================================
// JACOBIAN POINT (X, Y, Z) where affine (x,y) = (X/Z^2, Y/Z^3)
// Z == 0 encodes the point at infinity (see jac_is_inf).
// ============================================================================

typedef struct {
    fe_t x, y, z;
} jac_point_t;
// ============================================================================ // CONSTANTS - Generator point and precomputed table // ============================================================================
// Generator G (affine), x coordinate, little-endian limbs.
__device__ __constant__ uint32_t G_X[8] = {
    0x16F81798, 0x59F2815B, 0x2DCE28D9, 0x029BFCDB,
    0xCE870B07, 0x55A06295, 0xF9DCBBAC, 0x79BE667E
};

// Generator G (affine), y coordinate.
__device__ __constant__ uint32_t G_Y[8] = {
    0xFB10D4B8, 0x9C47D08F, 0xA6855419, 0xFD17B448,
    0x0E1108A8, 0x5DA4FBFC, 0x26A3C465, 0x483ADA77
};

// Field prime p = 2^256 - 2^32 - 977, little-endian limbs.
__device__ __constant__ uint32_t P[8] = {
    0xFFFFFC2F, 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
};
// 2*p, as nine 32-bit little-endian limbs (for reductions of values < 2p).
// Fix vs. original: limb 0 read 0xFFFF85E -- seven hex digits, a dropped
// 'F'.  2 * 0xFFFFFC2F = 0x1FFFFF85E, so the low limb must be 0xFFFFF85E
// (the carry 1 propagates into limb 1, giving 0xFFFFFFFD, and into the
// top limb 0x01).
__device__ __constant__ uint32_t P2[9] = {
    0xFFFFF85E, 0xFFFFFFFD, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x01
};
// Precomputed window table for the generator: G_TABLE[i] = (i+1)*G for
// i = 0..15, stored as affine coordinates.  Filled on the device by
// init_g_table_kernel (launched from secp256k1_init_tables);
// scalar_mult_g reads it and must not run before that initialization.
__device__ fe_t G_TABLE_X[16];
__device__ fe_t G_TABLE_Y[16];
// ============================================================================ // FIELD ARITHMETIC // ============================================================================
// r = 0.
__device__ __forceinline__ void fe_zero(fe_t *r) {
    #pragma unroll
    for (int i = 0; i < 8; i++) r->d[i] = 0u;
}

// r <- a, limb by limb.
__device__ __forceinline__ void fe_copy(fe_t *r, const fe_t *a) {
    #pragma unroll
    for (int i = 0; i < 8; i++) r->d[i] = a->d[i];
}

// Load a 32-byte big-endian buffer into little-endian limbs:
// limb i comes from bytes [4*(7-i) .. 4*(7-i)+3].
__device__ __forceinline__ void fe_set_be(fe_t *r, const uint8_t *be32) {
    #pragma unroll
    for (int i = 0; i < 8; i++) {
        const uint8_t *src = be32 + (7 - i) * 4;
        r->d[i] = ((uint32_t)src[0] << 24) | ((uint32_t)src[1] << 16) |
                  ((uint32_t)src[2] << 8) | (uint32_t)src[3];
    }
}

// Store limbs back out as a 32-byte big-endian buffer (inverse of fe_set_be).
__device__ __forceinline__ void fe_get_be(uint8_t *be32, const fe_t *a) {
    #pragma unroll
    for (int i = 0; i < 8; i++) {
        uint8_t *dst = be32 + (7 - i) * 4;
        uint32_t w = a->d[i];
        dst[0] = (uint8_t)(w >> 24);
        dst[1] = (uint8_t)(w >> 16);
        dst[2] = (uint8_t)(w >> 8);
        dst[3] = (uint8_t)w;
    }
}

// Compare from the most-significant limb down:
// returns 1 if a > b, -1 if a < b, 0 if equal.
__device__ __forceinline__ int fe_cmp(const fe_t *a, const fe_t *b) {
    #pragma unroll
    for (int i = 7; i >= 0; i--) {
        if (a->d[i] != b->d[i]) return (a->d[i] > b->d[i]) ? 1 : -1;
    }
    return 0;
}

// Returns 1 iff a == 0 (OR-accumulate over all limbs).
__device__ __forceinline__ int fe_is_zero(const fe_t *a) {
    uint32_t any = 0;
    #pragma unroll
    for (int i = 0; i < 8; i++) any |= a->d[i];
    return any == 0;
}
// r = a + b without modular reduction; returns the carry out of bit 255.
__device__ __forceinline__ uint32_t fe_add_raw(fe_t *r, const fe_t *a, const fe_t *b) {
    uint64_t acc = 0;
    #pragma unroll
    for (int i = 0; i < 8; i++) {
        acc += (uint64_t)a->d[i];
        acc += b->d[i];
        r->d[i] = (uint32_t)acc;
        acc >>= 32;
    }
    return (uint32_t)acc;
}

// r = a - b without reduction; returns 1 on borrow (a < b), else 0.
__device__ __forceinline__ uint32_t fe_sub_raw(fe_t *r, const fe_t *a, const fe_t *b) {
    uint64_t borrow = 0;
    #pragma unroll
    for (int i = 0; i < 8; i++) {
        uint64_t t = (uint64_t)a->d[i] - b->d[i] - borrow;
        r->d[i] = (uint32_t)t;
        borrow = (t >> 32) & 1;   // wrapped => borrow out of this limb
    }
    return (uint32_t)borrow;
}
// One conditional subtraction of p; correct only when r < 2*p on entry.
__device__ __forceinline__ void fe_reduce_once(fe_t *r) {
    fe_t p_val, diff;
    #pragma unroll
    for (int i = 0; i < 8; i++) p_val.d[i] = P[i];
    if (fe_sub_raw(&diff, r, &p_val) == 0) {
        fe_copy(r, &diff);   // no borrow => r >= p => keep r - p
    }
}

// Reduce into [0, p) via two conditional passes (handles r < 3*p).
__device__ __forceinline__ void fe_reduce(fe_t *r) {
    fe_reduce_once(r);
    fe_reduce_once(r);
}
// r = (a + b) mod p, assuming a, b < p (so at most one subtraction of p).
__device__ __forceinline__ void fe_add(fe_t *r, const fe_t *a, const fe_t *b) {
    fe_t p_val;
    #pragma unroll
    for (int i = 0; i < 8; i++) p_val.d[i] = P[i];
    uint32_t overflow = fe_add_raw(r, a, b);
    if (overflow != 0 || fe_cmp(r, &p_val) >= 0) {
        fe_sub_raw(r, r, &p_val);
    }
}

// r = (a - b) mod p, assuming a, b < p.
__device__ __forceinline__ void fe_sub(fe_t *r, const fe_t *a, const fe_t *b) {
    if (fe_sub_raw(r, a, b)) {
        fe_t p_val;
        #pragma unroll
        for (int i = 0; i < 8; i++) p_val.d[i] = P[i];
        fe_add_raw(r, r, &p_val);   // undo the wrap by adding p back
    }
}
// r = -a mod p.
// Fix vs. original: for a == 0 the old code returned p itself, a
// non-canonical representation of zero that breaks later fe_is_zero /
// fe_cmp checks on the result.  Zero now negates to zero.
__device__ __forceinline__ void fe_neg(fe_t *r, const fe_t *a) {
    if (fe_is_zero(a)) {
        fe_zero(r);
        return;
    }
    fe_t p_val;
    #pragma unroll
    for (int i = 0; i < 8; i++) p_val.d[i] = P[i];
    fe_sub_raw(r, &p_val, a);   // a in (0, p) => p - a is already canonical
}
// r = a * b mod p using the secp256k1 special-form reduction.
// p = 2^256 - 2^32 - 977, so 2^256 ≡ 2^32 + 977 (mod p).  With
// a*b = H*2^256 + L:  r ≡ L + H*977 + H*2^32 (mod p).
//
// Fixes vs. original:
//  * the post-loop carry used `product[15] << 32`, crediting high limb 7
//    with weight 2^288 instead of 2^256 -- it must be added unshifted;
//  * the in-loop `product[i+7] << 32` term could marginally overflow the
//    64-bit accumulator; the 2^32 contribution is now deferred into the
//    carry (added after limb i is stored, landing at limb i+1) so every
//    partial sum stays far below 2^64;
//  * the hand-branched carry fold (`else if (i == 2 && c > 0xFFFFFFFF)`
//    ...) is replaced by a direct fold of c*(2^32 + 977).
__device__ void fe_mul(fe_t *r, const fe_t *a, const fe_t *b) {
    // Schoolbook 256x256 -> 512-bit product; limbs kept 32 bits wide.
    uint64_t product[16];
    #pragma unroll
    for (int i = 0; i < 16; i++) product[i] = 0;
    #pragma unroll
    for (int i = 0; i < 8; i++) {
        uint64_t carry = 0;
        #pragma unroll
        for (int j = 0; j < 8; j++) {
            uint64_t prod = (uint64_t)a->d[i] * b->d[j] + product[i+j] + carry;
            product[i+j] = prod & 0xFFFFFFFF;
            carry = prod >> 32;
        }
        product[i+8] = carry;
    }
    // First folding pass: r = L + H*977 + H*2^32.
    uint64_t c = 0;
    #pragma unroll
    for (int i = 0; i < 8; i++) {
        c += product[i];
        c += product[i + 8] * 977ULL;
        r->d[i] = (uint32_t)c;
        c >>= 32;
        c += product[i + 8];   // H*2^32: high limb i belongs at limb i+1
    }
    // c is the overflow beyond 2^256 (c < 2^33, including high[7]'s term);
    // fold c*2^256 ≡ c*(2^32 + 977) back in.
    while (c) {
        uint64_t k = c;
        uint64_t d = (uint64_t)r->d[0] + k * 977ULL;   // k*977 < 2^43: safe
        r->d[0] = (uint32_t)d; d >>= 32;
        d += (uint64_t)r->d[1] + (k & 0xFFFFFFFF);     // k*2^32, low part
        r->d[1] = (uint32_t)d; d >>= 32;
        d += (uint64_t)r->d[2] + (k >> 32);            // k*2^32, high part
        r->d[2] = (uint32_t)d; d >>= 32;
        for (int i = 3; i < 8 && d; i++) {
            d += r->d[i];
            r->d[i] = (uint32_t)d;
            d >>= 32;
        }
        c = d;
    }
    // r < 2^256 < 2p: the standard conditional subtractions finish the job.
    fe_reduce(r);
}
// r = a^2 mod p.  No dedicated squaring path yet; reuses fe_mul.
__device__ __forceinline__ void fe_sqr(fe_t *r, const fe_t *a) {
    fe_mul(r, a, a);
}

// r = 2*a mod p.
__device__ __forceinline__ void fe_dbl(fe_t *r, const fe_t *a) {
    fe_add(r, a, a);
}
// Modular inverse via Fermat's little theorem: r = a^(p-2) mod p,
// p-2 = 2^256 - 2^32 - 979 (binary: 223 ones, 0, 22 ones, 0000101101).
// Standard secp256k1 addition chain; xN below = a^(2^N - 1).
// For batches, Montgomery's trick (one inversion per batch) is preferable.
//
// Fix vs. original: the old code first built a^(2^256-1) and discarded it,
// then ran a square-and-multiply pass that fed the exponent bits in the
// wrong order, raising a to an entirely wrong exponent -- every inverse
// (and hence every Jacobian->affine conversion) was incorrect.
__device__ void fe_inv(fe_t *r, const fe_t *a) {
    fe_t x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t;
    fe_sqr(&x2, a);
    fe_mul(&x2, &x2, a);              // x2 = a^3   (2 one-bits)
    fe_sqr(&x3, &x2);
    fe_mul(&x3, &x3, a);              // x3 = a^7   (3 one-bits)
    fe_copy(&x6, &x3);
    for (int i = 0; i < 3; i++) fe_sqr(&x6, &x6);
    fe_mul(&x6, &x6, &x3);            // 6 one-bits
    fe_copy(&x9, &x6);
    for (int i = 0; i < 3; i++) fe_sqr(&x9, &x9);
    fe_mul(&x9, &x9, &x3);            // 9
    fe_copy(&x11, &x9);
    for (int i = 0; i < 2; i++) fe_sqr(&x11, &x11);
    fe_mul(&x11, &x11, &x2);          // 11
    fe_copy(&x22, &x11);
    for (int i = 0; i < 11; i++) fe_sqr(&x22, &x22);
    fe_mul(&x22, &x22, &x11);         // 22
    fe_copy(&x44, &x22);
    for (int i = 0; i < 22; i++) fe_sqr(&x44, &x44);
    fe_mul(&x44, &x44, &x22);         // 44
    fe_copy(&x88, &x44);
    for (int i = 0; i < 44; i++) fe_sqr(&x88, &x88);
    fe_mul(&x88, &x88, &x44);         // 88
    fe_copy(&x176, &x88);
    for (int i = 0; i < 88; i++) fe_sqr(&x176, &x176);
    fe_mul(&x176, &x176, &x88);       // 176
    fe_copy(&x220, &x176);
    for (int i = 0; i < 44; i++) fe_sqr(&x220, &x220);
    fe_mul(&x220, &x220, &x44);       // 220
    fe_copy(&x223, &x220);
    for (int i = 0; i < 3; i++) fe_sqr(&x223, &x223);
    fe_mul(&x223, &x223, &x3);        // 223 leading one-bits
    // Tail of the exponent: "0" + 22 ones + "0000101101".
    fe_copy(&t, &x223);
    for (int i = 0; i < 23; i++) fe_sqr(&t, &t);
    fe_mul(&t, &t, &x22);
    for (int i = 0; i < 5; i++) fe_sqr(&t, &t);
    fe_mul(&t, &t, a);
    for (int i = 0; i < 3; i++) fe_sqr(&t, &t);
    fe_mul(&t, &t, &x2);
    fe_sqr(&t, &t);
    fe_sqr(&t, &t);
    fe_mul(r, &t, a);
}
// ============================================================================ // POINT OPERATIONS (Jacobian coordinates) // ============================================================================
// Point at infinity, encoded as Z = 0.  Y is set to 1 by convention so the
// coordinate triple is never all-zero.
__device__ __forceinline__ void jac_set_inf(jac_point_t *p) {
    fe_zero(&p->x);
    fe_zero(&p->y);
    fe_zero(&p->z);
    p->y.d[0] = 1;
}

// Identity test: Z == 0.
__device__ __forceinline__ int jac_is_inf(const jac_point_t *p) {
    return fe_is_zero(&p->z);
}

// Lift affine (x, y) to Jacobian form (x, y, 1).
__device__ __forceinline__ void jac_set_affine(jac_point_t *p, const fe_t *x, const fe_t *y) {
    fe_copy(&p->x, x);
    fe_copy(&p->y, y);
    fe_zero(&p->z);
    p->z.d[0] = 1;
}

// Structure copy, coordinate by coordinate.
__device__ __forceinline__ void jac_copy(jac_point_t *r, const jac_point_t *p) {
    fe_copy(&r->x, &p->x);
    fe_copy(&r->y, &p->y);
    fe_copy(&r->z, &p->z);
}
// Point doubling: r = 2*p (Jacobian), "dbl-2001-b"-style formula.
// With a = 0 for secp256k1: 1M + 5S plus additions/doublings.
// Outputs are staged in temporaries and written last, so r may alias p
// (scalar_mult / scalar_mult_g call this with r == p).
// NOTE(review): unlike the other header's jpoint_double, this does not
// special-case Y == 0; such 2-torsion points should not arise for valid
// points on a prime-(odd-)order curve -- confirm inputs are valid points.
__device__ void jac_double(jac_point_t *r, const jac_point_t *p) {
    if (jac_is_inf(p)) {
        jac_set_inf(r);
        return;
    }
    fe_t s, m, x3, y3, z3, t1, t2;
    // S = 4*X*Y^2
    fe_sqr(&t1, &p->y);      // Y^2
    fe_mul(&s, &p->x, &t1);  // X*Y^2
    fe_dbl(&s, &s);          // 2*X*Y^2
    fe_dbl(&s, &s);          // 4*X*Y^2
    // M = 3*X^2 + a*Z^4 (a=0 for secp256k1)
    fe_sqr(&m, &p->x);       // X^2
    fe_dbl(&t2, &m);         // 2*X^2
    fe_add(&m, &m, &t2);     // 3*X^2
    // X3 = M^2 - 2*S
    fe_sqr(&x3, &m);         // M^2
    fe_dbl(&t2, &s);         // 2*S
    fe_sub(&x3, &x3, &t2);   // M^2 - 2*S
    // Y3 = M*(S - X3) - 8*Y^4
    fe_sub(&t2, &s, &x3);    // S - X3
    fe_mul(&y3, &m, &t2);    // M*(S - X3)
    fe_sqr(&t2, &t1);        // Y^4
    fe_dbl(&t2, &t2);        // 2*Y^4
    fe_dbl(&t2, &t2);        // 4*Y^4
    fe_dbl(&t2, &t2);        // 8*Y^4
    fe_sub(&y3, &y3, &t2);   // M*(S-X3) - 8*Y^4
    // Z3 = 2*Y*Z
    fe_mul(&z3, &p->y, &p->z); // Y*Z
    fe_dbl(&z3, &z3);          // 2*Y*Z
    fe_copy(&r->x, &x3);
    fe_copy(&r->y, &y3);
    fe_copy(&r->z, &z3);
}
// Full Jacobian addition: r = p + q, "add-2007-bl" formula (11M + 5S).
// Handles either operand at infinity, p == q (falls back to jac_double)
// and p == -q (returns infinity).  Outputs staged in x3/y3/z3 and copied
// last, so r may alias p or q.
__device__ void jac_add(jac_point_t *r, const jac_point_t *p, const jac_point_t *q) {
    if (jac_is_inf(p)) { jac_copy(r, q); return; }
    if (jac_is_inf(q)) { jac_copy(r, p); return; }
    fe_t z1z1, z2z2, u1, u2, s1, s2, h, i, j, rr, v;
    fe_t x3, y3, z3, t1, t2;   // t2 is declared but unused in this routine
    fe_sqr(&z1z1, &p->z);      // Z1^2
    fe_sqr(&z2z2, &q->z);      // Z2^2
    fe_mul(&u1, &p->x, &z2z2); // U1 = X1*Z2^2
    fe_mul(&u2, &q->x, &z1z1); // U2 = X2*Z1^2
    fe_mul(&t1, &q->z, &z2z2); // Z2^3
    fe_mul(&s1, &p->y, &t1);   // S1 = Y1*Z2^3
    fe_mul(&t1, &p->z, &z1z1); // Z1^3
    fe_mul(&s2, &q->y, &t1);   // S2 = Y2*Z1^3
    fe_sub(&h, &u2, &u1);      // H = U2 - U1
    // H == 0 means equal X coordinates: same point or additive inverses.
    if (fe_is_zero(&h)) {
        fe_sub(&t1, &s2, &s1);
        if (fe_is_zero(&t1)) {
            jac_double(r, p);  // Same point
            return;
        } else {
            jac_set_inf(r);    // Inverse points: p + (-p) = O
            return;
        }
    }
    fe_dbl(&i, &h);            // 2*H
    fe_sqr(&i, &i);            // I = (2*H)^2
    fe_mul(&j, &h, &i);        // J = H*I
    fe_sub(&rr, &s2, &s1);     // S2 - S1
    fe_dbl(&rr, &rr);          // r = 2*(S2 - S1)
    fe_mul(&v, &u1, &i);       // V = U1*I
    // X3 = r^2 - J - 2*V
    fe_sqr(&x3, &rr);          // r^2
    fe_sub(&x3, &x3, &j);      // r^2 - J
    fe_dbl(&t1, &v);           // 2*V
    fe_sub(&x3, &x3, &t1);     // r^2 - J - 2*V
    // Y3 = r*(V - X3) - 2*S1*J
    fe_sub(&t1, &v, &x3);      // V - X3
    fe_mul(&y3, &rr, &t1);     // r*(V - X3)
    fe_mul(&t1, &s1, &j);      // S1*J
    fe_dbl(&t1, &t1);          // 2*S1*J
    fe_sub(&y3, &y3, &t1);     // r*(V-X3) - 2*S1*J
    // Z3 = ((Z1+Z2)^2 - Z1^2 - Z2^2)*H   (= 2*Z1*Z2*H using one squaring)
    fe_add(&t1, &p->z, &q->z); // Z1+Z2
    fe_sqr(&t1, &t1);          // (Z1+Z2)^2
    fe_sub(&t1, &t1, &z1z1);   // (Z1+Z2)^2 - Z1^2
    fe_sub(&t1, &t1, &z2z2);   // (Z1+Z2)^2 - Z1^2 - Z2^2
    fe_mul(&z3, &t1, &h);      // Z3
    fe_copy(&r->x, &x3);
    fe_copy(&r->y, &y3);
    fe_copy(&r->z, &z3);
}
// Mixed addition: r = p (Jacobian) + q given in affine form (implicit
// Z2 = 1), which saves work: 7M + 4S.  Outputs staged in x3/y3/z3 and
// copied last, so r may alias p (the scalar-mult loops rely on this).
__device__ void jac_add_affine(jac_point_t *r, const jac_point_t *p, const fe_t *qx, const fe_t *qy) {
    // O + q = q: lift the affine point into r with Z = 1.
    if (jac_is_inf(p)) {
        fe_copy(&r->x, qx);
        fe_copy(&r->y, qy);
        fe_zero(&r->z);
        r->z.d[0] = 1;
        return;
    }
    fe_t z1z1, u2, s2, h, hh, i, j, rr, v;
    fe_t x3, y3, z3, t1;
    fe_sqr(&z1z1, &p->z);      // Z1^2
    // U1 = X1 (since Z2=1)
    fe_mul(&u2, qx, &z1z1);    // U2 = X2*Z1^2
    // S1 = Y1 (since Z2=1)
    fe_mul(&t1, &p->z, &z1z1); // Z1^3
    fe_mul(&s2, qy, &t1);      // S2 = Y2*Z1^3
    fe_sub(&h, &u2, &p->x);    // H = U2 - X1
    // Equal X coordinates: same point (double) or inverses (infinity).
    if (fe_is_zero(&h)) {
        fe_sub(&t1, &s2, &p->y);
        if (fe_is_zero(&t1)) {
            jac_double(r, p);
            return;
        } else {
            jac_set_inf(r);
            return;
        }
    }
    fe_sqr(&hh, &h);           // H^2
    fe_dbl(&i, &hh);           // 2*H^2
    fe_dbl(&i, &i);            // I = 4*H^2
    fe_mul(&j, &h, &i);        // J = H*I
    fe_sub(&rr, &s2, &p->y);   // S2 - Y1
    fe_dbl(&rr, &rr);          // r = 2*(S2 - Y1)
    fe_mul(&v, &p->x, &i);     // V = X1*I
    // X3 = r^2 - J - 2*V
    fe_sqr(&x3, &rr);
    fe_sub(&x3, &x3, &j);
    fe_dbl(&t1, &v);
    fe_sub(&x3, &x3, &t1);
    // Y3 = r*(V - X3) - 2*Y1*J
    fe_sub(&t1, &v, &x3);
    fe_mul(&y3, &rr, &t1);
    fe_mul(&t1, &p->y, &j);
    fe_dbl(&t1, &t1);
    fe_sub(&y3, &y3, &t1);
    // Z3 = 2*Z1*H
    fe_mul(&z3, &p->z, &h);
    fe_dbl(&z3, &z3);
    fe_copy(&r->x, &x3);
    fe_copy(&r->y, &y3);
    fe_copy(&r->z, &z3);
}
// Jacobian -> affine: (X, Y, Z) -> (X/Z^2, Y/Z^3), costing one fe_inv.
// The point at infinity maps to (0, 0) as a sentinel.
__device__ void jac_to_affine(fe_t *rx, fe_t *ry, const jac_point_t *p) {
    if (jac_is_inf(p)) {
        fe_zero(rx);
        fe_zero(ry);
        return;
    }
    fe_t zi, zi2, zi3;
    fe_inv(&zi, &p->z);        // 1/Z
    fe_sqr(&zi2, &zi);         // 1/Z^2
    fe_mul(&zi3, &zi2, &zi);   // 1/Z^3
    fe_mul(rx, &p->x, &zi2);
    fe_mul(ry, &p->y, &zi3);
}
// ============================================================================ // SCALAR MULTIPLICATION WITH PRECOMPUTED TABLE // ============================================================================
// One-shot kernel (launch as <<<1,1>>>) that fills G_TABLE_X/Y with the
// affine coordinates of 1*G .. 16*G, i.e. G_TABLE[i] = (i+1)*G.
__global__ void init_g_table_kernel() {
    if (blockIdx.x != 0 || threadIdx.x != 0) return;   // single-thread init
    // Load G from constant memory.
    fe_t gx, gy;
    #pragma unroll
    for (int w = 0; w < 8; w++) {
        gx.d[w] = G_X[w];
        gy.d[w] = G_Y[w];
    }
    // Entry 0 is G itself.
    fe_copy(&G_TABLE_X[0], &gx);
    fe_copy(&G_TABLE_Y[0], &gy);
    // acc walks 2*G, 3*G, ..., 16*G by repeatedly adding G.
    jac_point_t acc;
    jac_set_affine(&acc, &gx, &gy);
    for (int i = 1; i < 16; i++) {
        jac_point_t sum;
        jac_add_affine(&sum, &acc, &gx, &gy);   // handles the 2*G doubling case
        jac_copy(&acc, &sum);
        // Normalize and store the affine coordinates.
        fe_t ax, ay;
        jac_to_affine(&ax, &ay, &acc);
        fe_copy(&G_TABLE_X[i], &ax);
        fe_copy(&G_TABLE_Y[i], &ay);
    }
}
// r = k*G using a fixed 4-bit window over the precomputed table
// (G_TABLE[j] = (j+1)*G).  k is 32 bytes big-endian.  Requires
// init_g_table_kernel to have completed first.
__device__ void scalar_mult_g(fe_t *rx, fe_t *ry, const uint8_t k[32]) {
    jac_point_t acc;
    jac_set_inf(&acc);
    // 64 nibbles, most significant first; 4 doublings per window.
    for (int n = 63; n >= 0; n--) {
        for (int d = 0; d < 4; d++) jac_double(&acc, &acc);
        int byte_idx = 31 - (n / 2);                      // big-endian bytes
        int w = (n & 1) ? (k[byte_idx] >> 4) : (k[byte_idx] & 0x0F);
        if (w) {
            jac_add_affine(&acc, &acc, &G_TABLE_X[w - 1], &G_TABLE_Y[w - 1]);
        }
    }
    jac_to_affine(rx, ry, &acc);
}
// r = k*P for an arbitrary affine point (px, py); k is 32 bytes big-endian.
// Plain MSB-first double-and-add.
// Fix vs. original: removed the dead local `p` (set via jac_set_affine and
// never read -- the loop adds px/py directly through jac_add_affine).
__device__ void scalar_mult(fe_t *rx, fe_t *ry, const uint8_t k[32], const fe_t *px, const fe_t *py) {
    jac_point_t acc;
    jac_set_inf(&acc);
    for (int bit = 255; bit >= 0; bit--) {
        jac_double(&acc, &acc);
        // Bit `bit` lives in byte 31 - bit/8 of the big-endian scalar.
        int byte_idx = 31 - (bit >> 3);
        if ((k[byte_idx] >> (bit & 7)) & 1) {
            jac_add_affine(&acc, &acc, px, py);
        }
    }
    jac_to_affine(rx, ry, &acc);
}
// ============================================================================ // BATCH OPERATIONS (for processing multiple keys efficiently) // ============================================================================
// Compute public keys for `count` packed 32-byte private keys, one
// scalar_mult_g call each.  (No cross-key sharing yet; a batched ladder
// could amortize the doublings across keys.)
__device__ void batch_scalar_mult_g(
    fe_t *rx_arr, fe_t *ry_arr,   // Output: array of public keys
    const uint8_t *k_arr,         // Input: array of 32-byte private keys
    int count                     // Number of keys
) {
    for (int idx = 0; idx < count; idx++) {
        const uint8_t *key = k_arr + idx * 32;
        scalar_mult_g(&rx_arr[idx], &ry_arr[idx], key);
    }
}
// ============================================================================ // HIGH-LEVEL API // ============================================================================
// pubx = big-endian X coordinate of privkey*G (the Y coordinate is
// computed and discarded).
__device__ void privkey_to_pubkey_x(uint8_t pubx[32], const uint8_t privkey[32]) {
    fe_t x, y;
    scalar_mult_g(&x, &y, privkey);
    fe_get_be(pubx, &x);
}
// Returns 1 iff privkey*G has X coordinate equal to target_x (32 bytes,
// big-endian).  The compare exits early, so it is not constant-time --
// acceptable for a search loop, not for handling secrets.
__device__ int check_pubkey_x(const uint8_t privkey[32], const uint8_t target_x[32]) {
    uint8_t x_bytes[32];
    privkey_to_pubkey_x(x_bytes, privkey);
    int match = 1;
    #pragma unroll
    for (int i = 0; i < 32; i++) {
        if (x_bytes[i] != target_x[i]) { match = 0; break; }
    }
    return match;
}
// ============================================================================ // HOST INITIALIZATION // ============================================================================
// Host-side one-time initialization: fills the device-resident generator
// table.  Must complete before any kernel that calls scalar_mult_g.
// Fix vs. original: kernel-launch and execution errors were silently
// dropped; they are now surfaced via cudaGetLastError (launch-config
// errors) and the cudaDeviceSynchronize return value (async execution
// errors) and reported on stderr.
void secp256k1_init_tables() {
    init_g_table_kernel<<<1, 1>>>();
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "secp256k1_init_tables: CUDA error: %s\n",
                cudaGetErrorString(err));
    }
}
|