Has this optimization been considered before?
Basically, the nonce begins to affect the hash just after 3 rounds of the first hash.
This happens because the nonce is the integer at index 19 of the data array (counting from 0), and the second 64-byte chunk of the first hash starts at index 16, so it takes 3 rounds before the nonce is reached.
Given block 255123, these are the states of the first 10 rounds with nonce and nonce+1:
Midstate:
1354ff89 23d83d8b 5efb84ec 9f10b472 2d2dbe11 0308f72b 01a4ae80 03d6c229
Nonce = 79f90238
round 1: ffeb12f9 1354ff89 23d83d8b 5efb84ec 9fbf88da 2d2dbe11 0308f72b 01a4ae80
round 2: 066693d3 ffeb12f9 1354ff89 23d83d8b ed0afd1d 9fbf88da 2d2dbe11 0308f72b
round 3: cb78323a 066693d3 ffeb12f9 1354ff89 129b0013 ed0afd1d 9fbf88da 2d2dbe11
round 4: b4de2b0d cb78323a 066693d3 ffeb12f9 35085b39 129b0013 ed0afd1d 9fbf88da
round 5: 0d745a43 b4de2b0d cb78323a 066693d3 38eb1f40 35085b39 129b0013 ed0afd1d
round 6: 6427e635 0d745a43 b4de2b0d cb78323a 1ce5cef6 38eb1f40 35085b39 129b0013
round 7: 6ecb89b3 6427e635 0d745a43 b4de2b0d 1e9344c6 1ce5cef6 38eb1f40 35085b39
round 8: 0aa13a06 6ecb89b3 6427e635 0d745a43 9b084155 1e9344c6 1ce5cef6 38eb1f40
round 9: 6dec3157 0aa13a06 6ecb89b3 6427e635 364cddc1 9b084155 1e9344c6 1ce5cef6
round 10: 3483fd4f 6dec3157 0aa13a06 6ecb89b3 48bd6bea 364cddc1 9b084155 1e9344c6
Nonce = 79f90239
round 1: ffeb12f9 1354ff89 23d83d8b 5efb84ec 9fbf88da 2d2dbe11 0308f72b 01a4ae80
round 2: 066693d3 ffeb12f9 1354ff89 23d83d8b ed0afd1d 9fbf88da 2d2dbe11 0308f72b
round 3: cb78323a 066693d3 ffeb12f9 1354ff89 129b0013 ed0afd1d 9fbf88da 2d2dbe11
round 4: b4de2b0e cb78323a 066693d3 ffeb12f9 35085b3a 129b0013 ed0afd1d 9fbf88da
round 5: 514c56c4 b4de2b0e cb78323a 066693d3 3ccb1fc2 35085b3a 129b0013 ed0afd1d
round 6: 76d63ec7 514c56c4 b4de2b0e cb78323a 18b611f6 3ccb1fc2 35085b3a 129b0013
round 7: 886177ab 76d63ec7 514c56c4 b4de2b0e 6b864644 18b611f6 3ccb1fc2 35085b3a
round 8: 41d34740 886177ab 76d63ec7 514c56c4 cbd24ac7 6b864644 18b611f6 3ccb1fc2
round 9: c7abc465 41d34740 886177ab 76d63ec7 70b8c519 cbd24ac7 6b864644 18b611f6
round 10: d116caaa c7abc465 41d34740 886177ab 883466b7 70b8c519 cbd24ac7 6b864644
As you can see, the first 3 rounds look exactly the same. Precalculating them increases the total hashing power by 1.2%, which is not much, but it's measurable... Actually, you can tell that a couple of other rounds can be partially precalculated, because many integers in rounds 4 and 5 still don't change: I believe at least a 2% performance gain can be achieved with that in mind.
For an example implementation, I'm using a slightly modified version of the code posted here:
https://bitcointalk.org/index.php?topic=286532.0
Any feedback is welcome.
Here's the final code (still no SSE):
sha256_cpu.c
#include "sha256_cpu.h"
// Timer setup
#include <windows.h>
/* Pair of performance-counter readings that delimit one timed interval. */
typedef struct {
    LARGE_INTEGER start;  /* counter reading taken when timing begins */
    LARGE_INTEGER stop;   /* counter reading taken when timing ends */
} stopWatch;
// Timer prototypes.
// NOTE(review): startTimer and stopTimer are declared here, but no definition
// is visible in this file; main() calls QueryPerformanceCounter directly on
// the global stopWatch instead.
void startTimer( stopWatch *timer);
void stopTimer( stopWatch *timer);
// Converts a performance-counter tick count into seconds.
double LIToSecs( LARGE_INTEGER * L);
// Returns the seconds between timer->start and timer->stop.
double getElapsedTime( stopWatch *timer);
/*
 * Convert a tick count into seconds.
 *
 * L points to the tick count; it is divided by the counter frequency reported
 * by QueryPerformanceFrequency (ticks per second).
 */
double LIToSecs( LARGE_INTEGER * L) {
    LARGE_INTEGER ticks_per_second;

    QueryPerformanceFrequency(&ticks_per_second);
    return (double)L->QuadPart / (double)ticks_per_second.QuadPart;
}
/*
 * Seconds elapsed between the two readings stored in *timer.
 *
 * The tick difference (stop - start) is converted to seconds by LIToSecs.
 */
double getElapsedTime( stopWatch *timer) {
    LARGE_INTEGER elapsed_ticks;

    elapsed_ticks.QuadPart = timer->stop.QuadPart - timer->start.QuadPart;
    return LIToSecs(&elapsed_ticks);
}
// Global stopwatch: main() stores the performance-counter readings taken
// before and after the hashing loop in s.start and s.stop.
stopWatch s;
// End timer setup
/*
 * Benchmark driver: decodes the padded header of block 255123, precomputes the
 * midstates, then hashes 1,000,000 consecutive nonces starting at 0x79f90238
 * and reports every nonce for which sha256d() returns 0, plus timing figures.
 */
int main() {
    // Big-endian 255123 block string (80-byte header followed by SHA-256 padding)
    uchar text[]="02000000"
        "61b9273640571357bdc428788b36ae9827349e9d40627d2d2d00000000000000"
        "b1eb3bce1dde137625382e9445e707e6ec3f9b46948d2a7d8d88da42a877104d"
        "5f1c2152"
        "57524119"
        "79f90238"
        "800000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000280"
        ;
    // Little-endian version:
    // 000000023627b961571357407828c4bd98ae368b9d9e34272d7d62400000002d00000000ce3bebb1
    // 7613de1d942e3825e607e745469b3fec7d2a8d9442da888d4d1077a852211c5f194152573802f979
    // 00000080000000000000000000000000000000000000000000000000000000000000000000000000
    // 0000000080020000

    /** STRING PRE-PROCESS **/
    static uint text1[32];
    uchar *raw = (uchar *)text1;   // byte view of text1 for the hex decode
    uint n;

    // Decode 256 hex characters into the 128 bytes of text1.
    for (n = 0; n < 128; n++)
        raw[n] = htochar(text + 2 * n);

    // Switch endianness of every word.
    // This loop can be removed when the input is already a little-endian string.
    for (n = 0; n < 32; n++)
        text1[n] = byte_swap4(text1[n]);
    // text1 now holds the message as 32 binary unsigned integers.

    /** MIDSTATE CALCULATION **/
    uint midstate[8], midstate2[8];
    sha256_MS(text1, midstate, midstate2);

    const uint first_nonce = 0x79f90238U;
    const uint endnonce = first_nonce + 1000000U; // 860 Kh/s: hash rate on my Core 2 Duo 2.333 GHz
    uint *second_chunk = &text1[16];   // words 16..31; the nonce is text1[19]
    uint nonce;
    uint res;

    text1[19] = first_nonce;
    printf("Starting nonce: %08x\n", text1[19]);

    QueryPerformanceCounter(&s.start); // Windows API to build a timer...

    /** MAIN LOOP **/
    for (nonce = first_nonce; nonce < endnonce; nonce++) {
        text1[19] = nonce;             // sha256d reads the nonce through second_chunk
        res = sha256d(midstate, midstate2, second_chunk);
        if (res == 0)
            printf("Share found at nonce: %08x SUCCESS\n", nonce);
    }

    QueryPerformanceCounter(&s.stop);

    // text1[19] still holds the last nonce that was hashed.
    printf("Ending nonce: %08x\n\n", text1[19]);

    double elapsed = getElapsedTime(&s);
    printf("Total time taken: %f secs\n", elapsed);
    // One million hashes were computed, so Mh/s is simply 1 / seconds.
    printf("Estimated hashrate: %f Mh/s\n", 1.0/elapsed );

    // Btw real hash is:
    // e17e38f81b4af47ab2ff29fe554c8c767c03444aee9119381f00000000000000
    // 000000000000001f381991ee4a44037c768c4c55fe29ffb27af44a1bf8387ee1
    printf("You can now safely terminate the program.\n");
    getchar();
    return 0;
}
sha256_cpu.h
#include <stdio.h>
// Shorthand type names used throughout this file. These are preprocessor
// macros (plain text substitution), not typedefs.
#define uchar unsigned char
// The SHA-256 macros below rotate by (32-b), so the code assumes that
// unsigned int is 32 bits wide -- TODO confirm for the target platform.
#define uint unsigned int
/*
 * Value of one hexadecimal digit character.
 * Accepts '0'-'9', 'a'-'f' and 'A'-'F'; any other character counts as 0.
 */
static unsigned int htochar_nibble(unsigned char ch) {
    if (ch >= '0' && ch <= '9')
        return (unsigned int)(ch - '0');
    if (ch >= 'a' && ch <= 'f')
        return (unsigned int)(ch - 'a') + 10u;
    if (ch >= 'A' && ch <= 'F')
        return (unsigned int)(ch - 'A') + 10u;
    return 0u;
}

/*
 * Convert the two hexadecimal digit characters at ptr[0] and ptr[1] into one
 * byte (ptr[0] is the high nibble).
 *
 * The types are spelled out as `unsigned char`, which is exactly what the
 * `uchar` macro expands to, so the signature is the same as
 * `uchar htochar(uchar *ptr)`.
 *
 * Upper-case digits are accepted as well as lower-case ones. A character that
 * is not a hex digit contributes 0 instead of an arbitrary value, and ptr[1]
 * is not read when ptr[0] is the string terminator.
 */
unsigned char htochar(unsigned char *ptr) {
    unsigned int hi = htochar_nibble(ptr[0]);
    unsigned int lo = 0u;

    if (ptr[0] != '\0')
        lo = htochar_nibble(ptr[1]);

    return (unsigned char)((hi << 4) | lo);
}
/*
 * Reverse the byte order of a 32-bit value.
 *
 * The argument is parenthesised so that compound expressions such as
 * byte_swap4(a | b) expand correctly, and the masks are unsigned so that the
 * shifts are done in unsigned arithmetic even for an int argument.
 * Note: val is evaluated four times; do not pass expressions with side effects.
 */
#define byte_swap4(val) \
((((val) & 0xffu) << 24) | \
(((val) & 0xff00u) << 8) | \
(((val) & 0xff0000u) >> 8) | \
(((val) & 0xff000000u) >> 24))
/*
 * SHA-256 bit-manipulation helpers for 32-bit unsigned operands.
 *
 * Every macro argument is parenthesised so that compound expressions
 * (for example ROTRIGHT(x ^ y, n) or ROTRIGHT(x, n + 1)) expand correctly.
 * Arguments are evaluated more than once; do not pass expressions with side
 * effects. Rotation counts must be in the range 1..31.
 */
#define ROTLEFT(a,b) (((a) << (b)) | ((a) >> (32-(b))))
#define ROTRIGHT(a,b) (((a) >> (b)) | ((a) << (32-(b))))
/* Choose: each bit comes from y where x is 1, from z where x is 0. */
#define CH(x,y,z) (((x) & (y)) ^ (~(x) & (z)))
/* Majority: each bit is the majority vote of x, y and z. */
#define MAJ(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
/* Mixing functions used in the round computation (applied to a and e). */
#define EP0(x) (ROTRIGHT(x,2) ^ ROTRIGHT(x,13) ^ ROTRIGHT(x,22))
#define EP1(x) (ROTRIGHT(x,6) ^ ROTRIGHT(x,11) ^ ROTRIGHT(x,25))
/* Mixing functions used in the message-schedule expansion. */
#define SIG0(x) (ROTRIGHT(x,7) ^ ROTRIGHT(x,18) ^ ((x) >> 3))
#define SIG1(x) (ROTRIGHT(x,17) ^ ROTRIGHT(x,19) ^ ((x) >> 10))
// SHA-256 round constants: k[i] is added to the message word in round i.
// The early-termination code in sha256d hard-codes k[57]..k[60] as literals.
static const uint k[64] = {
0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
};
/*
 * Apply one SHA-256 round in place to the working variables st[0..7]
 * (a..h in that order). kw is the round constant plus the message word.
 */
static void sha256_MS_round(uint st[], uint kw) {
    uint t1 = st[7] + EP1(st[4]) + CH(st[4], st[5], st[6]) + kw;
    uint t2 = EP0(st[0]) + MAJ(st[0], st[1], st[2]);
    uint j;

    /* Shift b..h down by one position: h = g, g = f, ..., b = a. */
    for (j = 7; j > 0; j--)
        st[j] = st[j - 1];

    st[4] += t1;        /* e = d + t1 (st[4] holds the old d after the shift) */
    st[0] = t1 + t2;    /* a = t1 + t2 */
}

/*
 * Precompute the nonce-independent part of the first hash.
 *
 * data      - message words; data[0..15] form the first 64-byte chunk and
 *             data[16..18] are the first three words of the second chunk.
 * midstate  - receives the 8-word chaining value after the first chunk.
 * midstate2 - receives the working variables a..h after rounds 0..2 of the
 *             second chunk, started from midstate. Those rounds only read
 *             data[16..18].
 */
void sha256_MS(uint data[], uint midstate[], uint midstate2[]) {
    static const uint iv[8] = {
        0x6a09e667U, 0xbb67ae85U, 0x3c6ef372U, 0xa54ff53aU,
        0x510e527fU, 0x9b05688cU, 0x1f83d9abU, 0x5be0cd19U
    };
    uint w[64];     /* message schedule of the first chunk */
    uint st[8];     /* working variables a..h */
    uint n;

    /* Message schedule: 16 input words expanded to 64. */
    for (n = 0; n < 16; n++)
        w[n] = data[n];
    for (n = 16; n < 64; n++)
        w[n] = SIG1(w[n - 2]) + w[n - 7] + SIG0(w[n - 15]) + w[n - 16];

    /* Compress the first chunk starting from the initial hash value. */
    for (n = 0; n < 8; n++)
        st[n] = iv[n];
    for (n = 0; n < 64; n++)
        sha256_MS_round(st, k[n] + w[n]);

    for (n = 0; n < 8; n++)
        midstate[n] = iv[n] + st[n];

    // OPTIMIZATION: redundant rounds precalculation
    for (n = 0; n < 8; n++)
        st[n] = midstate[n];
    for (n = 0; n < 3; n++) // precalculate 3 rounds
        sha256_MS_round(st, k[n] + data[n + 16]);

    for (n = 0; n < 8; n++)
        midstate2[n] = st[n];
}
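/* Reference values: the state after the 3 precalculated rounds for block
   255123 (the "round 3" row of the trace at the top of this post). These are
   the values sha256_MS stores in midstate2 for that block:
a = 0xcb78323aU;
b = 0x066693d3U;
c = 0xffeb12f9U;
d = 0x1354ff89U;
e = 0x129b0013U;
f = 0xed0afd1dU;
g = 0x9fbf88daU;
h = 0x2d2dbe11U;*/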
uint sha256d(uint midstate[], uint midstate2[], uint data[]) {
uint a,b,c,d,e,f,g,h,i,t1,t2,m[64];
uint ee,eee,eeee;
// Hash One
// Init by preloading rounds 1-3.
a = midstate2[0];
b = midstate2[1];
c = midstate2[2];
d = midstate2[3];
e = midstate2[4];
f = midstate2[5];
g = midstate2[6];
h = midstate2[7];
for (i = 0; i < 16; i++) m[i] = data[i]; // data points to text1[16] in main().
for (; i < 64; i++)
m[i] = SIG1(m[i-2]) + m[i-7] + SIG0(m[i-15]) + m[i-16];
for (i = 3; i < 64; i++) { // Late start. Rounds 1-3 are already calculated.
t1 = h + EP1(e) + CH(e,f,g) + k[i] + m[i];
t2 = EP0(a) + MAJ(a,b,c);
h = g;
g = f;
f = e;
e = d + t1;
d = c;
c = b;
b = a;
a = t1 + t2;
}
m[0] = midstate[0] + a;
m[1] = midstate[1] + b;
m[2] = midstate[2] + c;
m[3] = midstate[3] + d;
m[4] = midstate[4] + e;
m[5] = midstate[5] + f;
m[6] = midstate[6] + g;
m[7] = midstate[7] + h;
// Hash Two
a = 0x6a09e667U;
b = 0xbb67ae85U;
c = 0x3c6ef372U;
d = 0xa54ff53aU;
e = 0x510e527fU;
f = 0x9b05688cU;
g = 0x1f83d9abU;
h = 0x5be0cd19U;
// Add padding string w/ length info
m[8] = 0x80000000U;
m[9] = 0x00U;
m[10] = 0x00U;
m[11] = 0x00U;
m[12] = 0x00U;
m[13] = 0x00U;
m[14] = 0x00U;
m[15] = 0x100U;
for (i = 16; i < 64; i++)
m[i] = SIG1(m[i-2]) + m[i-7] + SIG0(m[i-15]) + m[i-16];
for (i = 0; i < 57; i++) {
t1 = h + EP1(e) + CH(e,f,g) + k[i] + m[i];
t2 = EP0(a) + MAJ(a,b,c);
h = g;
g = f;
f = e;
e = d + t1;
d = c;
c = b;
b = a;
a = t1 + t2;
}
// OPTIMIZATION: Early 2nd hash termination
eeee = d + h + EP1(e) + CH(e,f,g) + 0x78a5636fU + m[57];
eee = c + g + EP1(eeee) + CH(eeee,e,f) + 0x84c87814U + m[58];
ee = b + f + EP1(eee) + CH(eee,eeee,e) + 0x8cc70208U + m[59];
h = a + e + EP1(ee) + CH(ee,eee,eeee) + 0x90befffaU + m[60];
return 0x5be0cd19U + h; // Unfortunately, only the last 32 bits are correct... Well, it's still a diff 1 share :)
}