Performance gain with simple sha256d optimization

Has this optimization been considered before?

Basically, the nonce only begins to affect the hash after the first 3 rounds of the second 64-byte chunk of the first hash.
This happens because the nonce is the integer at index 19 of the data array (counting from 0): the second chunk starts at index 16, so rounds 1-3 consume the integers at indices 16-18 and the nonce is first used in round 4.

Given block 255123, these are the states of the first 10 rounds with nonce and nonce+1:

Midstate:
1354ff89 23d83d8b 5efb84ec 9f10b472 2d2dbe11 0308f72b 01a4ae80 03d6c229

Nonce = 79f90238
round 1: ffeb12f9 1354ff89 23d83d8b 5efb84ec 9fbf88da 2d2dbe11 0308f72b 01a4ae80
round 2: 066693d3 ffeb12f9 1354ff89 23d83d8b ed0afd1d 9fbf88da 2d2dbe11 0308f72b
round 3: cb78323a 066693d3 ffeb12f9 1354ff89 129b0013 ed0afd1d 9fbf88da 2d2dbe11
round 4: b4de2b0d cb78323a 066693d3 ffeb12f9 35085b39 129b0013 ed0afd1d 9fbf88da
round 5: 0d745a43 b4de2b0d cb78323a 066693d3 38eb1f40 35085b39 129b0013 ed0afd1d
round 6: 6427e635 0d745a43 b4de2b0d cb78323a 1ce5cef6 38eb1f40 35085b39 129b0013
round 7: 6ecb89b3 6427e635 0d745a43 b4de2b0d 1e9344c6 1ce5cef6 38eb1f40 35085b39
round 8: 0aa13a06 6ecb89b3 6427e635 0d745a43 9b084155 1e9344c6 1ce5cef6 38eb1f40
round 9: 6dec3157 0aa13a06 6ecb89b3 6427e635 364cddc1 9b084155 1e9344c6 1ce5cef6
round 10: 3483fd4f 6dec3157 0aa13a06 6ecb89b3 48bd6bea 364cddc1 9b084155 1e9344c6

Nonce = 79f90239
round 1: ffeb12f9 1354ff89 23d83d8b 5efb84ec 9fbf88da 2d2dbe11 0308f72b 01a4ae80
round 2: 066693d3 ffeb12f9 1354ff89 23d83d8b ed0afd1d 9fbf88da 2d2dbe11 0308f72b
round 3: cb78323a 066693d3 ffeb12f9 1354ff89 129b0013 ed0afd1d 9fbf88da 2d2dbe11
round 4: b4de2b0e cb78323a 066693d3 ffeb12f9 35085b3a 129b0013 ed0afd1d 9fbf88da
round 5: 514c56c4 b4de2b0e cb78323a 066693d3 3ccb1fc2 35085b3a 129b0013 ed0afd1d
round 6: 76d63ec7 514c56c4 b4de2b0e cb78323a 18b611f6 3ccb1fc2 35085b3a 129b0013
round 7: 886177ab 76d63ec7 514c56c4 b4de2b0e 6b864644 18b611f6 3ccb1fc2 35085b3a
round 8: 41d34740 886177ab 76d63ec7 514c56c4 cbd24ac7 6b864644 18b611f6 3ccb1fc2
round 9: c7abc465 41d34740 886177ab 76d63ec7 70b8c519 cbd24ac7 6b864644 18b611f6
round 10: d116caaa c7abc465 41d34740 886177ab 883466b7 70b8c519 cbd24ac7 6b864644

As you can see, the first 3 rounds are exactly the same. Precalculating them increases the total hashing power by 1.2%, which is not much, but it's measurable. You can also tell that a couple of further rounds can be partially precalculated, because many of the integers in rounds 4 and 5 still don't change; I believe at least a 2% performance gain can be achieved with that in mind.

For an example implementation, I'm using a slightly modified version of the code posted here:
https://bitcointalk.org/index.php?topic=286532.0

Any feedback is welcome.

Here's the final code (still no SSE, embarrassingly):

sha256_cpu.c

Code:

#include "sha256_cpu.h"

// Timer setup
#include <windows.h>

/* Pair of raw counter readings that delimit one timed interval.
 * main() fills both fields with QueryPerformanceCounter(). */
typedef struct {
    LARGE_INTEGER start;   /* reading taken when the interval begins */
    LARGE_INTEGER stop;    /* reading taken when the interval ends */
} stopWatch;

/* Timer helper prototypes. Only LIToSecs() and getElapsedTime() are defined
 * in the visible part of this file; startTimer()/stopTimer() are declared
 * here but not defined or called in the code shown. */
void startTimer(stopWatch *timer);
void stopTimer(stopWatch *timer);
double LIToSecs(LARGE_INTEGER *L);
double getElapsedTime(stopWatch *timer);

/*
 * Convert a counter value (in ticks) to seconds.
 *
 * L: points to the tick count to convert.
 * Returns L->QuadPart divided by the frequency reported by
 * QueryPerformanceFrequency(), as a double.
 */
double LIToSecs(LARGE_INTEGER *L) {
    LARGE_INTEGER ticks_per_second;

    QueryPerformanceFrequency(&ticks_per_second);

    const double ticks = (double)L->QuadPart;
    const double rate = (double)ticks_per_second.QuadPart;
    return ticks / rate;
}
/*
 * Seconds elapsed between the two readings stored in a stopwatch.
 *
 * timer: stopwatch whose start and stop fields have both been filled in.
 * Returns (stop - start) converted to seconds via LIToSecs().
 */
double getElapsedTime(stopWatch *timer) {
    LARGE_INTEGER delta;

    delta.QuadPart = timer->stop.QuadPart - timer->start.QuadPart;

    return LIToSecs(&delta);
}

// Global stopwatch: main() stores the counter readings taken just before and
// just after the hashing loop in s.start and s.stop.
stopWatch s;
// End timer setup

int main() {

    // Big-endian 255123 block string
    uchar text[]="02000000"
    "61b9273640571357bdc428788b36ae9827349e9d40627d2d2d00000000000000"
    "b1eb3bce1dde137625382e9445e707e6ec3f9b46948d2a7d8d88da42a877104d"
    "5f1c2152"
    "57524119"
    "79f90238"
    "800000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000280"
    ;

    // Little-endian version:

    // 000000023627b961571357407828c4bd98ae368b9d9e34272d7d62400000002d00000000ce3bebb1
    // 7613de1d942e3825e607e745469b3fec7d2a8d9442da888d4d1077a852211c5f194152573802f979
    // 00000080000000000000000000000000000000000000000000000000000000000000000000000000
    // 0000000080020000

    /** STRING PRE-PROCESS **/

    uint i;
    uchar *pos = text;
    static uint text1[32];

    // convert chars to hex values. Evil dump into text1
    for(i = 0; i < 128; i++) {
        *((uchar*)text1+i) = htochar(pos);
        pos+=2;
    }

    // Switch endianness
    // You can remove the following loop if you are already
    // working with a little-endian string
    for(i = 0; i < 32; i++) {
        text1[i] = byte_swap4(text1[i]);
    }

    // Pre-process finished.
    // String is loaded into text1 as 32 binary u-integers.

    /** MIDSTATE CALCULATION **/

    uint midstate[8], midstate2[8];
    sha256_MS(text1, midstate, midstate2);

    static uint res;
    uint *ptroff = &text1[16];

    text1[19] = 0x79f90238U;
    printf("Starting nonce: %08x\n", text1[19]);

    QueryPerformanceCounter(&s.start); // Windows API to build a timer...

    /** MAIN LOOP **/

    const uint endnonce = 0x79f90238U + 1000000U; // 860 Kh/s: hash rate on my Core 2 Duo 2.333 GHz

    for (text1[19] = 0x79f90238U; text1[19] < endnonce; text1[19]++) { // Increment nonce

        res = sha256d(midstate, midstate2, ptroff); // The kraken. Release it.
        if (res == 0) printf("Share found at nonce: %08x SUCCESS\n", text1[19]);
    }

    QueryPerformanceCounter(&s.stop);

    printf("Ending nonce: %08x\n\n", --text1[19]);
    printf("Total time taken: %f secs\n", getElapsedTime(&s));
    printf("Estimated hashrate: %f Mh/s\n", 1.0/getElapsedTime(&s) );

    // Btw real hash is:
    // e17e38f81b4af47ab2ff29fe554c8c767c03444aee9119381f00000000000000
    // 000000000000001f381991ee4a44037c768c4c55fe29ffb27af44a1bf8387ee1

    printf("You can now safely terminate the program.\n");
    getchar();
    return 0;
}

sha256_cpu.h

Code:

#include <stdio.h>

// Shorthand names for the unsigned types used throughout this file.
// These are textual macros, not typedefs.
#define uchar unsigned char
#define uint unsigned int

/* Value (0-15) of one ASCII hex digit, accepting both letter cases.
 * Any character that is not a hex digit yields 0. */
static unsigned char htochar_nibble(unsigned char ch) {
    if (ch >= '0' && ch <= '9')
        return (unsigned char)(ch - '0');
    if (ch >= 'a' && ch <= 'f')
        return (unsigned char)(ch - 'a' + 10);
    if (ch >= 'A' && ch <= 'F')
        return (unsigned char)(ch - 'A' + 10);
    return 0;
}

/*
 * Decode the two hex characters at ptr[0] and ptr[1] into one byte.
 *
 * ptr: points to at least two readable characters; ptr[0] is the high nibble.
 * Returns the decoded byte. Upper- and lower-case digits are both accepted
 * (the previous version mis-decoded 'A'-'F').
 *
 * The types are spelled out as `unsigned char`, which is exactly what the
 * file's `uchar` macro expands to, so the signature is unchanged.
 */
unsigned char htochar(unsigned char *ptr) {
    const unsigned char high = htochar_nibble(ptr[0]);
    const unsigned char low = htochar_nibble(ptr[1]);

    return (unsigned char)((high << 4) | low);
}

// Reverse the byte order of a 32-bit value (0xAABBCCDD -> 0xDDCCBBAA).
// The argument is parenthesized so expressions such as `a | b` are swapped as
// a whole, and converted to unsigned int first so that shifting a byte into
// the top position can never overflow a signed int.
// Note: `val` is evaluated four times, so avoid arguments with side effects.
#define byte_swap4(val)                                  \
         ((((unsigned int)(val) & 0xffU) << 24) |        \
          (((unsigned int)(val) & 0xff00U) << 8) |       \
          (((unsigned int)(val) & 0xff0000U) >> 8) |     \
          (((unsigned int)(val) & 0xff000000U) >> 24))

// 32-bit rotations. `a` must be a 32-bit unsigned value and `b` must be in
// the range 1..31 (a count of 0 or 32 would shift by the full width).
// Every macro argument is parenthesized so that compound expressions
// (e.g. `x | y`, `n + 1`) are treated as a single operand.
#define ROTLEFT(a,b) (((a) << (b)) | ((a) >> (32-(b))))
#define ROTRIGHT(a,b) (((a) >> (b)) | ((a) << (32-(b))))

// Bitwise helper functions used by the compression rounds (CH, MAJ, EP0, EP1)
// and by the message-schedule expansion (SIG0, SIG1) below.
#define CH(x,y,z) (((x) & (y)) ^ (~(x) & (z)))
#define MAJ(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
#define EP0(x) (ROTRIGHT(x,2) ^ ROTRIGHT(x,13) ^ ROTRIGHT(x,22))
#define EP1(x) (ROTRIGHT(x,6) ^ ROTRIGHT(x,11) ^ ROTRIGHT(x,25))
#define SIG0(x) (ROTRIGHT(x,7) ^ ROTRIGHT(x,18) ^ ((x) >> 3))
#define SIG1(x) (ROTRIGHT(x,17) ^ ROTRIGHT(x,19) ^ ((x) >> 10))

// The 64 SHA-256 round constants; round i of each compression loop below adds k[i].
static const uint k[64] = {
   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
};


/*
 * Precompute everything in the first SHA-256 pass that does not depend on the
 * nonce.
 *
 * data:      input words; data[0..15] form the first 16-word chunk and
 *            data[16..18] are the first three words of the second chunk.
 * midstate:  out, 8 words - hash state after compressing data[0..15].
 * midstate2: out, 8 words - working state obtained by starting from midstate
 *            and running the first 3 rounds of the second chunk.
 */
void sha256_MS(uint data[], uint midstate[], uint midstate2[]) {
    // Initial hash values; also added back to the working state after the 64 rounds.
    const uint iv[8] = {
        0x6a09e667U, 0xbb67ae85U, 0x3c6ef372U, 0xa54ff53aU,
        0x510e527fU, 0x9b05688cU, 0x1f83d9abU, 0x5be0cd19U
    };
    uint w[64];   // message schedule for the first chunk
    uint s[8];    // working state: s[0]..s[7] play the roles of a..h
    uint t1, t2, r, j;

    for (j = 0; j < 8; j++)
        s[j] = iv[j];

    // Message schedule: the first 16 words are copied, the rest are expanded.
    for (r = 0; r < 64; r++) {
        if (r < 16)
            w[r] = data[r];
        else
            w[r] = SIG1(w[r-2]) + w[r-7] + SIG0(w[r-15]) + w[r-16];
    }

    // 64 compression rounds over the first chunk.
    for (r = 0; r < 64; r++) {
        t1 = s[7] + EP1(s[4]) + CH(s[4],s[5],s[6]) + k[r] + w[r];
        t2 = EP0(s[0]) + MAJ(s[0],s[1],s[2]);
        for (j = 7; j > 0; j--)   // shift the state down one slot (h=g, ..., b=a)
            s[j] = s[j-1];
        s[4] += t1;               // s[4] now holds the old d, so this is e = d + t1
        s[0] = t1 + t2;
    }

    for (j = 0; j < 8; j++)
        midstate[j] = iv[j] + s[j];

    // OPTIMIZATION: redundant rounds precalculation

    for (j = 0; j < 8; j++)
        s[j] = midstate[j];

    // Rounds 1-3 of the second chunk only read data[16..18].
    for (r = 0; r < 3; r++) {
        t1 = s[7] + EP1(s[4]) + CH(s[4],s[5],s[6]) + k[r] + data[r+16];
        t2 = EP0(s[0]) + MAJ(s[0],s[1],s[2]);
        for (j = 7; j > 0; j--)
            s[j] = s[j-1];
        s[4] += t1;
        s[0] = t1 + t2;
    }

    for (j = 0; j < 8; j++)
        midstate2[j] = s[j];

}


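/* Reference values (not compiled): the working state after round 3 for block
 * 255123, taken from the "round 3" row in the post above. These appear to be
 * the values sha256_MS() should leave in midstate2 for that block.
a = 0xcb78323aU;
b = 0x066693d3U;
c = 0xffeb12f9U;
d = 0x1354ff89U;
e = 0x129b0013U;
f = 0xed0afd1dU;
g = 0x9fbf88daU;
h = 0x2d2dbe11U;*/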


/*
 * Double SHA-256 of the block header for one nonce, using the state
 * precomputed by sha256_MS().
 *
 * midstate:  8 words - hash state after the first chunk (added back at the
 *            end of hash one).
 * midstate2: 8 words - working state after rounds 1-3 of the second chunk.
 * data:      16 words - the second chunk (text1[16..31] in main()).
 *
 * Returns only the last 32-bit word of the second hash (0x5be0cd19 + h); the
 * other seven words are never computed. A return value of 0 is what main()
 * reports as a share.
 */
uint sha256d(uint midstate[], uint midstate2[], uint data[]) {
    uint a,b,c,d,e,f,g,h,i,t1,t2,m[64];
    uint ee,eee,eeee;

    // Hash One

    // Init by preloading rounds 1-3.
    a = midstate2[0];
    b = midstate2[1];
    c = midstate2[2];
    d = midstate2[3];
    e = midstate2[4];
    f = midstate2[5];
    g = midstate2[6];
    h = midstate2[7];

    for (i = 0;  i < 16; i++) m[i] = data[i]; // data points to text1[16] in main().

    for (; i < 64; i++)
        m[i] = SIG1(m[i-2]) + m[i-7] + SIG0(m[i-15]) + m[i-16];

    for (i = 3; i < 64; i++) { // Late start. Rounds 1-3 are already calculated.
        t1 = h + EP1(e) + CH(e,f,g) + k[i] + m[i];
        t2 = EP0(a) + MAJ(a,b,c);
        h = g;
        g = f;
        f = e;
        e = d + t1;
        d = c;
        c = b;
        b = a;
        a = t1 + t2;

    }

    // The 8-word result of hash one becomes the first 8 message words of hash two.
    m[0] = midstate[0] + a;
    m[1] = midstate[1] + b;
    m[2] = midstate[2] + c;
    m[3] = midstate[3] + d;
    m[4] = midstate[4] + e;
    m[5] = midstate[5] + f;
    m[6] = midstate[6] + g;
    m[7] = midstate[7] + h;

    // Hash Two

    a = 0x6a09e667U;
    b = 0xbb67ae85U;
    c = 0x3c6ef372U;
    d = 0xa54ff53aU;
    e = 0x510e527fU;
    f = 0x9b05688cU;
    g = 0x1f83d9abU;
    h = 0x5be0cd19U;

    // Add padding string w/ length info
    m[8]  = 0x80000000U;
    m[9]  = 0x00U;
    m[10] = 0x00U;
    m[11] = 0x00U;
    m[12] = 0x00U;
    m[13] = 0x00U;
    m[14] = 0x00U;
    m[15] = 0x100U;

    // Only m[0..60] are read below (rounds 0-56 in the loop, then m[57..60] in
    // the shortened tail), so the last three schedule words are not expanded.
    for (i = 16; i < 61; i++)
        m[i] = SIG1(m[i-2]) + m[i-7] + SIG0(m[i-15]) + m[i-16];

    for (i = 0; i < 57; i++) {
        t1 = h + EP1(e) + CH(e,f,g) + k[i] + m[i];
        t2 = EP0(a) + MAJ(a,b,c);
        h = g;
        g = f;
        f = e;
        e = d + t1;
        d = c;
        c = b;
        b = a;
        a = t1 + t2;
    }

    // OPTIMIZATION: Early 2nd hash termination
    //
    // The final h is the value e takes in round index 60, shifted down by the
    // last three rounds. So only the e-chain of rounds 57-60 is evaluated
    // (no t2 / a updates), and rounds 61-63 are skipped entirely.
    // eeee, eee, ee are the new e values of rounds 57, 58 and 59.

    eeee = d + h + EP1(e) + CH(e,f,g) + k[57] + m[57];
    eee = c + g + EP1(eeee) + CH(eeee,e,f) + k[58] + m[58];
    ee = b + f + EP1(eee) + CH(eee,eeee,e) + k[59] + m[59];
    h = a + e + EP1(ee) + CH(ee,eee,eeee) + k[60] + m[60];

    return 0x5be0cd19U + h; // Unfortunately, only the last 32 bits are correct... Well, it's still a diff 1 share :)
}