In this post I'll be referring to the implementation laid out in https://github.com/jgarzik/cpuminer/blob/master/sha256_generic.c.

The sha256_transform is called twice per "Nonce" value. The first time on the second chunk of the header:

| Field | Size (Bytes) |
|---|---|
| Merkle root (last 4 Bytes) | 4 |
| Timestamp | 4 |
| "Bits" | 4 |
| Nonce | 4 |
| SHA-256 Defined bits to make the chunk 64 Bytes total | 48 |

And then again on the resulting hash:

| Field | Size (Bytes) |
|---|---|
| Final hash of chunks 1 and 2 | 32 |
| SHA-256 Defined bits to make the chunk 64 Bytes total | 32 |

Due to the nature of the input [it is always the specific Bitcoin header format] and the nature of the result we are looking for [whether the final hash is smaller than the target difficulty], we can cut out some of the calculations in sha256_transform in each of those two calls.

For the first sha256_transform call [on the 2nd chunk of the header], there are several values at the beginning that can be calculated on the first "Nonce" value, and then cached for all future "Nonce" values, as long as the other fields in that chunk do not change [I understand that the timestamp does not need to be updated that often].

Here is the beginning of the sha256_transform function:

/* load the input */

for (i = 0; i < 16; i++)

LOAD_OP(i, W, input);

/* now blend */

for (i = 16; i < 64; i++)

BLEND_OP(i, W);

/* load the state into our registers */

a=state[0]; b=state[1]; c=state[2]; d=state[3];

e=state[4]; f=state[5]; g=state[6]; h=state[7];

/* now iterate */

t1 = h + e1(e) + Ch(e,f,g) + 0x428a2f98 + W[ 0];

t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2;

t1 = g + e1(d) + Ch(d,e,f) + 0x71374491 + W[ 1];

t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2;

t1 = f + e1(c) + Ch(c,d,e) + 0xb5c0fbcf + W[ 2];

t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2;

t1 = e + e1(b) + Ch(b,c,d) + 0xe9b5dba5 + W[ 3];

t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2;

t1 = d + e1(a) + Ch(a,b,c) + 0x3956c25b + W[ 4];

t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2;

t1 = c + e1(h) + Ch(h,a,b) + 0x59f111f1 + W[ 5];

t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2;

t1 = b + e1(g) + Ch(g,h,a) + 0x923f82a4 + W[ 6];

t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2;

t1 = a + e1(f) + Ch(f,g,h) + 0xab1c5ed5 + W[ 7];

t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2;

t1 = h + e1(e) + Ch(e,f,g) + 0xd807aa98 + W[ 8];

t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2;

t1 = g + e1(d) + Ch(d,e,f) + 0x12835b01 + W[ 9];

t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2;

t1 = f + e1(c) + Ch(c,d,e) + 0x243185be + W[10];

t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2;

t1 = e + e1(b) + Ch(b,c,d) + 0x550c7dc3 + W[11];

t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2;

t1 = d + e1(a) + Ch(a,b,c) + 0x72be5d74 + W[12];

t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2;

t1 = c + e1(h) + Ch(h,a,b) + 0x80deb1fe + W[13];

t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2;

t1 = b + e1(g) + Ch(g,h,a) + 0x9bdc06a7 + W[14];

t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2;

t1 = a + e1(f) + Ch(f,g,h) + 0xc19bf174 + W[15];

t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2;

t1 = h + e1(e) + Ch(e,f,g) + 0xe49b69c1 + W[16];

t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2;

t1 = g + e1(d) + Ch(d,e,f) + 0xefbe4786 + W[17];

t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2;

The values of W[0] through W[17], excluding W[3] which holds the "Nonce" value, are constant when only the "Nonce" value is changing, so they can be calculated once, when hashing the chunk with the first "Nonce" value, and then used again for subsequent calls.

The cache of those 17 W values can also include the constants added into them on their respective "t1 = " lines above, so that those 17 additions are also avoided in future hashes where only the "Nonce" value has changed.

In addition, for the same reason, several calculations under "/* now iterate */" can also be cached. The contents of "state" are constant for all values of "Nonce" on a particular header and are provided by "midstate" from getwork. This means that all calculations done without using the values of W[3] or W[18] through W[63] are constant for every value of "Nonce", so their results can be computed once and then simply loaded for the other 2^32 - 1 values of "Nonce" afterwards.

/* load the input */

for (i = 0; i < 16; i++)

LOAD_OP(i, W, input);

/* now blend */

if (!pass1)

{

W[16] = w_16_cache;

W[17] = w_17_cache;

}

for (i = (pass1) ? 16 : 18; i < 64; i++)

BLEND_OP(i, W);

if (pass1)

{

w_16_cache = W[16];

w_17_cache = W[17];

}

/* load the state into our registers */

if (pass1)

{

a=state[0]; b=state[1]; c=state[2]; d=state[3];

e=state[4]; f=state[5]; g=state[6]; h=state[7];

}

else

{

a=state[0]; b=b_cache; c=c_cache; d=d_cache;

e=state[4]; f=f_cache; g=g_cache; h=h_cache;

}

/* now iterate */

if (pass1)

{

t1 = h + e1(e) + Ch(e,f,g) + 0x428a2f98 + W[ 0];

t2 = e0(a) + Maj(a,b,c);

d_cache = d += t1;

h_cache = h = t1 + t2;

t1 = g + e1(d) + Ch(d,e,f) + 0x71374491 + W[ 1];

t2 = e0(h) + Maj(h,a,b);

c_cache = c += t1;

g_cache = g = t1 + t2;

t1 = f + e1(c) + Ch(c,d,e) + 0xb5c0fbcf + W[ 2];

t2 = e0(g) + Maj(g,h,a);

b_cache = b += t1;

f_cache = f = t1 + t2;

t1_cache = t1 = e + e1(b) + Ch(b,c,d) + 0xe9b5dba5;

t2_cache = t2 = e0(f) + Maj(f,g,h);

}

else

{

t1 = t1_cache;

t2 = t2_cache;

}

t1 += W[ 3];

a+=t1; e=t1+t2;

if (pass1) { w_c_4_cache = d + 0x3956c25b + W[ 4]; }

t1 = e1(a) + Ch(a,b,c) + w_c_4_cache;

t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2;

if (pass1) { w_c_5_cache = c + 0x59f111f1 + W[ 5]; }

t1 = e1(h) + Ch(h,a,b) + w_c_5_cache;

t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2;

if (pass1) { w_c_6_cache = b + 0x923f82a4 + W[ 6]; }

t1 = e1(g) + Ch(g,h,a) + w_c_6_cache;

t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2;

if (pass1) { w_c_7_cache = 0xab1c5ed5 + W[ 7]; }

t1 = a + e1(f) + Ch(f,g,h) + w_c_7_cache;

t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2;

if (pass1) { w_c_8_cache = 0xd807aa98 + W[ 8]; }

t1 = h + e1(e) + Ch(e,f,g) + w_c_8_cache;

t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2;

if (pass1) { w_c_9_cache = 0x12835b01 + W[ 9]; }

t1 = g + e1(d) + Ch(d,e,f) + w_c_9_cache;

t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2;

if (pass1) { w_c_10_cache = 0x243185be + W[10]; }

t1 = f + e1(c) + Ch(c,d,e) + w_c_10_cache;

t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2;

if (pass1) { w_c_11_cache = 0x550c7dc3 + W[11]; }

t1 = e + e1(b) + Ch(b,c,d) + w_c_11_cache;

t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2;

if (pass1) { w_c_12_cache = 0x72be5d74 + W[12]; }

t1 = d + e1(a) + Ch(a,b,c) + w_c_12_cache;

t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2;

if (pass1) { w_c_13_cache = 0x80deb1fe + W[13]; }

t1 = c + e1(h) + Ch(h,a,b) + w_c_13_cache;

t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2;

if (pass1) { w_c_14_cache = 0x9bdc06a7 + W[14]; }

t1 = b + e1(g) + Ch(g,h,a) + w_c_14_cache;

t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2;

if (pass1) { w_c_15_cache = 0xc19bf174 + W[15]; }

t1 = a + e1(f) + Ch(f,g,h) + w_c_15_cache;

t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2;

if (pass1) { w_c_16_cache = 0xe49b69c1 + W[16]; }

t1 = h + e1(e) + Ch(e,f,g) + w_c_16_cache;

t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2;

if (pass1) { w_c_17_cache = 0xefbe4786 + W[17]; }

t1 = g + e1(d) + Ch(d,e,f) + w_c_17_cache;

t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2;

Edited to add:

Here is the optimization of the calculations for the second call to sha256_transform, which is on hash(header). Since we only want to know whether the resulting hash is lower than the target number, we don't need all eight 32-bit unsigned integers of the final hash; we only need enough of the high-order bits [right now, this involves the two highest-order 32-bit unsigned integers of the final hash and of the target, but may involve more of them in the future when the difficulty is sufficiently high, causing the target to be sufficiently small].

Right now, the target always has the highest order 32-bit unsigned integer, target[7], equal to 0x00000000, so the final value of state[7] can be compared to 0x00000000; if they are not equal, we have not found a solution. If they are equal, then we need to compare the next highest order uint32 values, target[6] and state[6], to see if state[6] is less than target[6], etc.

Here's the code with that logic unrolled:

// partialtest is similar to the fulltest function, but only compares one 32-bit unsigned int to one other, instead of 8 of them to another 8,

// and returns:

// -1 if the first < the second [we found a solution!]

// 0 if the first == the second [we need to compare the next highest order pair to find out]

// 1 if the first > the second [this is definitely not a solution, we can immediately stop calculating for this "Nonce" value]

int partialtest(u32 state, u32 target);

// this function can have a short circuit test at the beginning:

// if (state == 0x00000000 && target == 0x00000000) return 0;

// Currently target[7] is 0x00000000 and eventually target[6] will be, as well

t1 = h + e1(e) + Ch(e,f,g) + 0x748f82ee + W[56];

t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2;

t1 = g + e1(d) + Ch(d,e,f) + 0x78a5636f + W[57];

t2 = e0(h) + Maj(h,a,b); c+=t1;

u32 t1_1 = f + e1(c) + Ch(c,d,e) + 0x84c87814 + W[58];

u32 b_1 = b;

b+=t1_1;

u32 t1_2 = e + e1(b) + Ch(b,c,d) + 0x8cc70208 + W[59];

u32 a_1 = a;

a+=t1_2;

u32 t1_3 = d + e1(a) + Ch(a,b,c) + 0x90befffa + W[60];

u32 h_1 = h;

h+=t1_3;

state[7] += h;

int testresult = partialtest(state[7], target[7]);

if (-1 == testresult)

{

// Woo! Found one!

}

else if (1 == testresult)

{

// We can stop calculating this hash, as this is not a solution.

}

// Don't know yet, have to continue calculating

g=t1+t2;

u32 t1_4 = c + e1(h) + Ch(h,a,b) + 0xa4506ceb + W[61];

u32 g_1 = g;

g+=t1_4;

state[6] += g;

testresult = partialtest(state[6], target[6]);

if (-1 == testresult)

{

// Woo! Found one!

}

else if (1 == testresult)

{

// We can stop calculating this hash, as this is not a solution.

}

// Don't know yet, have to continue calculating

t2 = e0(g_1) + Maj(g_1,h_1,a_1); f=t1_1+t2;

u32 t1_5 = b + e1(g) + Ch(g,h,a) + 0xbef9a3f7 + W[62];

u32 f_1 = f;

f+=t1_5;

state[5] += f;

testresult = partialtest(state[5], target[5]);

if (-1 == testresult)

{

// Woo! Found one!

}

else if (1 == testresult)

{

// We can stop calculating this hash, as this is not a solution.

}

// Don't know yet, have to continue calculating

t2 = e0(f_1) + Maj(f_1,g_1,h_1); e=t1_2+t2;

u32 t1_6 = a + e1(f) + Ch(f,g,h) + 0xc67178f2 + W[63];

u32 e_1 = e;

e+=t1_6;

state[4] += e;

testresult = partialtest(state[4], target[4]);

if (-1 == testresult)

{

// Woo! Found one!

}

else if (1 == testresult)

{

// We can stop calculating this hash, as this is not a solution.

}

// Don't know yet, have to continue calculating

t2 = e0(e_1) + Maj(e_1,f_1,g_1); d=t1_3+t2;

state[3] += d;

testresult = partialtest(state[3], target[3]);

if (-1 == testresult)

{

// Woo! Found one!

}

else if (1 == testresult)

{

// We can stop calculating this hash, as this is not a solution.

}

// Don't know yet, have to continue calculating

t2 = e0(d) + Maj(d,e_1,f_1); c=t1_4+t2;

state[2] += c;

testresult = partialtest(state[2], target[2]);

if (-1 == testresult)

{

// Woo! Found one!

}

else if (1 == testresult)

{

// We can stop calculating this hash, as this is not a solution.

}

// Don't know yet, have to continue calculating

t2 = e0(c) + Maj(c,d,e_1); b=t1_5+t2;

state[1] += b;

testresult = partialtest(state[1], target[1]);

if (-1 == testresult)

{

// Woo! Found one!

}

else if (1 == testresult)

{

// We can stop calculating this hash, as this is not a solution.

}

// Don't know yet, have to continue calculating

t2 = e0(b) + Maj(b,c,d); a=t1_6+t2;

state[0] += a;

testresult = partialtest(state[0], target[0]);

if (-1 == testresult)

{

// Woo! Found one!

}

else if (1 == testresult)

{

// We can stop calculating this hash, as this is not a solution.

}

// We are exactly == to the target