I tried to optimize the code even further, but I have limited coding skills.
Below is the part that I tried to rewrite so that it can be executed in parallel, but I get the error: line 469: error: expected an identifier
void SHA256_fixed(uint4*restrict state0,uint4*restrict state1)
{
uint4 S0 = *state0;
uint4 S1 = *state1;
#define A S0.x
#define B S0.y
#define C S0.z
#define D S0.w
#define E S1.x
#define F S1.y
#define G S1.z
#define H S1.w
#define k 0
#pragma unroll
for(uint k=0; k<8; k++);
RND(A,B,C,D,E,F,G,H, fixedW[(8*k)+0]);
#pragma unroll
for(uint k=0; k<8; k++);
RND(H,A,B,C,D,E,F,G, fixedW[(8*k)+1]);
#pragma unroll
for(uint k=0; k<8; k++);
RND(G,H,A,B,C,D,E,F, fixedW[(8*k)+2]);
#pragma unroll
for(uint k=0; k<8; k++);
RND(F,G,H,A,B,C,D,E, fixedW[(8*k)+3]);
#pragma unroll
for(uint k=0; k<8; k++);
RND(E,F,G,H,A,B,C,D, fixedW[(8*k)+4]);
#pragma unroll
for(uint k=0; k<8; k++);
RND(D,E,F,G,H,A,B,C, fixedW[(8*k)+5]);
#pragma unroll
for(uint k=0; k<8; k++);
RND(C,D,E,F,G,H,A,B, fixedW[(8*k)+6]);
#pragma unroll
for(uint k=0; k<8; k++);
RND(B,C,D,E,F,G,H,A, fixedW[(8*k)+7]);
#undef A
#undef B
#undef C
#undef D
#undef E
#undef F
#undef G
#undef H
#undef k
*state0 += S0;
*state1 += S1;
}