Bitcoin Forum
November 13, 2024, 03:59:00 AM *
News: Latest Bitcoin Core release: 28.0 [Torrent]
 
   Home   Help Search Login Register More  
Pages: [1]
  Print  
Author Topic: Verilog one-cycle hasher code  (Read 1167 times)
takanasi (OP)
Newbie
*
Offline Offline

Activity: 3
Merit: 0


View Profile
July 03, 2013, 02:32:53 PM
 #1

I have just registered, so I am writing here instead of in the FPGA miner topic, where, as a newbie, I don't have access yet.

I want to share Verilog code of a purely combinational hasher. To the best of my knowledge, all available FPGA miners require more than one clock cycle to process one nonce value (e.g. 6 clock cycles with kramble's miner for the de0-nano board). This code, in contrast, requires only one cycle to calculate two SHA-256 hashes.

Please note that the code was tested by simulation only, as there is no hardware available here to run it. For Cyclone IV E, the implementation synthesizes into about 55000 LEs, which would fit into Terasic DE2-115, for example.

Icarus Verilog should be called as follows:

Code:
iverilog -Wall -DSIM -o comb comb.v

And, finally, here goes the code itself:

Code:
// INDEX(a, i): the i-th 32-bit word of vector a, i.e. bits [32*i+31 : 32*i];
// word 0 is the least significant word.
`define INDEX(a, i) a[32 * (i)+:32]
// ROR32(x, n): 32-bit x rotated right by n bits (the low n bits wrap round to
// the top). n must be a constant in 1..31 for both part-selects to be valid.
`define ROR32(x, n) {x[(n) - 1:0], x[31:(n)]}
 
// One SHA-256 compression round, purely combinational.
//   data - eight packed 32-bit working variables: a is word 0 (bits [31:0]),
//          h is word 7 (bits [255:224])
//   word - 32-bit addend for this round (the caller passes round constant
//          plus schedule word)
//   hash - the eight working variables after the round, packed like data
module round(data, word, hash);
input  [255:0] data;
input  [31:0]  word;
output [255:0] hash;

// Unpack the state by explicit part-selects, lowest word first.
wire [31:0] va = data[31:0];
wire [31:0] vb = data[63:32];
wire [31:0] vc = data[95:64];
wire [31:0] vd = data[127:96];
wire [31:0] ve = data[159:128];
wire [31:0] vf = data[191:160];
wire [31:0] vg = data[223:192];
wire [31:0] vh = data[255:224];

// Rotation mixes of e and a.
wire [31:0] rot_e = `ROR32(ve, 6) ^ `ROR32(ve, 11) ^ `ROR32(ve, 25);
wire [31:0] rot_a = `ROR32(va, 2) ^ `ROR32(va, 13) ^ `ROR32(va, 22);

// Choose: each bit of e selects the matching bit of f (e = 1) or g (e = 0).
wire [31:0] pick = (ve & vf) | (~ve & vg);

// Majority: a result bit is set when at least two of a, b, c have it set.
wire [31:0] vote = (va & vb) | (va & vc) | (vb & vc);

// The two temporaries; every sum wraps modulo 2^32.
wire [31:0] temp1 = vh + rot_e + pick + word;
wire [31:0] temp2 = rot_a + vote;

wire [31:0] next_a = temp1 + temp2;
wire [31:0] next_e = vd + temp1;

// Shift the variables up by one word and insert the two fresh values.
assign hash = {vg, vf, ve, next_e, vc, vb, va, next_a};
endmodule
 
// SHA-256 compression of one 512-bit block, fully combinational: a 64-entry
// message schedule feeding 64 chained round instances.
//   init - 256-bit chaining value, a in word 0 (bits [31:0]) up to h in word 7
//   data - sixteen 32-bit message words, word 0 in bits [31:0]
//   hash - state after the 64th round, added word by word to init
module chain(init, data, hash);
input [255:0] init;
input [511:0] data;
output [255:0] hash;

// Message schedule: words[0..15] come straight from data, words[16..63] are
// derived from earlier entries.
wire [31:0] words [0:63];

// Round constants. The explicit [2047:0] range pins the table to 64 x 32 bits
// instead of relying on the tool to infer the width of an unranged localparam.
// The first constant listed lands in word 63, so round i reads word 63 - i.
localparam [2047:0] konst = {
32'h428a2f98, 32'h71374491, 32'hb5c0fbcf, 32'he9b5dba5,
32'h3956c25b, 32'h59f111f1, 32'h923f82a4, 32'hab1c5ed5,
32'hd807aa98, 32'h12835b01, 32'h243185be, 32'h550c7dc3,
32'h72be5d74, 32'h80deb1fe, 32'h9bdc06a7, 32'hc19bf174,
32'he49b69c1, 32'hefbe4786, 32'h0fc19dc6, 32'h240ca1cc,
32'h2de92c6f, 32'h4a7484aa, 32'h5cb0a9dc, 32'h76f988da,
32'h983e5152, 32'ha831c66d, 32'hb00327c8, 32'hbf597fc7,
32'hc6e00bf3, 32'hd5a79147, 32'h06ca6351, 32'h14292967,
32'h27b70a85, 32'h2e1b2138, 32'h4d2c6dfc, 32'h53380d13,
32'h650a7354, 32'h766a0abb, 32'h81c2c92e, 32'h92722c85,
32'ha2bfe8a1, 32'ha81a664b, 32'hc24b8b70, 32'hc76c51a3,
32'hd192e819, 32'hd6990624, 32'hf40e3585, 32'h106aa070,
32'h19a4c116, 32'h1e376c08, 32'h2748774c, 32'h34b0bcb5,
32'h391c0cb3, 32'h4ed8aa4a, 32'h5b9cca4f, 32'h682e6ff3,
32'h748f82ee, 32'h78a5636f, 32'h84c87814, 32'h8cc70208,
32'h90befffa, 32'ha4506ceb, 32'hbef9a3f7, 32'hc67178f2
};

genvar i;

generate
// The first sixteen schedule words are the message words themselves.
for (i = 0; i < 16; i = i + 1) begin: first
assign words[i] = `INDEX(data, i);
end

// Each later word mixes the words 16, 15, 7 and 2 places behind it.
for (i = 16; i < 64; i = i + 1) begin: blend
wire [31:0] s0, s1;

assign s0 = `ROR32(words[i - 15], 7) ^
`ROR32(words[i - 15], 18) ^
(words[i - 15] >> 3);
assign s1 = `ROR32(words[i - 2], 17) ^
`ROR32(words[i - 2], 19) ^
(words[i - 2] >> 10);
assign words[i] = words[i - 16] + words[i - 7] +
s0 + s1;
end

// 64 rounds wired back to back: round 0 starts from init, every other
// round starts from the output of the round before it.
for (i = 0; i < 64; i = i + 1) begin: array
// Round constant and schedule word are pre-added before entering the round.
wire [31:0] w = `INDEX(konst, 63 - i) + words[i];
wire [255:0] d, h;

if (i)
assign d = array[i - 1].h;
else
assign d = init;

round step(d, w, h);
end

// Final feed-forward: add the initial state to the last round's output,
// one 32-bit word at a time.
for (i = 0; i < 8; i = i + 1) begin: adder
assign `INDEX(hash, i) = `INDEX(array[63].h, i) +
`INDEX(init, i);
end
endgenerate
endmodule
 
// Nonce-scanning core built from two combinational chain instances: the
// first hashes a message block containing the nonce starting from mid, the
// second hashes that 256-bit result starting from a fixed initial state.
//   clk    - the nonce counter advances by one on every rising edge
//   data   - bits [95:0] become message words 0..2 of the first block;
//            bits [127:96] are the base value added to the nonce counter
//   mid    - 256-bit initial state for the first chain
//   result - nonce currently being hashed (data[127:96] + counter)
//   found  - high when word 7 (bits [255:224]) of the final hash is zero
module miner(clk, data, mid, result, found);
input clk;
input [255:0] mid;
input [127:0] data;
output [31:0] result;
output found;

// Initial state for the second chain (the standard SHA-256 initial hash
// value), lowest word last in the concatenation.
localparam [255:0] IV = {
32'h5be0cd19, 32'h1f83d9ab, 32'h9b05688c, 32'h510e527f,
32'ha54ff53a, 32'h3c6ef372, 32'hbb67ae85, 32'h6a09e667
};

reg [31:0] nonce = 32'b0;

wire [255:0] hash1, hash;

// First block, word 0 in the low bits: three data words, the nonce, the
// 32'h80000000 marker word, ten zero words, then 32'h00000280 as word 15.
wire [511:0] block1 = {
32'h00000280, {10{32'h00000000}}, 32'h80000000,
result, data[95:0]
};

// Second block: the eight words of hash1, the 32'h80000000 marker word,
// six zero words, then 32'h00000100 as word 15.
wire [511:0] block2 = {
32'h00000100, {6{32'h00000000}}, 32'h80000000,
hash1
};

chain sha1(.init(mid), .data(block1), .hash(hash1));
chain sha2(.init(IV), .data(block2), .hash(hash));

assign result = data[127:96] + nonce;
assign found = (hash[255:224] == 32'b0);

always @(posedge clk)
nonce <= nonce + 32'd1;
endmodule
 
// Simulation-only testbench; compiled only when SIM is defined.
`ifdef SIM
module proof;
reg clk = 1'b0;
 
wire [31:0] result;
wire found;
 
// First brace group is the 128-bit data input (its top word, 32'h381353f8,
// is the nonce base); the second is the 256-bit mid input.
miner core(clk, {
32'h381353f8, 32'h378a0e1c, 32'hfd270c51, 32'ha7f5f990
}, {
32'h00f10dad, 32'hacc0caa8, 32'h7482c0f3, 32'ha66f356d,
32'hdb1ff3ca, 32'hfb545b91, 32'h1efebbc8, 32'h24e39e50
}, result, found);
 
// Each pass generates one full clock period (two toggles, one rising edge)
// and then prints the current nonce, the found flag and the final hash.
// The run ends once result has gone past 32'h381353fa.
always begin
if (result > 32'h381353fa)
$finish;
 
#1 clk <= ~clk;
#1 clk <= ~clk;
 
$display("%x %x %x", result, found, core.hash);
end
endmodule
`endif
Trillium
Hero Member
*****
Offline Offline

Activity: 546
Merit: 500



View Profile
July 03, 2013, 04:46:07 PM
 #2

I think the mods can move posts like this to the appropriate subforum, and you have the right to post in a thread you created yourself.

Unfortunately I do not have enough understanding of your code to validate it or comment, but looks interesting.

BTC:1AaaAAAAaAAE2L1PXM1x9VDNqvcrfa9He6
kramble
Sr. Member
****
Offline Offline

Activity: 384
Merit: 250



View Profile WWW
July 03, 2013, 04:53:33 PM
 #3

Unfortunately I do not have enough understanding of your code to validate it or comment, but looks interesting.

It does look interesting. I'm compiling it right now (in a wrapper based on DE2_115_Unoptimized_Pipelined). It does take around 55kLE, but it has not yet completed fitting (after one hour elapsed). It will be interesting to see what FMAX is reported (I suspect this is why it's not been done already: one huge combinatorial chain is just the antithesis of pipelining).

Actually, I just killed it and started again. It's ridiculous to expect this to run at the default 50MHz, so I'll initially try 100kHz and work upwards (I really don't expect this to perform very well at all).

Github https://github.com/kramble BLC BkRaMaRkw3NeyzsZ2zUgXsNLogVVkQ1iPV
takanasi (OP)
Newbie
*
Offline Offline

Activity: 3
Merit: 0


View Profile
July 03, 2013, 05:06:37 PM
 #4

Hello kramble and thank you for testing this. Would you mind sharing your further feedback when and if you have any.
kramble
Sr. Member
****
Offline Offline

Activity: 384
Merit: 250



View Profile WWW
July 03, 2013, 05:15:03 PM
Last edit: July 03, 2013, 06:02:42 PM by kramble
 #5

Hello kramble and thank you for testing this. Would you mind sharing your further feedback when and if you have any.

Will do, but I'm not very optimistic. The lack of any pipelining will require a very slow clock, so the hash rate is going to be quite low. But I'm no expert on logic design, so get your browse time up to 4 hours and come and discuss on the main forum https://bitcointalk.org/index.php?topic=9047.0

[EDIT]
I did a build with a 500kHz clock (50MHz divided by 100 in the PLL), which took about 30 minutes, but now quartus_sta (the TimeQuest Timing Analyzer) is just spinning on the CPU. I suspect it just does not like the enormous combinatorial chain, so there is no knowing when it might complete. The SOF was created though, but I can't actually test it myself as I don't have the DE2 dev system (just a DE0-Nano, which is too small to fit this code).

Anyway, as I said above, this is not the way to go about improving performance, as it's the total throughput that matters, not the time to generate each individual hash; hence the extensive use of pipelining in the official code. But kudos for coming up with an interesting approach.

I'm going to leave it there for now. See you on the main thread once you've escaped newbie jail  Cheesy

Github https://github.com/kramble BLC BkRaMaRkw3NeyzsZ2zUgXsNLogVVkQ1iPV
takanasi (OP)
Newbie
*
Offline Offline

Activity: 3
Merit: 0


View Profile
July 04, 2013, 09:44:40 AM
Last edit: July 05, 2013, 01:02:28 PM by takanasi
 #6

Still incarcerated in the newbie cell, I nevertheless want to dump here another version of the combinational hasher code. This time it is pipelined, and thus should have better timing characteristics. I am not requesting any testing right now, just showing the code.

Code:
// INDEX(a, i): the i-th 32-bit word of vector a, i.e. bits [32*i+31 : 32*i];
// word 0 is the least significant word.
`define INDEX(a, i) a[32 * (i)+:32]
// ROR32(x, n): 32-bit x rotated right by n bits (the low n bits wrap round to
// the top). n must be a constant in 1..31 for both part-selects to be valid.
`define ROR32(x, n) {x[(n) - 1:0], x[31:(n)]}

// Message-schedule step: derives one new schedule word from four earlier ones.
//   w0, w1, w9, w14 - the four earlier schedule words
//   w16             - w0 + mix(w1) + w9 + mix(w14), wrapping modulo 2^32
module sched(w0, w1, w9, w14, w16);
input  [31:0] w0, w1, w9, w14;
output [31:0] w16;

// Each mix XORs two rotations with a logical right shift; the shift is
// written as a zero-extended part-select.
wire [31:0] mix_near = `ROR32(w1, 7) ^ `ROR32(w1, 18) ^ {3'b000, w1[31:3]};
wire [31:0] mix_far  = `ROR32(w14, 17) ^ `ROR32(w14, 19) ^ {10'b0000000000, w14[31:10]};

// Sum the two plain words and the two mixes.
wire [31:0] plain_sum = w0 + w9;
wire [31:0] mixed_sum = mix_near + mix_far;

assign w16 = plain_sum + mixed_sum;
endmodule

// One SHA-256 compression round, purely combinational.
//   in   - eight packed 32-bit working variables: a is word 0 (bits [31:0]),
//          h is word 7 (bits [255:224])
//   word - 32-bit addend for this round (the caller passes round constant
//          plus schedule word)
//   out  - the eight working variables after the round, packed like in
module round(in, word, out);
input  [255:0] in;
input  [31:0]  word;
output [255:0] out;

// Unpack the state by explicit part-selects, lowest word first.
wire [31:0] va = in[31:0];
wire [31:0] vb = in[63:32];
wire [31:0] vc = in[95:64];
wire [31:0] vd = in[127:96];
wire [31:0] ve = in[159:128];
wire [31:0] vf = in[191:160];
wire [31:0] vg = in[223:192];
wire [31:0] vh = in[255:224];

// Rotation mixes of e and a.
wire [31:0] rot_e = `ROR32(ve, 6) ^ `ROR32(ve, 11) ^ `ROR32(ve, 25);
wire [31:0] rot_a = `ROR32(va, 2) ^ `ROR32(va, 13) ^ `ROR32(va, 22);

// Choose: each bit of e selects the matching bit of f (e = 1) or g (e = 0).
wire [31:0] pick = (ve & vf) | (~ve & vg);

// Majority: a result bit is set when at least two of a, b, c have it set.
wire [31:0] vote = (va & vb) | (va & vc) | (vb & vc);

// The two temporaries; every sum wraps modulo 2^32.
wire [31:0] temp1 = vh + rot_e + pick + word;
wire [31:0] temp2 = rot_a + vote;

wire [31:0] next_a = temp1 + temp2;
wire [31:0] next_e = vd + temp1;

// Shift the variables up by one word and insert the two fresh values.
assign out = {vg, vf, ve, next_e, vc, vb, va, next_a};
endmodule

// One registered pipeline stage: a compression round and a schedule step
// computed combinationally from prev, then captured into next.
//   clk   - next is loaded on the rising edge
//   prev  - bits [767:512] hold the 256-bit round state, bits [511:0] hold a
//           window of sixteen schedule words with word 0 in bits [31:0]
//   konst - 32-bit round constant for this stage
//   next  - same layout as prev: new state on top, window slid by one word
module synch(clk, prev, konst, next);
input clk;
input [767:0] prev;
input [31:0] konst;
output reg [767:0] next;

// Split the bus into its two halves.
wire [255:0] state_in = prev[767:512];
wire [511:0] window   = prev[511:0];

wire [255:0] state_out;
wire [31:0]  fresh_word;

// Window word 0 is consumed by this round.
wire [31:0] current_word = window[31:0];

// The new schedule word is built from window words 0, 1, 9 and 14.
sched link(
.w0(current_word),
.w1(window[63:32]),
.w9(window[319:288]),
.w14(window[479:448]),
.w16(fresh_word)
);

round step(
.in(state_in),
.word(konst + current_word),
.out(state_out)
);

// Drop window word 0, append the fresh word at the top of the window.
always @(posedge clk)
next <= {state_out, fresh_word, window[511:32]};
endmodule

// Pipelined SHA-256 compression of one 512-bit block: 64 synch stages in
// series, each of which registers its output on clk.
//   clk  - clock shared by every stage
//   init - 256-bit initial state; also added word-wise to the final state
//   data - sixteen 32-bit message words, word 0 in bits [31:0]
//   hash - init plus the state held in the last stage's register
// NOTE(review): init feeds the final adders directly, while the state it is
// added to has passed through 64 registers, so init apparently has to stay
// constant while a block is in flight -- confirm against the callers.
module chain(clk, init, data, hash);
input clk;
input [255:0] init;
input [511:0] data;
output [255:0] hash;

// Round-constant table, 64 x 32 bits. The last constant listed
// (32'h428a2f98) is word 0, so stage i reads word i.
wire [2047:0] konst;

assign konst = {
32'hc67178f2, 32'hbef9a3f7, 32'ha4506ceb, 32'h90befffa,
32'h8cc70208, 32'h84c87814, 32'h78a5636f, 32'h748f82ee,
32'h682e6ff3, 32'h5b9cca4f, 32'h4ed8aa4a, 32'h391c0cb3,
32'h34b0bcb5, 32'h2748774c, 32'h1e376c08, 32'h19a4c116,
32'h106aa070, 32'hf40e3585, 32'hd6990624, 32'hd192e819,
32'hc76c51a3, 32'hc24b8b70, 32'ha81a664b, 32'ha2bfe8a1,
32'h92722c85, 32'h81c2c92e, 32'h766a0abb, 32'h650a7354,
32'h53380d13, 32'h4d2c6dfc, 32'h2e1b2138, 32'h27b70a85,
32'h14292967, 32'h06ca6351, 32'hd5a79147, 32'hc6e00bf3,
32'hbf597fc7, 32'hb00327c8, 32'ha831c66d, 32'h983e5152,
32'h76f988da, 32'h5cb0a9dc, 32'h4a7484aa, 32'h2de92c6f,
32'h240ca1cc, 32'h0fc19dc6, 32'hefbe4786, 32'he49b69c1,
32'hc19bf174, 32'h9bdc06a7, 32'h80deb1fe, 32'h72be5d74,
32'h550c7dc3, 32'h243185be, 32'h12835b01, 32'hd807aa98,
32'hab1c5ed5, 32'h923f82a4, 32'h59f111f1, 32'h3956c25b,
32'he9b5dba5, 32'hb5c0fbcf, 32'h71374491, 32'h428a2f98
};

genvar i;

generate
// Stage 0 is fed {init, data}; every later stage is fed the registered
// output of the stage before it.
for (i = 0; i < 64; i = i + 1) begin: array
wire [767:0] prev, next;

if (i)
assign prev = array[i - 1].next;
else
assign prev = {init, data};

synch node(clk, prev, `INDEX(konst, i), next);
end

// Final feed-forward: words 16..23 of the last stage's output (its bits
// [767:512], the round state) are added word by word to init.
for (i = 0; i < 8; i = i + 1) begin: adder
assign `INDEX(hash, i) = `INDEX(init, i) +
`INDEX(array[63].next, i + 16);
end
endgenerate
endmodule

// Nonce-scanning core built from two pipelined chain instances: the first
// hashes a message block containing the nonce starting from mid, the second
// hashes that 256-bit result starting from a fixed initial state.
//   clk    - clocks both chains and advances the nonce counter by one
//   data   - bits [95:0] become message words 0..2 of the first block;
//            bits [127:96] are the base value added to the nonce counter
//   mid    - 256-bit initial state for the first chain
//   result - nonce entering the pipeline now (data[127:96] + counter)
//   found  - high when word 7 (bits [255:224]) of the final hash is zero
// Each chain holds 64 register stages, so hash and found describe the nonce
// issued 128 rising edges earlier, i.e. result - 128; the testbench prints
// result - 128 for that reason.
module miner(clk, data, mid, result, found);
input clk;
input [255:0] mid;
input [127:0] data;
output [31:0] result;
output found;

// Initial state for the second chain (the standard SHA-256 initial hash
// value), lowest word last in the concatenation.
localparam [255:0] IV = {
32'h5be0cd19, 32'h1f83d9ab, 32'h9b05688c, 32'h510e527f,
32'ha54ff53a, 32'h3c6ef372, 32'hbb67ae85, 32'h6a09e667
};

reg [31:0] nonce = 32'b0;

wire [255:0] hash1, hash;

// First block, word 0 in the low bits: three data words, the nonce, the
// 32'h80000000 marker word, ten zero words, then 32'h00000280 as word 15.
wire [511:0] block1 = {
32'h00000280, {10{32'h00000000}}, 32'h80000000,
result, data[95:0]
};

// Second block: the eight words of hash1, the 32'h80000000 marker word,
// six zero words, then 32'h00000100 as word 15.
wire [511:0] block2 = {
32'h00000100, {6{32'h00000000}}, 32'h80000000,
hash1
};

chain sha1(.clk(clk), .init(mid), .data(block1), .hash(hash1));
chain sha2(.clk(clk), .init(IV), .data(block2), .hash(hash));

assign result = data[127:96] + nonce;
assign found = (hash[255:224] == 32'b0);

always @(posedge clk)
nonce <= nonce + 32'd1;
endmodule

// Simulation-only testbench; compiled only when SIM is defined.
`ifdef SIM
module proof;
reg clk = 1'b0;

wire [31:0] result;
wire found;

// First brace group is the 128-bit data input (its top word, 32'h381353f9,
// is the nonce base); the second is the 256-bit mid input.
miner core(clk, {
32'h381353f9, 32'h378a0e1c, 32'hfd270c51, 32'ha7f5f990
}, {
32'h00f10dad, 32'hacc0caa8, 32'h7482c0f3, 32'ha66f356d,
32'hdb1ff3ca, 32'hfb545b91, 32'h1efebbc8, 32'h24e39e50
}, result, found);

// Each pass generates one full clock period (two toggles, one rising edge).
// Printing is held off until result has advanced 128 past the base nonce,
// and each line is labelled result - 128 to offset that lag. The run ends
// once result reaches 32'h381353fb + 128.
always begin
if (result >= 32'h381353fb + 128)
$finish;

#1 clk <= ~clk;
#1 clk <= ~clk;

if (result >= 32'h381353f9 + 128)
$display("%x %x %x", result - 128, found, core.hash);
end
endmodule
`endif
Pages: [1]
  Print  
 
Jump to:  

Powered by MySQL Powered by PHP Powered by SMF 1.1.19 | SMF © 2006-2009, Simple Machines Valid XHTML 1.0! Valid CSS!