Bitcoin Forum
November 16, 2024, 03:42:52 AM *
News: Latest Bitcoin Core release: 28.0 [Torrent]
 
   Home   Help Search Login Register More  
Pages: « 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 [32] 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 ... 843 »
  Print  
Author Topic: OFFICIAL CGMINER mining software thread for linux/win/osx/mips/arm/r-pi 4.11.1  (Read 5805634 times)
This is a self-moderated topic. If you do not want to be moderated by the person who started this topic, create a new topic. (3 posts by 1+ user deleted.)
d3m0n1q_733rz
Sr. Member
****
Offline Offline

Activity: 378
Merit: 250



View Profile WWW
July 31, 2011, 06:54:14 AM
 #621

Code:
;; SHA-256 for X86-64 for Linux, based off of:

; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com
; Version 2011
; This software is Public Domain

; Significant re-write/optimisation and reordering by,
; Neil Kettle <mu-b@digit-labs.org>
; ~18% performance improvement

; SHA-256 CPU SSE cruncher for Bitcoin Miner

ALIGN 32
BITS 64

; Argument registers (first three integer arguments, System V AMD64 order).
; hash: destination, receives eight 16-byte rows (hash .. hash+112).
; data: message schedule, one 16-byte row (four 32-bit lanes) per word index.
; init: eight 32-bit starting state words a..h (two 16-byte loads).
; All three are accessed with movdqa/movntdqa, which require 16-byte alignment.
%define hash rdi
%define data rsi
%define init rdx

; The expansion loop fills bytes 256..1023 of data; that 768-byte span must be
; a whole number of passes of LAB_CALC_UNROLL*LAB_CALC_PARA*16 bytes:
; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
; LAB_CALC_PARA   = rows produced by one lab_calc_blk invocation.
; LAB_CALC_UNROLL = lab_calc_blk invocations per pass of the LAB_CALC loop.
%define LAB_CALC_PARA 2
%define LAB_CALC_UNROLL 8

; Rounds (lab_loop_blk invocations) per pass of the LAB_LOOP loop.
%define LAB_LOOP_UNROLL 8

; Round-constant table, defined outside this file. It is indexed with the same
; byte offset as data, so presumably one 16-byte row per round - confirm
; against its definition.
extern g_4sha256_k

global CalcSha256_x64_sse4
; CalcSha256 hash(rdi), data(rsi), init(rdx)
; Expands the schedule in place inside data (rows 16..63), runs the rounds from
; the state in init, adds init back in and stores the result to hash.
; Uses rax, rcx, r11 and xmm0-xmm11.
CalcSha256_x64_sse4:

; rbx is saved here and restored at LAB_RET; nothing else in this listing
; reads or writes it.
push rbx

; LAB_NEXT_NONCE and LAB_SHA are not the target of any jump in this listing.
LAB_NEXT_NONCE:

; Round-loop bound for rax: rax advances by 4 per round and is scaled by 4 when
; addressing, so 256 corresponds to 64 rounds of one 16-byte row each.
mov rcx, 256 ; 256 - rcx is # of SHA-2 rounds
; mov rax, 64 ; 64 - rax is where we expand to

LAB_SHA:
push rcx ; keep the round bound; rcx is reused below as an end pointer
lea rcx, qword [data+1024] ; + 1024 : one past the last schedule row (row 64)
lea r11, qword [data+256] ; + 256 : first row to be computed (row 16)

; Top of the schedule-expansion loop (the macro definition below emits nothing
; by itself; the label lands on the first unrolled lab_calc_blk).
LAB_CALC:
%macro lab_calc_blk 1
; lab_calc_blk n -- compute two consecutive message-schedule rows.
;   %1  : row offset (in 16-byte rows) relative to r11; rows %1 and %1+1 are written.
;   r11 : address of the row that %1 = 0 refers to.
; With I = the row being produced (the "+1" comments are the second row):
;   W[I]  = s0(W[I-15]) + W[I-16] + W[I-7] + s1(W[I-2])
;   s0(x) = (x>>3) ^ (x>>7) ^ (x<<25) ^ (x>>18) ^ (x<<14)
;   s1(x) = (x>>10) ^ (x>>17) ^ (x<<15) ^ (x>>19) ^ (x<<13)
; Shifts and adds act on each 32-bit lane independently (psrld/pslld/paddd).
; Clobbers xmm0-xmm7. The two rows are interleaved instruction by instruction.

; This revision loads each s0 input three times from memory instead of copying
; between registers (see the commented-out movdqa lines below).
movntdqa xmm0, [r11-(15-%1)*16] ; xmm0 = W[I-15]
movntdqa xmm1, [r11-(15-%1)*16] ; xmm1 = W[I-15]
movntdqa xmm2, [r11-(15-%1)*16] ; xmm2 = W[I-15]
movntdqa xmm3, [r11-(2-%1)*16] ; xmm3 = W[I-2]
movntdqa xmm4, [r11-(15-(%1+1))*16] ; xmm4 = W[I-15+1]
movntdqa xmm5, [r11-(15-(%1+1))*16] ; xmm5 = W[I-15+1]
movntdqa xmm6, [r11-(15-(%1+1))*16] ; xmm6 = W[I-15+1]
movntdqa xmm7, [r11-(2-(%1+1))*16] ; xmm7 = W[I-2+1]

; movdqa xmm2, xmm0 ; xmm2 = W[I-15]
; movdqa xmm6, xmm4 ; xmm6 = W[I-15+1]

; --- s0 for both rows: result accumulates in xmm0 (row I) and xmm4 (row I+1)
psrld xmm0, 3 ; xmm0 = W[I-15] >> 3
psrld xmm1, 7 ; xmm1 = W[I-15] >> 7 (independent of xmm0)
psrld xmm4, 3 ; xmm4 = W[I-15+1] >> 3
psrld xmm5, 7 ; xmm5 = W[I-15+1] >> 7
pslld xmm2, 14 ; xmm2 = W[I-15] << 14

; movdqa xmm5, xmm4 ; xmm5 = W[I-15+1] >> 3
; movdqa xmm1, xmm0 ; xmm1 = W[I-15] >> 3

pxor xmm4, xmm5 ; xmm4 = (>>3) ^ (>>7)                          [row I+1]
pslld xmm6, 14 ; xmm6 = W[I-15+1] << 14

pxor xmm0, xmm1 ; xmm0 = (>>3) ^ (>>7)                          [row I]
pxor xmm0, xmm2 ; xmm0 ^= W[I-15] << 14
psrld xmm1, 11 ; xmm1 = W[I-15] >> 18 (7 + 11)
psrld xmm5, 11 ; xmm5 = W[I-15+1] >> 18
pxor xmm4, xmm6 ; xmm4 ^= W[I-15+1] << 14
pxor xmm4, xmm5 ; xmm4 ^= W[I-15+1] >> 18
pslld xmm2, 11 ; xmm2 = W[I-15] << 25 (14 + 11)
pslld xmm6, 11 ; xmm6 = W[I-15+1] << 25
pxor xmm4, xmm6 ; xmm4 = s0(W[I-15+1])
pxor xmm0, xmm1 ; xmm0 ^= W[I-15] >> 18
pxor xmm0, xmm2 ; xmm0 = s0(W[I-15])
paddd xmm0, [r11-(16-%1)*16] ; xmm0 = s0(W[I-15]) + W[I-16]
paddd xmm4, [r11-(16-(%1+1))*16] ; xmm4 = s0(W[I-15+1]) + W[I-16+1]


;;;;;;;;;;;;;;;;;;
; --- s1 for both rows: result accumulates in xmm3 (row I) and xmm7 (row I+1)

movdqa xmm2, xmm3 ; xmm2 = W[I-2]
psrld xmm3, 10 ; xmm3 = W[I-2] >> 10
movdqa xmm1, xmm3 ; xmm1 = W[I-2] >> 10 (shifted further below)
movdqa xmm6, xmm7 ; xmm6 = W[I-2+1]
psrld xmm7, 10 ; xmm7 = W[I-2+1] >> 10
movdqa xmm5, xmm7 ; xmm5 = W[I-2+1] >> 10

paddd xmm0, [r11-(7-%1)*16] ; xmm0 = s0(W[I-15]) + W[I-16] + W[I-7]
paddd xmm4, [r11-(7-(%1+1))*16] ; xmm4 = s0(W[I-15+1]) + W[I-16+1] + W[I-7+1]

pslld xmm2, 13 ; xmm2 = W[I-2] << 13
pslld xmm6, 13 ; xmm6 = W[I-2+1] << 13
psrld xmm1, 7 ; xmm1 = W[I-2] >> 17 (10 + 7)
psrld xmm5, 7 ; xmm5 = W[I-2+1] >> 17



pxor xmm3, xmm1 ; xmm3 = (>>10) ^ (>>17)                        [row I]
psrld xmm1, 2 ; xmm1 = W[I-2] >> 19 (17 + 2)
pxor xmm3, xmm2 ; xmm3 ^= W[I-2] << 13
pslld xmm2, 2 ; xmm2 = W[I-2] << 15 (13 + 2)
pxor xmm7, xmm5 ; xmm7 = (>>10) ^ (>>17)                        [row I+1]
psrld xmm5, 2 ; xmm5 = W[I-2+1] >> 19
pxor xmm7, xmm6 ; xmm7 ^= W[I-2+1] << 13
pslld xmm6, 2 ; xmm6 = W[I-2+1] << 15



pxor xmm3, xmm1 ; xmm3 ^= W[I-2] >> 19
pxor xmm3, xmm2 ; xmm3 = s1(W[I-2])
paddd xmm0, xmm3 ; xmm0 = s0(W[I-15]) + W[I-16] + s1(W[I-2]) + W[I-7]
pxor xmm7, xmm5 ; xmm7 ^= W[I-2+1] >> 19
pxor xmm7, xmm6 ; xmm7 = s1(W[I-2+1])
paddd xmm4, xmm7 ; xmm4 = s0(W[I-15+1]) + W[I-16+1] + s1(W[I-2+1]) + W[I-7+1]

; Store the two finished rows.
movdqa [r11+(%1*16)], xmm0
movdqa [r11+((%1+1)*16)], xmm4
%endmacro

; Unrolled body of the LAB_CALC loop: lab_calc_blk 0, 2, 4, ...
; (LAB_CALC_UNROLL invocations, each writing LAB_CALC_PARA rows).
%assign i 0
%rep    LAB_CALC_UNROLL
        lab_calc_blk i
%assign i i+LAB_CALC_PARA
%endrep

add r11, LAB_CALC_UNROLL*LAB_CALC_PARA*16 ; step past the rows just written
cmp r11, rcx ; rcx = data+1024, set before LAB_CALC
jb LAB_CALC ; unsigned: repeat until r11 reaches the end address

pop rcx ; restore the round-loop bound pushed at LAB_SHA
mov rax, 0 ; rax = row offset for the round loop, starting at row 0

; Load the init values of the message into the hash.
; Each pshufd broadcasts one 32-bit word of init to all four lanes. xmm7 and
; xmm0 hold the loaded data, so their own broadcast is done last in each group.

movntdqa xmm7, [init]
movntdqa xmm0, [init+16]
pshufd xmm5, xmm7, 0x55 ; xmm5 == b
pshufd xmm4, xmm7, 0xAA ; xmm4 == c
pshufd xmm3, xmm7, 0xFF ; xmm3 == d
pshufd xmm7, xmm7, 0 ; xmm7 == a
pshufd xmm8, xmm0, 0x55 ; xmm8 == f
pshufd xmm9, xmm0, 0xAA ; xmm9 == g
pshufd xmm10, xmm0, 0xFF ; xmm10 == h
pshufd xmm0, xmm0, 0 ; xmm0 == e

; Top of the round loop; each pass runs LAB_LOOP_UNROLL rounds.
LAB_LOOP:

;; T t1 = h + (Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)) + ((e & f) ^ AndNot(e, g)) + Expand32<T>(g_sha256_k[j]) + w[j]

%macro lab_loop_blk 0 ; Notice the macro! rax*4 isn't redundant here.
; lab_loop_blk -- one compression round on four 32-bit lanes in parallel.
; State on entry and exit: a=xmm7 b=xmm5 c=xmm4 d=xmm3 e=xmm0 f=xmm8 g=xmm9 h=xmm10.
; Scratch: xmm1, xmm2, xmm6 (xmm6 carries t1).
; rax*4 is the byte offset of the current row; rax advances by 4 per round.
;   t1    = h + S1(e) + Ch(e,f,g) + k + w
;   new e = d + t1
;   new a = t1 + S0(a) + Maj(a,b,c)
movntdqa xmm6, [data+rax*4] ; xmm6 = w row for this round
paddd xmm6, g_4sha256_k[rax*4] ; + k row at the same offset
add rax, 4 ; next row

paddd xmm6, xmm10 ; +h

; --- Ch(e,f,g) = (e & f) ^ (~e & g), folded together with the h<-g<-f rotation
movdqa xmm1, xmm0 ; xmm1 = e
; movdqa xmm2, xmm9 ; It's redundant unless xmm9 becomes a destination
movdqa xmm10, xmm9 ; h = g  Changed from xmm2 to xmm9
pandn xmm1, xmm9 ; ~e & g (xmm9 still holds the old g here)

movdqa xmm9, xmm8 ; g = f
movdqa xmm2, xmm8 ; xmm2 = f (scratch copy for the AND below)

pand xmm2, xmm0 ; e & f
pxor xmm1, xmm2 ; (e & f) ^ (~e & g)
paddd xmm6, xmm1 ; Ch + h + w[i] + k[i]

; --- S1(e) = Rotr(e,6) ^ Rotr(e,11) ^ Rotr(e,25), built from shift pairs
movdqa xmm1, xmm0
movdqa xmm2, xmm0
movdqa xmm8, xmm0 ; f = e Combining these three moves for processor hardware optimization
psrld xmm0, 6 ; e >> 6
psrld xmm2, 11 ; e >> 11
pslld xmm1, 7 ; e << 7
pxor xmm0, xmm1
pxor xmm0, xmm2
pslld xmm1, 14 ; e << 21 (7 + 14)
psrld xmm2, 14 ; e >> 25 (11 + 14)
pxor xmm0, xmm1
pxor xmm0, xmm2
pslld xmm1, 5 ; e << 26 (21 + 5)
pxor xmm0, xmm1 ; Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)
paddd xmm6, xmm0 ; xmm6 = t1
paddd xmm3, xmm6 ; e = d+t1

; --- Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c), folded with the d<-c rotation
movdqa xmm0, xmm3 ; xmm0 = new e (d + t1)
movdqa xmm1, xmm5 ; xmm1 = b
movdqa xmm2, xmm4 ; xmm2 = c
movdqa xmm3, xmm2 ; d = c
pand xmm2, xmm5 ; b & c
pand xmm4, xmm7 ; a & c
pand xmm1, xmm7 ; a & b
pxor xmm1, xmm4 ; (a & b) ^ (a & c)
pxor xmm1, xmm2 ; (a & b) ^ (a & c) ^ (b & c)
paddd xmm6, xmm1 ; t1 + Maj(a,b,c)

; --- S0(a) = Rotr(a,2) ^ Rotr(a,13) ^ Rotr(a,22), with the c<-b<-a rotation
movdqa xmm4, xmm5 ; c = b
movdqa xmm5, xmm7 ; b = a
movdqa xmm2, xmm7
movdqa xmm1, xmm7
psrld xmm7, 2 ; a >> 2
pslld xmm2, 10 ; a << 10
psrld xmm1, 13 ; a >> 13
pxor xmm7, xmm2
pxor xmm7, xmm1
pslld xmm2, 9 ; a << 19 (10 + 9)
psrld xmm1, 9 ; a >> 22 (13 + 9)
pxor xmm7, xmm2
pxor xmm7, xmm1
pslld xmm2, 11 ; a << 30 (19 + 11)
pxor xmm7, xmm2 ; xmm7 = S0(a)
paddd xmm7, xmm6 ; a = t1 + (Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22)) + Maj(a,b,c)
%endmacro

; Unrolled body of the LAB_LOOP loop: LAB_LOOP_UNROLL consecutive rounds.
%assign i 0
%rep    LAB_LOOP_UNROLL
        lab_loop_blk
%assign i i+1
%endrep

; rcx holds the round bound (256); rax has advanced by 4 per round.
cmp rax, rcx
jb LAB_LOOP

; All rounds done. Feed-forward: broadcast each starting word of init to all
; four lanes and add it to the matching working register. Both halves of init
; are read before anything is written to hash.

; First half: words for a, b, c, d.
movntdqa xmm1, [init]
pshufd xmm2, xmm1, 0x55
pshufd xmm6, xmm1, 0xAA
pshufd xmm11, xmm1, 0xFF
pshufd xmm1, xmm1, 0x00 ; overwrites the loaded row, so it goes last
paddd xmm7, xmm1 ; a
paddd xmm5, xmm2 ; b
paddd xmm4, xmm6 ; c
paddd xmm3, xmm11 ; d

; Second half: words for e, f, g, h.
movntdqa xmm1, [init+16]
pshufd xmm2, xmm1, 0x55
pshufd xmm6, xmm1, 0xAA
pshufd xmm11, xmm1, 0xFF
pshufd xmm1, xmm1, 0x00
paddd xmm0, xmm1 ; e
paddd xmm8, xmm2 ; f
paddd xmm9, xmm6 ; g
paddd xmm10, xmm11 ; h

; Store the eight result rows, a..h, 16 bytes apart.
movdqa [hash], xmm7
movdqa [hash+16], xmm5
movdqa [hash+32], xmm4
movdqa [hash+48], xmm3
movdqa [hash+64], xmm0
movdqa [hash+80], xmm8
movdqa [hash+96], xmm9
movdqa [hash+112], xmm10

LAB_RET:
pop rbx ; matches the push at function entry
ret

SSE4 so far.  I'm taking a break to watch anime.   Cheesy
The changes take advantage of write-combining hardware.  If you have it, great; if you don't, you won't notice much of a change.  You probably won't notice much anyway, since the basic code structure is the same.  Eh, oh well.
Edit:  Slight slow-down in the lab-loop.  I'll copy-paste the old code back in to fix it later.  O_O  Bleach is on!

Funroll_Loops, the theoretically quicker breakfast cereal!
Check out http://www.facebook.com/JupiterICT for all of your computing needs.  If you need it, we can get it.  We have solutions for your computing conundrums.  BTC accepted!  12HWUSguWXRCQKfkPeJygVR1ex5wbg3hAq
zaytsev
Newbie
*
Offline Offline

Activity: 59
Merit: 0


View Profile
July 31, 2011, 09:21:26 AM
 #622

d3m0n1q_733rz, why wouldn't you create a fork on github? It would be easier than copy-pasting and less error-prone.
d3m0n1q_733rz
Sr. Member
****
Offline Offline

Activity: 378
Merit: 250



View Profile WWW
July 31, 2011, 12:02:13 PM
Last edit: July 31, 2011, 12:36:44 PM by d3m0n1q_733rz
 #623

d3m0n1q_733rz, why wouldn't you create a fork on github? would be easier then copy-pasting and less error-prone.
A) I can't actually program from scratch and most of my changes are just logic based.
B) Almost nobody gives me input on how my changes affect their hashing anyway.
C) People might end up sending me incessant requests for changes that I couldn't keep up with.

Besides, this is more of a hobby for me than an outright project and I want to be able to drop it like one without people getting caught in the wake.   Wink

In related news:
Code:
;; SHA-256 for X86-64 for Linux, based off of:

; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com
; Version 2011
; This software is Public Domain

; Significant re-write/optimisation and reordering by,
; Neil Kettle <mu-b@digit-labs.org>
; Small modifications played around with by,
; Erick Couts II <cryo_rebirth@yahoo.com>
; ~18% performance improvement

; SHA-256 CPU SSE cruncher for Bitcoin Miner

ALIGN 32
BITS 64

; Argument registers (first three integer arguments, System V AMD64 order).
; hash: destination, receives eight 16-byte rows (hash .. hash+112).
; data: message schedule, one 16-byte row (four 32-bit lanes) per word index.
; init: eight 32-bit starting state words a..h (two 16-byte loads).
; All three are accessed with movdqa/movntdqa, which require 16-byte alignment.
%define hash rdi
%define data rsi
%define init rdx

; The expansion loop fills bytes 256..1023 of data; that 768-byte span must be
; a whole number of passes of LAB_CALC_UNROLL*LAB_CALC_PARA*16 bytes:
; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
; LAB_CALC_PARA   = rows produced by one lab_calc_blk invocation.
; LAB_CALC_UNROLL = lab_calc_blk invocations per pass of the LAB_CALC loop.
%define LAB_CALC_PARA 2
%define LAB_CALC_UNROLL 8

; Rounds (lab_loop_blk invocations) per pass of the LAB_LOOP loop.
%define LAB_LOOP_UNROLL 8

; Round-constant table, defined outside this file. It is indexed with the same
; byte offset as data, so presumably one 16-byte row per round - confirm
; against its definition.
extern g_4sha256_k

global CalcSha256_x64_sse4
; CalcSha256 hash(rdi), data(rsi), init(rdx)
; Expands the schedule in place inside data (rows 16..63), runs the rounds from
; the state in init, adds init back in and stores the result to hash.
; Uses rax, rcx, r11 and xmm0-xmm11.
CalcSha256_x64_sse4:

; rbx is saved here and restored at LAB_RET; nothing else in this listing
; reads or writes it.
push rbx

; LAB_NEXT_NONCE and LAB_SHA are not the target of any jump in this listing.
LAB_NEXT_NONCE:

; Round-loop bound for rax: rax advances by 4 per round and is scaled by 4 when
; addressing, so 256 corresponds to 64 rounds of one 16-byte row each.
mov rcx, 256 ; 256 - rcx is # of SHA-2 rounds
; mov rax, 64 ; 64 - rax is where we expand to

LAB_SHA:
push rcx ; keep the round bound; rcx is reused below as an end pointer
lea rcx, qword [data+1024] ; + 1024 : one past the last schedule row (row 64)
lea r11, qword [data+256] ; + 256 : first row to be computed (row 16)

; Top of the schedule-expansion loop (the macro definition below emits nothing
; by itself; the label lands on the first unrolled lab_calc_blk).
LAB_CALC:
%macro lab_calc_blk 1
; lab_calc_blk n -- compute two consecutive message-schedule rows.
;   %1  : row offset (in 16-byte rows) relative to r11; rows %1 and %1+1 are written.
;   r11 : address of the row that %1 = 0 refers to.
; With I = the row being produced (the "+1" comments are the second row):
;   W[I]  = s0(W[I-15]) + W[I-16] + W[I-7] + s1(W[I-2])
;   s0(x) = (x>>3) ^ (x>>7) ^ (x<<25) ^ (x>>18) ^ (x<<14)
;   s1(x) = (x>>10) ^ (x>>17) ^ (x<<15) ^ (x>>19) ^ (x<<13)
; Shifts and adds act on each 32-bit lane independently (psrld/pslld/paddd).
; Clobbers xmm0-xmm7. The two rows are interleaved instruction by instruction.
; prefetcht0 [r11-(15-%1)*16]
; prefetcht0 [r11-(15-(%1+1))*16]

; Each s0 input is loaded once and then copied register-to-register.
movntdqa xmm0, [r11-(15-%1)*16] ; xmm0 = W[I-15]
movdqa xmm1, xmm0 ; xmm1 = W[I-15]
movdqa xmm2, xmm0 ; xmm2 = W[I-15]
movntdqa xmm3, [r11-(2-%1)*16] ; xmm3 = W[I-2]
movntdqa xmm4, [r11-(15-(%1+1))*16] ; xmm4 = W[I-15+1]
movdqa xmm5, xmm4 ; xmm5 = W[I-15+1]
movdqa xmm6, xmm4 ; xmm6 = W[I-15+1] (was the misspelled "movtdqa", which does not assemble)
movntdqa xmm7, [r11-(2-(%1+1))*16] ; xmm7 = W[I-2+1]

; --- s0 for both rows: result accumulates in xmm0 (row I) and xmm4 (row I+1)
psrld xmm0, 3 ; xmm0 = W[I-15] >> 3
psrld xmm1, 7 ; xmm1 = W[I-15] >> 7 (independent of xmm0)
psrld xmm4, 3 ; xmm4 = W[I-15+1] >> 3
psrld xmm5, 7 ; xmm5 = W[I-15+1] >> 7
pslld xmm2, 14 ; xmm2 = W[I-15] << 14

pxor xmm4, xmm5 ; xmm4 = (>>3) ^ (>>7)                          [row I+1]
pslld xmm6, 14 ; xmm6 = W[I-15+1] << 14

pxor xmm0, xmm1 ; xmm0 = (>>3) ^ (>>7)                          [row I]
pxor xmm0, xmm2 ; xmm0 ^= W[I-15] << 14
psrld xmm1, 11 ; xmm1 = W[I-15] >> 18 (7 + 11)
psrld xmm5, 11 ; xmm5 = W[I-15+1] >> 18
pxor xmm4, xmm6 ; xmm4 ^= W[I-15+1] << 14
pxor xmm4, xmm5 ; xmm4 ^= W[I-15+1] >> 18
pslld xmm2, 11 ; xmm2 = W[I-15] << 25 (14 + 11)
pslld xmm6, 11 ; xmm6 = W[I-15+1] << 25
pxor xmm4, xmm6 ; xmm4 = s0(W[I-15+1])
pxor xmm0, xmm1 ; xmm0 ^= W[I-15] >> 18
pxor xmm0, xmm2 ; xmm0 = s0(W[I-15])
paddd xmm0, [r11-(16-%1)*16] ; xmm0 = s0(W[I-15]) + W[I-16]
paddd xmm4, [r11-(16-(%1+1))*16] ; xmm4 = s0(W[I-15+1]) + W[I-16+1]


;;;;;;;;;;;;;;;;;;
; --- s1 for both rows: result accumulates in xmm3 (row I) and xmm7 (row I+1)

movdqa xmm2, xmm3 ; xmm2 = W[I-2]
movdqa xmm1, xmm3 ; xmm1 = W[I-2] (unshifted copy)
movdqa xmm6, xmm7 ; xmm6 = W[I-2+1]
movdqa xmm5, xmm7 ; xmm5 = W[I-2+1] (unshifted copy)


paddd xmm0, [r11-(7-%1)*16] ; xmm0 = s0(W[I-15]) + W[I-16] + W[I-7]
paddd xmm4, [r11-(7-(%1+1))*16] ; xmm4 = s0(W[I-15+1]) + W[I-16+1] + W[I-7+1]

psrld xmm1, 17 ; xmm1 = W[I-2] >> 17
pslld xmm2, 13 ; xmm2 = W[I-2] << 13
psrld xmm3, 10 ; xmm3 = W[I-2] >> 10
psrld xmm5, 17 ; xmm5 = W[I-2+1] >> 17
pslld xmm6, 13 ; xmm6 = W[I-2+1] << 13
psrld xmm7, 10 ; xmm7 = W[I-2+1] >> 10

pxor xmm3, xmm1 ; xmm3 = (>>10) ^ (>>17)                        [row I]
pxor xmm3, xmm2 ; xmm3 ^= W[I-2] << 13
pxor xmm7, xmm5 ; xmm7 = (>>10) ^ (>>17)                        [row I+1]
pxor xmm7, xmm6 ; xmm7 ^= W[I-2+1] << 13

psrld xmm1, 2 ; xmm1 = W[I-2] >> 19 (17 + 2)
psrld xmm5, 2 ; xmm5 = W[I-2+1] >> 19
pslld xmm2, 2 ; xmm2 = W[I-2] << 15 (13 + 2)
pslld xmm6, 2 ; xmm6 = W[I-2+1] << 15

pxor xmm3, xmm1 ; xmm3 ^= W[I-2] >> 19
pxor xmm3, xmm2 ; xmm3 = s1(W[I-2])
pxor xmm7, xmm5 ; xmm7 ^= W[I-2+1] >> 19
pxor xmm7, xmm6 ; xmm7 = s1(W[I-2+1])
paddd xmm0, xmm3 ; xmm0 = s0(W[I-15]) + W[I-16] + s1(W[I-2]) + W[I-7]
paddd xmm4, xmm7 ; xmm4 = s0(W[I-15+1]) + W[I-16+1] + s1(W[I-2+1]) + W[I-7+1]

; Store the two finished rows (second row first in this revision).
movdqa [r11+((%1+1)*16)], xmm4
movdqa [r11+(%1*16)], xmm0
%endmacro

; Unrolled body of the LAB_CALC loop: lab_calc_blk 0, 2, 4, ...
; (LAB_CALC_UNROLL invocations, each writing LAB_CALC_PARA rows).
%assign i 0
%rep    LAB_CALC_UNROLL
        lab_calc_blk i
%assign i i+LAB_CALC_PARA
%endrep
; prefetchnta [rcx]

add r11, LAB_CALC_UNROLL*LAB_CALC_PARA*16 ; step past the rows just written
cmp r11, rcx ; rcx = data+1024, set before LAB_CALC
jb LAB_CALC ; unsigned: repeat until r11 reaches the end address
prefetchnta [init+16] ; hint only: init is loaded just below
pop rcx ; restore the round-loop bound pushed at LAB_SHA
mov rax, 0 ; rax = row offset for the round loop, starting at row 0

; Load the init values of the message into the hash.
; Each pshufd broadcasts one 32-bit word of init to all four lanes. xmm7 and
; xmm0 hold the loaded data, so their own broadcast is done last in each group.

movntdqa xmm7, [init]
movntdqa xmm0, [init+16]
pshufd xmm5, xmm7, 0x55 ; xmm5 == b
pshufd xmm4, xmm7, 0xAA ; xmm4 == c
pshufd xmm3, xmm7, 0xFF ; xmm3 == d
pshufd xmm7, xmm7, 0 ; xmm7 == a
pshufd xmm8, xmm0, 0x55 ; xmm8 == f
pshufd xmm9, xmm0, 0xAA ; xmm9 == g
pshufd xmm10, xmm0, 0xFF ; xmm10 == h
pshufd xmm0, xmm0, 0 ; xmm0 == e

; Top of the round loop; each pass runs LAB_LOOP_UNROLL rounds.
LAB_LOOP:

;; T t1 = h + (Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)) + ((e & f) ^ AndNot(e, g)) + Expand32<T>(g_sha256_k[j]) + w[j]

%macro lab_loop_blk 0
; lab_loop_blk -- one compression round on four 32-bit lanes in parallel.
; State on entry and exit: a=xmm7 b=xmm5 c=xmm4 d=xmm3 e=xmm0 f=xmm8 g=xmm9 h=xmm10.
; Scratch: xmm1, xmm2, xmm6 (xmm6 carries t1).
; rax*4 is the byte offset of the current row; rax advances by 4 per round.
;   t1    = h + S1(e) + Ch(e,f,g) + k + w
;   new e = d + t1
;   new a = t1 + S0(a) + Maj(a,b,c)
; prefetchnta [rax*4]
movntdqa xmm6, [data+rax*4] ; xmm6 = w row for this round
paddd xmm6, g_4sha256_k[rax*4] ; + k row at the same offset
add rax, 4 ; next row

paddd xmm6, xmm10 ; +h

; --- Ch(e,f,g) = (e & f) ^ (~e & g), folded together with the h<-g<-f rotation
movdqa xmm1, xmm0 ; xmm1 = e
; movdqa xmm2, xmm9 ; It's redundant unless xmm9 becomes a destination
movdqa xmm10, xmm9 ; h = g  Changed from xmm2 to xmm9
movdqa xmm9, xmm8 ; g = f
movdqa xmm2, xmm8 ; xmm2 = f (scratch copy for the AND below)

pand xmm2, xmm0 ; e & f
pandn xmm1, xmm10 ; ~e & g (old g now lives in xmm10, copied above)
pxor xmm1, xmm2 ; (e & f) ^ (~e & g)
paddd xmm6, xmm1 ; Ch + h + w[i] + k[i]

; --- S1(e) = Rotr(e,6) ^ Rotr(e,11) ^ Rotr(e,25), built from shift pairs
movdqa xmm1, xmm0
movdqa xmm2, xmm0
movdqa xmm8, xmm0 ; f = e Combining these three moves for processor hardware optimization
psrld xmm0, 6 ; e >> 6
psrld xmm2, 11 ; e >> 11
pslld xmm1, 7 ; e << 7
pxor xmm0, xmm1
pxor xmm0, xmm2
pslld xmm1, 14 ; e << 21 (7 + 14)
psrld xmm2, 14 ; e >> 25 (11 + 14)
pxor xmm0, xmm1
pxor xmm0, xmm2
pslld xmm1, 5 ; e << 26 (21 + 5)
pxor xmm0, xmm1 ; Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)
paddd xmm6, xmm0 ; xmm6 = t1
paddd xmm3, xmm6 ; e = d+t1

; --- Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c), folded with the d<-c rotation
movdqa xmm0, xmm3 ; xmm0 = new e (d + t1)
movdqa xmm1, xmm5 ; xmm1 = b
movdqa xmm2, xmm4 ; xmm2 = c
movdqa xmm3, xmm2 ; d = c
pand xmm2, xmm5 ; b & c
pand xmm4, xmm7 ; a & c
pand xmm1, xmm7 ; a & b
pxor xmm1, xmm4 ; (a & b) ^ (a & c)
pxor xmm1, xmm2 ; (a & b) ^ (a & c) ^ (b & c)
paddd xmm6, xmm1 ; t1 + Maj(a,b,c)

; --- S0(a) = Rotr(a,2) ^ Rotr(a,13) ^ Rotr(a,22), with the c<-b<-a rotation
movdqa xmm4, xmm5 ; c = b
movdqa xmm5, xmm7 ; b = a
movdqa xmm2, xmm7
movdqa xmm1, xmm7
psrld xmm1, 13 ; a >> 13
psrld xmm7, 2 ; a >> 2
pslld xmm2, 10 ; a << 10
pxor xmm7, xmm2
pxor xmm7, xmm1
pslld xmm2, 9 ; a << 19 (10 + 9)
psrld xmm1, 9 ; a >> 22 (13 + 9)
pxor xmm7, xmm2
pxor xmm7, xmm1
pslld xmm2, 11 ; a << 30 (19 + 11)
pxor xmm7, xmm2 ; xmm7 = S0(a)
paddd xmm7, xmm6 ; a = t1 + (Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22)) + Maj(a,b,c)
%endmacro

; Unrolled body of the LAB_LOOP loop: LAB_LOOP_UNROLL consecutive rounds.
%assign i 0
%rep    LAB_LOOP_UNROLL
        lab_loop_blk
%assign i i+1
%endrep

; rcx holds the round bound (256); rax has advanced by 4 per round.
cmp rax, rcx
jb LAB_LOOP

; All rounds done. Feed-forward: broadcast each starting word of init to all
; four lanes and add it to the matching working register. Both halves of init
; are read before anything is written to hash.

; First half: words for a, b, c, d.
movntdqa xmm1, [init]
pshufd xmm2, xmm1, 0x55
pshufd xmm6, xmm1, 0xAA
pshufd xmm11, xmm1, 0xFF
pshufd xmm1, xmm1, 0x00 ; overwrites the loaded row, so it goes last
paddd xmm7, xmm1 ; a
paddd xmm5, xmm2 ; b
paddd xmm4, xmm6 ; c
paddd xmm3, xmm11 ; d

; Second half: words for e, f, g, h.
movntdqa xmm1, [init+16]
pshufd xmm2, xmm1, 0x55
pshufd xmm6, xmm1, 0xAA
pshufd xmm11, xmm1, 0xFF
pshufd xmm1, xmm1, 0x00
paddd xmm0, xmm1 ; e
paddd xmm8, xmm2 ; f
paddd xmm9, xmm6 ; g
paddd xmm10, xmm11 ; h

; Store the eight result rows, a..h, 16 bytes apart.
movdqa [hash], xmm7
movdqa [hash+16], xmm5
movdqa [hash+32], xmm4
movdqa [hash+48], xmm3
movdqa [hash+64], xmm0
movdqa [hash+80], xmm8
movdqa [hash+96], xmm9
movdqa [hash+112], xmm10

LAB_RET:
pop rbx ; matches the push at function entry
ret
I've commented out some of the optimizations I've been playing around with so you can see what I've been trying.  It seemed like the prefetches actually slowed the code down for me.  AMD users might have different results.  Here, I've taken the liberty of even supplying the AMD users with the SSE2 code for ease of use.  I ended up leaving in the loop modifications I made just because I couldn't tell much difference honestly.  But I'm going to bed.
Code:
;; SHA-256 for X86-64 for Linux, based off of:

; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com
; Version 2011
; This software is Public Domain

; Significant re-write/optimisation and reordering by,
; Neil Kettle <mu-b@digit-labs.org>
; Small modifications played around with by,
; Erick Couts II <cryo_rebirth@yahoo.com>
; ~18% performance improvement

; SHA-256 CPU SSE cruncher for Bitcoin Miner

ALIGN 32
BITS 64

; Argument registers (first three integer arguments, System V AMD64 order).
; hash: destination, receives eight 16-byte rows (hash .. hash+112).
; data: message schedule, one 16-byte row (four 32-bit lanes) per word index.
; init: eight 32-bit starting state words a..h (two 16-byte loads).
; All three are accessed with movdqa, which requires 16-byte alignment.
%define hash rdi
%define data rsi
%define init rdx

; The expansion loop fills bytes 256..1023 of data; that 768-byte span must be
; a whole number of passes of LAB_CALC_UNROLL*LAB_CALC_PARA*16 bytes:
; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
; LAB_CALC_PARA   = rows produced by one lab_calc_blk invocation.
; LAB_CALC_UNROLL = lab_calc_blk invocations per pass of the LAB_CALC loop.
%define LAB_CALC_PARA 2
%define LAB_CALC_UNROLL 8

; Rounds (lab_loop_blk invocations) per pass of the LAB_LOOP loop.
%define LAB_LOOP_UNROLL 8

; Round-constant table, defined outside this file. It is indexed with the same
; byte offset as data, so presumably one 16-byte row per round - confirm
; against its definition.
extern g_4sha256_k

global CalcSha256_x64
; CalcSha256 hash(rdi), data(rsi), init(rdx)
; Same routine as the _sse4 listing, with every movntdqa load replaced by
; movdqa. Expands the schedule in place inside data (rows 16..63), runs the
; rounds from the state in init, adds init back in and stores the result to
; hash. Uses rax, rcx, r11 and xmm0-xmm11.
CalcSha256_x64:

; rbx is saved here and restored at LAB_RET; nothing else in this listing
; reads or writes it.
push rbx

; LAB_NEXT_NONCE and LAB_SHA are not the target of any jump in this listing.
LAB_NEXT_NONCE:

; Round-loop bound for rax: rax advances by 4 per round and is scaled by 4 when
; addressing, so 256 corresponds to 64 rounds of one 16-byte row each.
mov rcx, 256 ; 256 - rcx is # of SHA-2 rounds
; mov rax, 64 ; 64 - rax is where we expand to

LAB_SHA:
push rcx ; keep the round bound; rcx is reused below as an end pointer
lea rcx, qword [data+1024] ; + 1024 : one past the last schedule row (row 64)
lea r11, qword [data+256] ; + 256 : first row to be computed (row 16)

; Top of the schedule-expansion loop (the macro definition below emits nothing
; by itself; the label lands on the first unrolled lab_calc_blk).
LAB_CALC:
%macro lab_calc_blk 1
; lab_calc_blk n -- compute two consecutive message-schedule rows.
;   %1  : row offset (in 16-byte rows) relative to r11; rows %1 and %1+1 are written.
;   r11 : address of the row that %1 = 0 refers to.
; With I = the row being produced (the "+1" comments are the second row):
;   W[I]  = s0(W[I-15]) + W[I-16] + W[I-7] + s1(W[I-2])
;   s0(x) = (x>>3) ^ (x>>7) ^ (x<<25) ^ (x>>18) ^ (x<<14)
;   s1(x) = (x>>10) ^ (x>>17) ^ (x<<15) ^ (x>>19) ^ (x<<13)
; Shifts and adds act on each 32-bit lane independently (psrld/pslld/paddd).
; Clobbers xmm0-xmm7. The two rows are interleaved instruction by instruction.
; prefetcht0 [r11-(15-%1)*16]
; prefetcht0 [r11-(15-(%1+1))*16]

; Each s0 input is loaded once and then copied register-to-register.
movdqa xmm0, [r11-(15-%1)*16] ; xmm0 = W[I-15]
movdqa xmm1, xmm0 ; xmm1 = W[I-15]
movdqa xmm2, xmm0 ; xmm2 = W[I-15]
movdqa xmm3, [r11-(2-%1)*16] ; xmm3 = W[I-2]
movdqa xmm4, [r11-(15-(%1+1))*16] ; xmm4 = W[I-15+1]
movdqa xmm5, xmm4 ; xmm5 = W[I-15+1]
movdqa xmm6, xmm4 ; xmm6 = W[I-15+1]
movdqa xmm7, [r11-(2-(%1+1))*16] ; xmm7 = W[I-2+1]

; --- s0 for both rows: result accumulates in xmm0 (row I) and xmm4 (row I+1)
psrld xmm0, 3 ; xmm0 = W[I-15] >> 3
psrld xmm1, 7 ; xmm1 = W[I-15] >> 7 (independent of xmm0)
psrld xmm4, 3 ; xmm4 = W[I-15+1] >> 3
psrld xmm5, 7 ; xmm5 = W[I-15+1] >> 7
pslld xmm2, 14 ; xmm2 = W[I-15] << 14

pxor xmm4, xmm5 ; xmm4 = (>>3) ^ (>>7)                          [row I+1]
pslld xmm6, 14 ; xmm6 = W[I-15+1] << 14

pxor xmm0, xmm1 ; xmm0 = (>>3) ^ (>>7)                          [row I]
pxor xmm0, xmm2 ; xmm0 ^= W[I-15] << 14
psrld xmm1, 11 ; xmm1 = W[I-15] >> 18 (7 + 11)
psrld xmm5, 11 ; xmm5 = W[I-15+1] >> 18
pxor xmm4, xmm6 ; xmm4 ^= W[I-15+1] << 14
pxor xmm4, xmm5 ; xmm4 ^= W[I-15+1] >> 18
pslld xmm2, 11 ; xmm2 = W[I-15] << 25 (14 + 11)
pslld xmm6, 11 ; xmm6 = W[I-15+1] << 25
pxor xmm4, xmm6 ; xmm4 = s0(W[I-15+1])
pxor xmm0, xmm1 ; xmm0 ^= W[I-15] >> 18
pxor xmm0, xmm2 ; xmm0 = s0(W[I-15])
paddd xmm0, [r11-(16-%1)*16] ; xmm0 = s0(W[I-15]) + W[I-16]
paddd xmm4, [r11-(16-(%1+1))*16] ; xmm4 = s0(W[I-15+1]) + W[I-16+1]


;;;;;;;;;;;;;;;;;;
; --- s1 for both rows: result accumulates in xmm3 (row I) and xmm7 (row I+1)

movdqa xmm2, xmm3 ; xmm2 = W[I-2]
movdqa xmm1, xmm3 ; xmm1 = W[I-2] (unshifted copy)
movdqa xmm6, xmm7 ; xmm6 = W[I-2+1]
movdqa xmm5, xmm7 ; xmm5 = W[I-2+1] (unshifted copy)


paddd xmm0, [r11-(7-%1)*16] ; xmm0 = s0(W[I-15]) + W[I-16] + W[I-7]
paddd xmm4, [r11-(7-(%1+1))*16] ; xmm4 = s0(W[I-15+1]) + W[I-16+1] + W[I-7+1]

psrld xmm1, 17 ; xmm1 = W[I-2] >> 17
psrld xmm3, 10 ; xmm3 = W[I-2] >> 10
psrld xmm5, 17 ; xmm5 = W[I-2+1] >> 17
psrld xmm7, 10 ; xmm7 = W[I-2+1] >> 10
pslld xmm2, 13 ; xmm2 = W[I-2] << 13
pslld xmm6, 13 ; xmm6 = W[I-2+1] << 13

pxor xmm3, xmm1 ; xmm3 = (>>10) ^ (>>17)                        [row I]
pxor xmm3, xmm2 ; xmm3 ^= W[I-2] << 13
pxor xmm7, xmm5 ; xmm7 = (>>10) ^ (>>17)                        [row I+1]
pxor xmm7, xmm6 ; xmm7 ^= W[I-2+1] << 13

psrld xmm1, 2 ; xmm1 = W[I-2] >> 19 (17 + 2)
psrld xmm5, 2 ; xmm5 = W[I-2+1] >> 19
pslld xmm2, 2 ; xmm2 = W[I-2] << 15 (13 + 2)
pslld xmm6, 2 ; xmm6 = W[I-2+1] << 15

pxor xmm3, xmm1 ; xmm3 ^= W[I-2] >> 19
pxor xmm3, xmm2 ; xmm3 = s1(W[I-2])
pxor xmm7, xmm5 ; xmm7 ^= W[I-2+1] >> 19
pxor xmm7, xmm6 ; xmm7 = s1(W[I-2+1])
paddd xmm0, xmm3 ; xmm0 = s0(W[I-15]) + W[I-16] + s1(W[I-2]) + W[I-7]
paddd xmm4, xmm7 ; xmm4 = s0(W[I-15+1]) + W[I-16+1] + s1(W[I-2+1]) + W[I-7+1]

; Store the two finished rows (second row first in this revision).
movdqa [r11+((%1+1)*16)], xmm4
movdqa [r11+(%1*16)], xmm0
%endmacro

; Unrolled body of the LAB_CALC loop: lab_calc_blk 0, 2, 4, ...
; (LAB_CALC_UNROLL invocations, each writing LAB_CALC_PARA rows).
%assign i 0
%rep    LAB_CALC_UNROLL
        lab_calc_blk i
%assign i i+LAB_CALC_PARA
%endrep
; prefetchnta [rcx]

add r11, LAB_CALC_UNROLL*LAB_CALC_PARA*16 ; step past the rows just written
cmp r11, rcx ; rcx = data+1024, set before LAB_CALC
jb LAB_CALC ; unsigned: repeat until r11 reaches the end address
prefetchnta [init+16] ; hint only: init is loaded just below
pop rcx ; restore the round-loop bound pushed at LAB_SHA
mov rax, 0 ; rax = row offset for the round loop, starting at row 0

; Load the init values of the message into the hash.
; Each pshufd broadcasts one 32-bit word of init to all four lanes. xmm7 and
; xmm0 hold the loaded data, so their own broadcast is done last in each group.

movdqa xmm7, [init]
movdqa xmm0, [init+16]
pshufd xmm5, xmm7, 0x55 ; xmm5 == b
pshufd xmm4, xmm7, 0xAA ; xmm4 == c
pshufd xmm3, xmm7, 0xFF ; xmm3 == d
pshufd xmm7, xmm7, 0 ; xmm7 == a
pshufd xmm8, xmm0, 0x55 ; xmm8 == f
pshufd xmm9, xmm0, 0xAA ; xmm9 == g
pshufd xmm10, xmm0, 0xFF ; xmm10 == h
pshufd xmm0, xmm0, 0 ; xmm0 == e

; Top of the round loop; each pass runs LAB_LOOP_UNROLL rounds.
LAB_LOOP:

;; T t1 = h + (Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)) + ((e & f) ^ AndNot(e, g)) + Expand32<T>(g_sha256_k[j]) + w[j]

%macro lab_loop_blk 0
; lab_loop_blk -- one compression round on four 32-bit lanes in parallel.
; State on entry and exit: a=xmm7 b=xmm5 c=xmm4 d=xmm3 e=xmm0 f=xmm8 g=xmm9 h=xmm10.
; Scratch: xmm1, xmm2, xmm6 (xmm6 carries t1).
; rax*4 is the byte offset of the current row; rax advances by 4 per round.
;   t1    = h + S1(e) + Ch(e,f,g) + k + w
;   new e = d + t1
;   new a = t1 + S0(a) + Maj(a,b,c)
; prefetchnta [rax*4]
movdqa xmm6, [data+rax*4] ; xmm6 = w row for this round
paddd xmm6, g_4sha256_k[rax*4] ; + k row at the same offset
add rax, 4 ; next row

paddd xmm6, xmm10 ; +h

; --- Ch(e,f,g) = (e & f) ^ (~e & g), folded together with the h<-g<-f rotation
movdqa xmm1, xmm0 ; xmm1 = e
; movdqa xmm2, xmm9 ; It's redundant unless xmm9 becomes a destination
movdqa xmm10, xmm9 ; h = g  Changed from xmm2 to xmm9
movdqa xmm9, xmm8 ; g = f
movdqa xmm2, xmm8 ; xmm2 = f (scratch copy for the AND below)

pand xmm2, xmm0 ; e & f
pandn xmm1, xmm10 ; ~e & g (old g now lives in xmm10, copied above)
pxor xmm1, xmm2 ; (e & f) ^ (~e & g)
paddd xmm6, xmm1 ; Ch + h + w[i] + k[i]

; --- S1(e) = Rotr(e,6) ^ Rotr(e,11) ^ Rotr(e,25), built from shift pairs
movdqa xmm1, xmm0
movdqa xmm2, xmm0
movdqa xmm8, xmm0 ; f = e Combining these three moves for processor hardware optimization
psrld xmm0, 6 ; e >> 6
psrld xmm2, 11 ; e >> 11
pslld xmm1, 7 ; e << 7
pxor xmm0, xmm1
pxor xmm0, xmm2
pslld xmm1, 14 ; e << 21 (7 + 14)
psrld xmm2, 14 ; e >> 25 (11 + 14)
pxor xmm0, xmm1
pxor xmm0, xmm2
pslld xmm1, 5 ; e << 26 (21 + 5)
pxor xmm0, xmm1 ; Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)
paddd xmm6, xmm0 ; xmm6 = t1
paddd xmm3, xmm6 ; e = d+t1

; --- Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c), folded with the d<-c rotation
movdqa xmm0, xmm3 ; xmm0 = new e (d + t1)
movdqa xmm1, xmm5 ; xmm1 = b
movdqa xmm2, xmm4 ; xmm2 = c
movdqa xmm3, xmm2 ; d = c
pand xmm2, xmm5 ; b & c
pand xmm4, xmm7 ; a & c
pand xmm1, xmm7 ; a & b
pxor xmm1, xmm4 ; (a & b) ^ (a & c)
pxor xmm1, xmm2 ; (a & b) ^ (a & c) ^ (b & c)
paddd xmm6, xmm1 ; t1 + Maj(a,b,c)

; --- S0(a) = Rotr(a,2) ^ Rotr(a,13) ^ Rotr(a,22), with the c<-b<-a rotation
movdqa xmm4, xmm5 ; c = b
movdqa xmm5, xmm7 ; b = a
movdqa xmm2, xmm7
movdqa xmm1, xmm7
psrld xmm1, 13 ; a >> 13
psrld xmm7, 2 ; a >> 2
pslld xmm2, 10 ; a << 10
pxor xmm7, xmm2
pxor xmm7, xmm1
pslld xmm2, 9 ; a << 19 (10 + 9)
psrld xmm1, 9 ; a >> 22 (13 + 9)
pxor xmm7, xmm2
pxor xmm7, xmm1
pslld xmm2, 11 ; a << 30 (19 + 11)
pxor xmm7, xmm2 ; xmm7 = S0(a)
paddd xmm7, xmm6 ; a = t1 + (Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22)) + Maj(a,b,c)
%endmacro

; Unrolled body of the LAB_LOOP loop: LAB_LOOP_UNROLL consecutive rounds.
%assign i 0
%rep    LAB_LOOP_UNROLL
        lab_loop_blk
%assign i i+1
%endrep

; rcx holds the round bound (256); rax has advanced by 4 per round.
cmp rax, rcx
jb LAB_LOOP

; All rounds done. Feed-forward: broadcast each starting word of init to all
; four lanes and add it to the matching working register. Both halves of init
; are read before anything is written to hash.

; First half: words for a, b, c, d.
movdqa xmm1, [init]
pshufd xmm2, xmm1, 0x55
pshufd xmm6, xmm1, 0xAA
pshufd xmm11, xmm1, 0xFF
pshufd xmm1, xmm1, 0x00 ; overwrites the loaded row, so it goes last
paddd xmm7, xmm1 ; a
paddd xmm5, xmm2 ; b
paddd xmm4, xmm6 ; c
paddd xmm3, xmm11 ; d

; Second half: words for e, f, g, h.
movdqa xmm1, [init+16]
pshufd xmm2, xmm1, 0x55
pshufd xmm6, xmm1, 0xAA
pshufd xmm11, xmm1, 0xFF
pshufd xmm1, xmm1, 0x00
paddd xmm0, xmm1 ; e
paddd xmm8, xmm2 ; f
paddd xmm9, xmm6 ; g
paddd xmm10, xmm11 ; h

; Store the eight result rows, a..h, 16 bytes apart.
movdqa [hash], xmm7
movdqa [hash+16], xmm5
movdqa [hash+32], xmm4
movdqa [hash+48], xmm3
movdqa [hash+64], xmm0
movdqa [hash+80], xmm8
movdqa [hash+96], xmm9
movdqa [hash+112], xmm10

LAB_RET:
pop rbx ; matches the push at function entry
ret

Funroll_Loops, the theoretically quicker breakfast cereal!
Check out http://www.facebook.com/JupiterICT for all of your computing needs.  If you need it, we can get it.  We have solutions for your computing conundrums.  BTC accepted!  12HWUSguWXRCQKfkPeJygVR1ex5wbg3hAq
zaytsev
Newbie
*
Offline Offline

Activity: 59
Merit: 0


View Profile
July 31, 2011, 12:54:29 PM
 #624

The point is that with a fork on github I can see exactly what you have changed as compared to the original files and also easily download your latest changes without tedious copy-pasting from the forum. Also you can easily pull ck's changes into your branch (single command needed for that) and when it will be ready you can just give him the changed files.

A fork on github doesn't mean that you are taking the code away and starting your own project, it's just a way to easily publish your changes for others to test and submit to the original project when ready.
RudeDude
Newbie
*
Offline Offline

Activity: 11
Merit: 0


View Profile
July 31, 2011, 02:29:41 PM
 #625

d3m0n1q_733rz, why wouldn't you create a fork on github? would be easier then copy-pasting and less error-prone.
A) I can't actually program from scratch and most of my changes or just logic based.
B) Almost nobody gives me input on how my changes affect their hashing anyway.
C) People might end up sending me incessant requests for changes that I couldn't keep up with.

Sorry to say but that version dropped performance by about 3-4%. There is definitely some variance in the cgminer speed reporting from moment to moment (from 6.7 to 7.4 Mh/s total for this 2 core setup) but over a ~2-3min period it seems to average out to reliable numbers.

Since I'm providing some feedback I should prolly tell you some hardware & compile details:
Code:
CFLAGS = -O3 -ffast-math -funroll-loops -mtune=native -march=native -msahf

vendor_id       : GenuineIntel
cpu family      : 6
model           : 15
model name      : Intel(R) Xeon(R) CPU            5160  @ 3.00GHz
stepping        : 11
cpu MHz         : 2992.227
cache size      : 4096 KB
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat
pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall lm constant_tsc arch_perfmon
pebs bts rep_good aperfmperf pni dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm
dca lahf_lm tpr_shadow vnmi flexpriority
d3m0n1q_733rz
Sr. Member
****
Offline Offline

Activity: 378
Merit: 250



View Profile WWW
July 31, 2011, 08:59:01 PM
 #626

d3m0n1q_733rz, why wouldn't you create a fork on github? would be easier then copy-pasting and less error-prone.
A) I can't actually program from scratch and most of my changes or just logic based.
B) Almost nobody gives me input on how my changes affect their hashing anyway.
C) People might end up sending me incessant requests for changes that I couldn't keep up with.

Sorry to say but that version dropped performance by about 3-4%. There is definitely some variance in the cgminer speed reporting from moment to moment (from 6.7 to 7.4 Mh/s total for this 2 core setup) but over a ~2-3min period it seems to average out to reliable numbers.

Since I'm providing some feedback I should prolly tell you some hardware & compile details:
Code:
CFLAGS = -O3 -ffast-math -funroll-loops -mtune=native -march=native -msahf

vendor_id       : GenuineIntel
cpu family      : 6
model           : 15
model name      : Intel(R) Xeon(R) CPU            5160  @ 3.00GHz
stepping        : 11
cpu MHz         : 2992.227
cache size      : 4096 KB
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat
pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall lm constant_tsc arch_perfmon
pebs bts rep_good aperfmperf pni dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm
dca lahf_lm tpr_shadow vnmi flexpriority
Yeah, the drop in performance is caused by a little taboo I ran into in the long hours of the night.  I forgot the 3-1 rule in relation to clock cycles.  In other words, I streamed together too many "expensive" commands without giving them time to complete before the next set.  I traded one optimization for another that didn't work as well.

Funroll_Loops, the theoretically quicker breakfast cereal!
Check out http://www.facebook.com/JupiterICT for all of your computing needs.  If you need it, we can get it.  We have solutions for your computing conundrums.  BTC accepted!  12HWUSguWXRCQKfkPeJygVR1ex5wbg3hAq
gigica viteazu`
Sr. Member
****
Offline Offline

Activity: 458
Merit: 250

beast at work


View Profile
July 31, 2011, 10:45:22 PM
 #627

i hope Ycros is passing by, sees this and give us a 1.5.3 win32 version
d3m0n1q_733rz
Sr. Member
****
Offline Offline

Activity: 378
Merit: 250



View Profile WWW
July 31, 2011, 11:57:04 PM
 #628

i hope Ycros is passing by, sees this and give us a 1.5.3 win32 version
I'm hoping someone more versed in asm passes by, sees my crappy attempt at simplifying, optimizing and updating the code and decides to help a guy out.  But I don't see that happening anytime soon.   : (

Funroll_Loops, the theoretically quicker breakfast cereal!
Check out http://www.facebook.com/JupiterICT for all of your computing needs.  If you need it, we can get it.  We have solutions for your computing conundrums.  BTC accepted!  12HWUSguWXRCQKfkPeJygVR1ex5wbg3hAq
jamesg
VIP
Legendary
*
Offline Offline

Activity: 1358
Merit: 1000


AKA: gigavps


View Profile
August 01, 2011, 01:26:23 AM
 #629

WOW!!!!

I moved from guiminer with phoenix to cgminer and can't believe my eyes! My stale rate has dropped from 1.3% to next to nothing and my hash rate has increased by 2-5 Mh/s per card. Thanks for the awesome miner.

Just a 1% savings in stales over time is going to make some pretty good $$$. Thanks for all of the hard work.

BTC to follow.
DBordello
Sr. Member
****
Offline Offline

Activity: 349
Merit: 250


BTCPak.com - Exchange your Bitcoins for MP!


View Profile WWW
August 01, 2011, 04:00:02 AM
 #630

New release: 1.5.3

Source:
http://ck.kolivas.org/apps/cgminer/cgminer-1.5.3.tar.bz2
http://ck.kolivas.org/apps/cgminer/cgminer-1.5.3.tar.lrz

Linux x86_64 dynamic binary:
http://ck.kolivas.org/apps/cgminer/cgminer-1.5.3-x86_64-built.tar.bz2
http://ck.kolivas.org/apps/cgminer/cgminer-1.5.3-x86_64-built.tar.lrz

Changelog:

- Significant work went into attempting to make the thread restart code robust
to identify sick threads, tag them SICK after 1 minute, then DEAD after 5
minutes of inactivity and try to restart them. Instead of re-initialising the
GPU completely, only a new cl context is created to avoid hanging the rest of
the GPUs should the dead GPU be hung irrevocably. Much thanks to sunbreak for testing various modes of failure on his 66 GPU cluster.
- Use correct application name in syslog.
- Get rid of extra line feeds.
- Use pkg-config to check for libcurl version
- Implement per-thread getwork count with proper accounting to not over-account
queued items when local work replaces it.
- Create a command queue from the program created from source which allows us
to flush the command queue in the hope it will not generate a zero sized binary
any more.
- Be more willing to get work from the backup pools if the work is simply being
queued faster than it is being retrieved.


So I'm leaving tomorrow for 10 days and I still don't have a windows binary post version 1.5.1 since I'm entirely dependent on Ycros for building one for me. If he shows up on these forums and posts a link to a windows build, consider it official. Otherwise, I'm afraid that's all I can do.

P.S. If you're wondering what a .lrz file is, that's my extreme compression format which also has extreme encryption capabilities.
http://lrzip.kolivas.org


1.5.3 appears to be hitting my backup pools WAY more often.  About 25% of the blocks appear to be going to a backup pool.  With 1.5.2 the backup pools didn't get hit once.  Am I seeing more efficient usage?  Or am I just spreading my love around?

www.BTCPak.com - Exchange your bitcoins for MP: Secure, Anonymous and Easy!
c_k
Donator
Full Member
*
Offline Offline

Activity: 242
Merit: 100



View Profile
August 01, 2011, 06:15:38 AM
 #631

Great work ckolivas!

It looks like this will become the miner of choice with all the slick features you are adding.

Could you look at adding an option for monitoring the GPU temperature and backing off when it hits a maximum value and not resuming until it hits another minimum value?

If you included this you would be negating the need to ever use anything else imo Smiley
You know, technically, that feature should be maintained by the GPU itself.  But I know that ufasoft has implemented it for some reason.  It's more of a safeguard against failure of the hardware's throttle.  Alternatively, you could try adjusting the fan speed of your card using free software so as to increase the fan speed at higher temps.  Could help to not reach that temperature.

Ah, I am guessing you aren't heavily involved in GPU mining - we already have our fans near full speed keeping our cards from within a few degrees of death, so we really need the source of the heat to cease when the affected GPU's temperature becomes too high, and then come back into operation once the temperature of the affected GPU(s) has dropped to a lower level, as opposed to something else.

This is what AOCLBF (Windows) and AutoMiner (Linux) do, and it is the most effective way to deal with this.

Things that could cause this to happen are usually only one of a few things:

The ambient room temperature rises due to an abnormal influence (extremely hot day) or something like that.

A miner will usually keep an eye on his machines from time to time to see if the temperature is not staying within reasonable limits in the longer term, however a degree of automation along the lines of self preservation when the machine is unattended is required.

This feature would truly be the icing on the cgminer cake imo, and would give everyone little reason to ever use any other miner Smiley

gigica viteazu`
Sr. Member
****
Offline Offline

Activity: 458
Merit: 250

beast at work


View Profile
August 01, 2011, 06:44:36 AM
 #632

... a degree of automation along the lines of self preservation when the machine is unattended is required.

For this I'm using a batch file which controls ClockTweak for monitoring and adjusting stuff.
c_k
Donator
Full Member
*
Offline Offline

Activity: 242
Merit: 100



View Profile
August 01, 2011, 07:36:03 AM
 #633

... a degree of automation along the lines of self preservation when the machine is unattended is required.

For this I'm using a batch file which controls ClockTweak for monitoring and adjusting stuff.

Unfortunately ClockTweak does not support going beyond the limits of ATI Catalyst Control Center so it is of no use to those of us who do  Sad

Ali
Member
**
Offline Offline

Activity: 84
Merit: 10


View Profile
August 01, 2011, 09:30:51 AM
 #634

Is it somehow possible to use this miner behind a proxy (+ firewall which is only open on port 80) which requires authentication?
zaytsev
Newbie
*
Offline Offline

Activity: 59
Merit: 0


View Profile
August 01, 2011, 09:34:40 AM
 #635

Yes, anything that curl can do, this miner can do. Try to export the http_proxy variable correctly, it used to work for me.
Ali
Member
**
Offline Offline

Activity: 84
Merit: 10


View Profile
August 01, 2011, 09:36:00 AM
 #636

But what about the company's firewall, which blocks access to any ports but 80?
zaytsev
Newbie
*
Offline Offline

Activity: 59
Merit: 0


View Profile
August 01, 2011, 09:41:20 AM
 #637

So what's the problem? If your proxy listening on port 80 allows for outbound connections to other ports it will work. If not, make your own proxy to listen on port 80 and chain them.
Ali
Member
**
Offline Offline

Activity: 84
Merit: 10


View Profile
August 01, 2011, 09:45:20 AM
 #638

If not, make your own proxy to listen on port 80 and chain them.

How do I chain them?
burtyb
Newbie
*
Offline Offline

Activity: 45
Merit: 0



View Profile WWW
August 01, 2011, 12:30:46 PM
 #639

I tried cgminer on a couple of my boxes last night and woke up to find 3/4 GPUs marked as DEAD on one and 2/4 DEAD on another. Trying to restart the DEAD GPUs doesn't seem to do anything on either machine (restarting the ones still running did seem to restart them OK). Both boxes have been running for weeks using poclbm without errors. Running the 1.5.3 binary on Ubuntu (3xHD5870 and HD6310).

Anyone else seeing problems with DEAD GPU that won't recover unless cgminer is restarted?

BB.
sirky
Sr. Member
****
Offline Offline

Activity: 404
Merit: 250



View Profile
August 01, 2011, 12:39:00 PM
 #640

I tried cgminer on a couple of my boxes last night and woke up to find 3/4 GPUs marked as DEAD on one and 2/4 DEAD on another. Trying to restart the DEAD GPUs doesn't seem to do anything on either machine (restarting the ones still running did seem to restart them OK). Both boxes have been running for weeks using poclbm without errors. Running the 1.5.3 binary on Ubuntu (3xHD5870 and HD6310).

Anyone else seeing problems with DEAD GPU that won't recover unless cgminer is restarted?

BB.

I do too, but only on my linux boxes.
Pages: « 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 [32] 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 ... 843 »
  Print  
 
Jump to:  

Powered by MySQL Powered by PHP Powered by SMF 1.1.19 | SMF © 2006-2009, Simple Machines Valid XHTML 1.0! Valid CSS!