OFFICIAL CGMINER mining software thread for linux/win/osx/mips/arm/r-pi 4.11.1

d3m0n1q_733rz

Sr. Member

Offline

Activity: 378
Merit: 250

Re: Official CGMINER thread - CPU/GPU miner in C for linux/windows/osx

July 31, 2011, 06:54:14 AM

#621

Code:

;; SHA-256 for X86-64 for Linux, based off of:

; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com
; Version 2011
; This software is Public Domain

; Significant re-write/optimisation and reordering by,
; Neil Kettle <mu-b@digit-labs.org>
; ~18% performance improvement

; SHA-256 CPU SSE cruncher for Bitcoin Miner

ALIGN 32
BITS 64

; Argument registers, named by the role they play in this routine.
%define hash rdi
%define data rsi
%define init rdx

; Expansion-loop geometry. LAB_CALC walks r11 from data+256 up to data+1024
; in steps of LAB_CALC_UNROLL*LAB_CALC_PARA*16 bytes, so the step has to
; divide the 768-byte span exactly:
; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
; LAB_CALC_PARA   = schedule entries produced by one lab_calc_blk expansion
; LAB_CALC_UNROLL = lab_calc_blk expansions per LAB_CALC pass
%define LAB_CALC_PARA	2
%define LAB_CALC_UNROLL	8

; lab_loop_blk (one round) expansions per LAB_LOOP pass.
%define LAB_LOOP_UNROLL 8

; Table that is added to each schedule entry in the round loop; defined elsewhere.
extern g_4sha256_k

global CalcSha256_x64_sse4
;-----------------------------------------------------------------------
;	CalcSha256	hash(rdi), data(rsi), init(rdx)
;
; SHA-256 compression over 16-byte vectors of four 32-bit lanes; every
; lane goes through the same arithmetic.
; NOTE(review): presumably one candidate nonce per lane - confirm with caller.
;
; In:    rdi = hash : output, eight 16-byte vectors (128 bytes) in order a..h
;        rsi = data : 1024-byte schedule buffer of 64 16-byte entries;
;                     entries 0..15 are read as input, entries 16..63
;                     (bytes 256..1023) are overwritten by this routine
;        rdx = init : 32 bytes, the eight 32-bit starting state words a..h
; Out:   [hash .. hash+127]; nothing is loaded into rax as a return value
; Clobb: rax, rcx, r11, xmm0-xmm11, flags
; Align: data, init and hash are accessed with movntdqa/movdqa, which
;        require 16-byte-aligned addresses.
; The argument registers follow the SysV AMD64 order (rdi, rsi, rdx).
;-----------------------------------------------------------------------
CalcSha256_x64_sse4:

	push	rbx						; restored at LAB_RET; rbx is not otherwise touched in this routine

LAB_NEXT_NONCE:

	mov	rcx, 256					; round-loop bound for rax: rax steps by 4 per round, so 256 means 64 rounds
;	mov	rax, 64					; 64 - rax is where we expand to

LAB_SHA:
	push	rcx						; keep the bound; rcx is reused below as the expansion end pointer
	lea	rcx, qword [data+1024]				; rcx = end of the schedule buffer (exclusive)
	lea	r11, qword [data+256]				; r11 = first entry to generate (entry 16)

LAB_CALC:
; lab_calc_blk IDX
; Emits straight-line code that produces two consecutive message-schedule
; entries, W[I] and W[I+1], where W[I] lives at r11 + IDX*16:
;   W[I]  = s0(W[I-15]) + W[I-16] + s1(W[I-2]) + W[I-7]
;   s0(x) = (x >> 3)  ^ (x >> 7)  ^ (x << 25) ^ (x >> 18) ^ (x << 14)
;   s1(x) = (x >> 10) ^ (x >> 17) ^ (x << 15) ^ (x >> 19) ^ (x << 13)
; (each >>/<< pair whose counts sum to 32 forms one 32-bit rotate).
; The two entries are interleaved: xmm0-xmm3 serve W[I], xmm4-xmm7 serve W[I+1].
; In:    r11 = address of the first entry written by the current LAB_CALC pass
;        %1  = IDX, index of W[I] within that pass (0, 2, 4, ...)
; Out:   [r11+IDX*16] = W[I], [r11+(IDX+1)*16] = W[I+1]
; Clobb: xmm0-xmm7
%macro	lab_calc_blk 1

	; This variant loads each W[I-15] slot three times instead of copying registers.
	movntdqa	xmm0, [r11-(15-%1)*16]				; xmm0 = W[I-15]
	movntdqa	xmm1, [r11-(15-%1)*16]				; xmm1 = W[I-15]
	movntdqa	xmm2, [r11-(15-%1)*16]				; xmm2 = W[I-15]
	movntdqa	xmm3, [r11-(2-%1)*16]				; xmm3 = W[I-2]
	movntdqa	xmm4, [r11-(15-(%1+1))*16]			; xmm4 = W[I-15+1]
	movntdqa	xmm5, [r11-(15-(%1+1))*16]			; xmm5 = W[I-15+1]
	movntdqa	xmm6, [r11-(15-(%1+1))*16]			; xmm6 = W[I-15+1]
	movntdqa	xmm7, [r11-(2-(%1+1))*16]			; xmm7 = W[I-2+1]

;	movdqa	xmm2, xmm0					; xmm2 = W[I-15]	
;	movdqa	xmm6, xmm4					; xmm6 = W[I-15+1]	

	; ---- s0 for both entries ----
	psrld	xmm0, 3						; xmm0 = W[I-15] >> 3
	psrld	xmm1, 7						; xmm1 = W[I-15] >> 7 (shifted from its own copy, independent of xmm0)
	psrld	xmm4, 3						; xmm4 = W[I-15+1] >> 3
	psrld	xmm5, 7						; xmm5 = W[I-15+1] >> 7
	pslld	xmm2, 14					; xmm2 = W[I-15] << 14

;	movdqa	xmm5, xmm4					; xmm5 = W[I-15+1] >> 3
;	movdqa	xmm1, xmm0					; xmm1 = W[I-15] >> 3	

	pxor	xmm4, xmm5					; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7)
	pslld	xmm6, 14					; xmm6 = W[I-15+1] << 14

	pxor	xmm0, xmm1					; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7)
	pxor	xmm0, xmm2					; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14)
	psrld	xmm1, 11					; xmm1 = W[I-15] >> 18 (7 + 11)
	psrld	xmm5, 11					; xmm5 = W[I-15+1] >> 18
	pxor	xmm4, xmm6					; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14)
	pxor	xmm4, xmm5					; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18)
	pslld	xmm2, 11					; xmm2 = W[I-15] << 25 (14 + 11)
	pslld	xmm6, 11					; xmm6 = W[I-15+1] << 25
	pxor	xmm4, xmm6					; xmm4 = s0(W[I-15+1])
	pxor	xmm0, xmm1					; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18)
	pxor	xmm0, xmm2					; xmm0 = s0(W[I-15])
	paddd	xmm0, [r11-(16-%1)*16]				; xmm0 = s0(W[I-15]) + W[I-16]
	paddd	xmm4, [r11-(16-(%1+1))*16]			; xmm4 = s0(W[I-15+1]) + W[I-16+1]


;;;;;;;;;;;;;;;;;;
	; ---- s1 for both entries ----

	movdqa	xmm2, xmm3					; xmm2 = W[I-2]
	psrld	xmm3, 10					; xmm3 = W[I-2] >> 10
	movdqa	xmm1, xmm3					; xmm1 = W[I-2] >> 10
	movdqa	xmm6, xmm7					; xmm6 = W[I-2+1]
	psrld	xmm7, 10					; xmm7 = W[I-2+1] >> 10
	movdqa	xmm5, xmm7					; xmm5 = W[I-2+1] >> 10

	paddd	xmm0, [r11-(7-%1)*16]				; xmm0 = s0(W[I-15]) + W[I-16] + W[I-7]
	paddd	xmm4, [r11-(7-(%1+1))*16]			; xmm4 = s0(W[I-15+1]) + W[I-16+1] + W[I-7+1]
	
	pslld	xmm2, 13					; xmm2 = W[I-2] << 13
	pslld	xmm6, 13					; xmm6 = W[I-2+1] << 13
	psrld	xmm1, 7						; xmm1 = W[I-2] >> 17 (10 + 7)
	psrld	xmm5, 7						; xmm5 = W[I-2+1] >> 17



	pxor	xmm3, xmm1					; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17)
	psrld	xmm1, 2						; xmm1 = W[I-2] >> 19
	pxor	xmm3, xmm2					; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13)
	pslld	xmm2, 2						; xmm2 = W[I-2] << 15
	pxor	xmm7, xmm5					; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17)
	psrld	xmm5, 2						; xmm5 = W[I-2+1] >> 19
	pxor	xmm7, xmm6					; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13)
	pslld	xmm6, 2						; xmm6 = W[I-2+1] << 15



	pxor	xmm3, xmm1					; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19)
	pxor	xmm3, xmm2					; xmm3 = s1(W[I-2])
	paddd	xmm0, xmm3					; xmm0 = s0(W[I-15]) + W[I-16] + s1(W[I-2]) + W[I-7]
	pxor	xmm7, xmm5					; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19)
	pxor	xmm7, xmm6					; xmm7 = s1(W[I-2+1])
	paddd	xmm4, xmm7					; xmm4 = s0(W[I-15+1]) + W[I-16+1] + s1(W[I-2+1]) + W[I-7+1]

	movdqa	[r11+(%1*16)], xmm0				; store W[I]
	movdqa	[r11+((%1+1)*16)], xmm4				; store W[I+1]
%endmacro

; Expand lab_calc_blk LAB_CALC_UNROLL times. i is the entry index handed to
; each expansion and advances by LAB_CALC_PARA, the number of entries one
; expansion produces.
%assign i 0
%rep    LAB_CALC_UNROLL
        lab_calc_blk i
%assign i i+LAB_CALC_PARA
%endrep

	add	r11, LAB_CALC_UNROLL*LAB_CALC_PARA*16	; step past the entries just written
	cmp	r11, rcx				; rcx = data+1024, end of the schedule buffer
	jb	LAB_CALC

	pop	rcx					; rcx = round-loop bound pushed at LAB_SHA
	xor	eax, eax				; rax = 0: round-loop index (writing eax clears the upper half);
							; flags are not read before the next cmp

; Load the init values of the message into the hash.
; Each 32-bit state word is broadcast to all four lanes of its own register.

	movntdqa	xmm7, [init]		; a, b, c, d
	movntdqa	xmm0, [init+16]		; e, f, g, h
	pshufd	xmm5, xmm7, 0x55		; xmm5 == b
	pshufd	xmm4, xmm7, 0xAA		; xmm4 == c
	pshufd	xmm3, xmm7, 0xFF		; xmm3 == d
	pshufd	xmm7, xmm7, 0			; xmm7 == a (done last: xmm7 is also the source)
	pshufd	xmm8, xmm0, 0x55		; xmm8 == f
	pshufd	xmm9, xmm0, 0xAA		; xmm9 == g
	pshufd	xmm10, xmm0, 0xFF		; xmm10 == h
	pshufd	xmm0, xmm0, 0			; xmm0 == e (done last: xmm0 is also the source)

LAB_LOOP:

;; T t1 = h + (Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)) + ((e & f) ^ AndNot(e, g)) + Expand32<T>(g_sha256_k[j]) + w[j]

; lab_loop_blk
; Emits one SHA-256 round on all four lanes and rotates the working
; variables in place.
; Register roles on entry and on exit:
;   xmm7 = a, xmm5 = b, xmm4 = c, xmm3 = d,
;   xmm0 = e, xmm8 = f, xmm9 = g, xmm10 = h
; In/Out: rax = index of the current round, advanced by 4 per round so that
;         [data+rax*4] and g_4sha256_k[rax*4] step by one 16-byte entry
;         (the rax*4 scaling is therefore needed, not redundant).
; Clobb:  xmm1, xmm2, xmm6 (scratch; xmm6 carries t1)
%macro	lab_loop_blk 0
	movntdqa	xmm6, [data+rax*4]	; w[j]
	paddd	xmm6, g_4sha256_k[rax*4]	; + k[j]
	add	rax, 4

	paddd	xmm6, xmm10	; +h

	; Ch(e, f, g) = (e & f) ^ (~e & g), interleaved with h = g and g = f
	movdqa	xmm1, xmm0	; e
;	movdqa	xmm2, xmm9	; It's redundant unless xmm9 becomes a destination
	movdqa	xmm10, xmm9	; h = g (reads xmm9 directly instead of a copy in xmm2)
	pandn	xmm1, xmm9	; ~e & g (xmm9 still holds the old g here)

	movdqa	xmm9, xmm8	; g = f (xmm9 is only overwritten after its last read above)
	movdqa	xmm2, xmm8	; f

	pand	xmm2, xmm0	; e & f
	pxor	xmm1, xmm2	; (e & f) ^ (~e & g)
	paddd	xmm6, xmm1	; Ch + h + w[i] + k[i]

	; Sigma1(e): each rotate is built from a right shift and a left shift
	movdqa	xmm1, xmm0
	movdqa	xmm2, xmm0
	movdqa	xmm8, xmm0	; f = e (the three copies of e are taken together)
	psrld	xmm0, 6		; e >> 6
	psrld	xmm2, 11	; e >> 11 (shifted from an unshifted copy, hence 11 rather than 5)
	pslld	xmm1, 7		; e << 7
	pxor	xmm0, xmm1
	pxor	xmm0, xmm2
	pslld	xmm1, 14	; e << 21
	psrld	xmm2, 14	; e >> 25
	pxor	xmm0, xmm1
	pxor	xmm0, xmm2
	pslld	xmm1, 5		; e << 26
	pxor	xmm0, xmm1	; Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)
	paddd	xmm6, xmm0	; xmm6 = t1
	paddd	xmm3, xmm6	; d + t1

	; Maj(a, b, c), interleaved with e = d + t1 and d = c
	movdqa	xmm0, xmm3	; e = d + t1
	movdqa	xmm1, xmm5	; b
	movdqa	xmm2, xmm4	; c
	movdqa	xmm3, xmm2	; d = c
	pand	xmm2, xmm5	; b & c
	pand	xmm4, xmm7	; a & c
	pand	xmm1, xmm7	; a & b
	pxor	xmm1, xmm4
	pxor	xmm1, xmm2	; (a & b) ^ (a & c) ^ (b & c)
	paddd	xmm6, xmm1	; t1 + Maj(a, b, c)

	; Sigma0(a), interleaved with c = b and b = a
	movdqa	xmm4, xmm5	; c = b
	movdqa	xmm5, xmm7	; b = a
	movdqa	xmm2, xmm7
	movdqa	xmm1, xmm7
	psrld	xmm7, 2		; a >> 2
	pslld	xmm2, 10	; a << 10
	psrld	xmm1, 13	; a >> 13
	pxor	xmm7, xmm2
	pxor	xmm7, xmm1
	pslld	xmm2, 9		; a << 19
	psrld	xmm1, 9		; a >> 22
	pxor	xmm7, xmm2
	pxor	xmm7, xmm1
	pslld	xmm2, 11	; a << 30
	pxor	xmm7, xmm2	; Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22)
	paddd	xmm7, xmm6	; a = t1 + (Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22)) + ((a & b) ^ (a & c) ^ (b & c));
%endmacro

; Emit LAB_LOOP_UNROLL rounds back to back, then repeat until the round
; index in rax reaches the bound held in rcx.
%assign i 0
%rep    LAB_LOOP_UNROLL
        lab_loop_blk
%assign i i+1
%endrep

	cmp	rax, rcx
	jb	LAB_LOOP

; All rounds done: add the starting state back into the working variables
; (each starting word broadcast to every lane), then write the result out.
; Both halves of init are read before anything is stored to hash.

	movntdqa	xmm1, [init]		; starting a, b, c, d
	pshufd	xmm2, xmm1, 0x55		; broadcast b
	pshufd	xmm6, xmm1, 0xAA		; broadcast c
	pshufd	xmm11, xmm1, 0xFF		; broadcast d
	pshufd	xmm1, xmm1, 0			; broadcast a (last: xmm1 is the source)
	paddd	xmm7, xmm1			; a += starting a
	paddd	xmm5, xmm2			; b += starting b
	paddd	xmm4, xmm6			; c += starting c
	paddd	xmm3, xmm11			; d += starting d

	movntdqa	xmm1, [init+16]		; starting e, f, g, h
	pshufd	xmm2, xmm1, 0x55		; broadcast f
	pshufd	xmm6, xmm1, 0xAA		; broadcast g
	pshufd	xmm11, xmm1, 0xFF		; broadcast h
	pshufd	xmm1, xmm1, 0			; broadcast e (last: xmm1 is the source)
	paddd	xmm0, xmm1			; e += starting e
	paddd	xmm8, xmm2			; f += starting f
	paddd	xmm9, xmm6			; g += starting g
	paddd	xmm10, xmm11			; h += starting h

	movdqa	[hash], xmm7			; a
	movdqa	[hash+16], xmm5			; b
	movdqa	[hash+32], xmm4			; c
	movdqa	[hash+48], xmm3			; d
	movdqa	[hash+64], xmm0			; e
	movdqa	[hash+80], xmm8			; f
	movdqa	[hash+96], xmm9			; g
	movdqa	[hash+112], xmm10		; h

LAB_RET:
	pop	rbx				; matches the push at function entry
	ret

SSE4 so far. I'm taking a break to watch anime. Cheesy

The changes take advantage of write-combining hardware. If you have it, great; if you don't, you won't notice much of a change. You probably won't notice much anyway, since the basic code structure is the same. Eh, oh well.
Edit: Slight slow-down in the lab-loop. I'll copy-paste the old code back in to fix it later. O_O Bleach is on!

Funroll_Loops, the theoretically quicker breakfast cereal!
Check out http://www.facebook.com/JupiterICT for all of your computing needs. If you need it, we can get it. We have solutions for your computing conundrums. BTC accepted! 12HWUSguWXRCQKfkPeJygVR1ex5wbg3hAq

zaytsev

Newbie

Offline

Activity: 59
Merit: 0

Re: Official CGMINER thread - CPU/GPU miner in C for linux/windows/osx

July 31, 2011, 09:21:26 AM

#622

d3m0n1q_733rz, why wouldn't you create a fork on GitHub? It would be easier than copy-pasting and less error-prone.

d3m0n1q_733rz

Sr. Member

Offline

Activity: 378
Merit: 250

Re: Official CGMINER thread - CPU/GPU miner in C for linux/windows/osx

July 31, 2011, 12:02:13 PM
Last edit: July 31, 2011, 12:36:44 PM by d3m0n1q_733rz

#623

Quote from: zaytsev on July 31, 2011, 09:21:26 AM

d3m0n1q_733rz, why wouldn't you create a fork on github? would be easier then copy-pasting and less error-prone.

A) I can't actually program from scratch, and most of my changes are just logic-based.
B) Almost nobody gives me input on how my changes affect their hashing anyway.
C) People might end up sending me incessant requests for changes that I couldn't keep up with.

Besides, this is more of a hobby for me than an outright project and I want to be able to drop it like one without people getting caught in the wake. Wink

In related news:

Code:

;; SHA-256 for X86-64 for Linux, based off of:

; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com
; Version 2011
; This software is Public Domain

; Significant re-write/optimisation and reordering by,
; Neil Kettle <mu-b@digit-labs.org>
; Small modifications played around with by,
; Erick Couts II <cryo_rebirth@yahoo.com>
; ~18% performance improvement

; SHA-256 CPU SSE cruncher for Bitcoin Miner

ALIGN 32
BITS 64

; Argument registers, named by the role they play in this routine.
%define hash rdi
%define data rsi
%define init rdx

; Expansion-loop geometry. LAB_CALC walks r11 from data+256 up to data+1024
; in steps of LAB_CALC_UNROLL*LAB_CALC_PARA*16 bytes, so the step has to
; divide the 768-byte span exactly:
; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
; LAB_CALC_PARA   = schedule entries produced by one lab_calc_blk expansion
; LAB_CALC_UNROLL = lab_calc_blk expansions per LAB_CALC pass
%define LAB_CALC_PARA	2
%define LAB_CALC_UNROLL	8

; lab_loop_blk (one round) expansions per LAB_LOOP pass.
%define LAB_LOOP_UNROLL 8

; Table that is added to each schedule entry in the round loop; defined elsewhere.
extern g_4sha256_k

global CalcSha256_x64_sse4
;-----------------------------------------------------------------------
;	CalcSha256	hash(rdi), data(rsi), init(rdx)
;
; SHA-256 compression over 16-byte vectors of four 32-bit lanes; every
; lane goes through the same arithmetic.
; NOTE(review): presumably one candidate nonce per lane - confirm with caller.
;
; In:    rdi = hash : output, eight 16-byte vectors (128 bytes) in order a..h
;        rsi = data : 1024-byte schedule buffer of 64 16-byte entries;
;                     entries 0..15 are read as input, entries 16..63
;                     (bytes 256..1023) are overwritten by this routine
;        rdx = init : 32 bytes, the eight 32-bit starting state words a..h
; Out:   [hash .. hash+127]; nothing is loaded into rax as a return value
; Clobb: rax, rcx, r11, xmm0-xmm11, flags
; Align: data, init and hash are accessed with movntdqa/movdqa, which
;        require 16-byte-aligned addresses.
; The argument registers follow the SysV AMD64 order (rdi, rsi, rdx).
;-----------------------------------------------------------------------
CalcSha256_x64_sse4:

	push	rbx						; restored at LAB_RET; rbx is not otherwise touched in this routine

LAB_NEXT_NONCE:

	mov	rcx, 256					; round-loop bound for rax: rax steps by 4 per round, so 256 means 64 rounds
;	mov	rax, 64					; 64 - rax is where we expand to

LAB_SHA:
	push	rcx						; keep the bound; rcx is reused below as the expansion end pointer
	lea	rcx, qword [data+1024]				; rcx = end of the schedule buffer (exclusive)
	lea	r11, qword [data+256]				; r11 = first entry to generate (entry 16)

LAB_CALC:
; lab_calc_blk IDX
; Emits straight-line code that produces two consecutive message-schedule
; entries, W[I] and W[I+1], where W[I] lives at r11 + IDX*16:
;   W[I]  = s0(W[I-15]) + W[I-16] + s1(W[I-2]) + W[I-7]
;   s0(x) = (x >> 3)  ^ (x >> 7)  ^ (x << 25) ^ (x >> 18) ^ (x << 14)
;   s1(x) = (x >> 10) ^ (x >> 17) ^ (x << 15) ^ (x >> 19) ^ (x << 13)
; (each >>/<< pair whose counts sum to 32 forms one 32-bit rotate).
; The two entries are interleaved: xmm0-xmm3 serve W[I], xmm4-xmm7 serve W[I+1].
; In:    r11 = address of the first entry written by the current LAB_CALC pass
;        %1  = IDX, index of W[I] within that pass (0, 2, 4, ...)
; Out:   [r11+IDX*16] = W[I], [r11+(IDX+1)*16] = W[I+1]
; Clobb: xmm0-xmm7
%macro	lab_calc_blk 1
;	prefetcht0	[r11-(15-%1)*16]
;	prefetcht0	[r11-(15-(%1+1))*16]

	; One load per source slot; the extra working copies are register moves.
	movntdqa	xmm0, [r11-(15-%1)*16]				; xmm0 = W[I-15]
	movdqa	xmm1, xmm0				; xmm1 = W[I-15]
	movdqa	xmm2, xmm0				; xmm2 = W[I-15]
	movntdqa	xmm3, [r11-(2-%1)*16]				; xmm3 = W[I-2]
	movntdqa	xmm4, [r11-(15-(%1+1))*16]			; xmm4 = W[I-15+1]
	movdqa	xmm5, xmm4			; xmm5 = W[I-15+1]
	movdqa	xmm6, xmm4			; xmm6 = W[I-15+1] (plain register copy, same as xmm1/xmm2/xmm5)
	movntdqa	xmm7, [r11-(2-(%1+1))*16]			; xmm7 = W[I-2+1]

;	movdqa	xmm2, xmm0					; xmm2 = W[I-15]	
;	movdqa	xmm6, xmm4					; xmm6 = W[I-15+1]	

	; ---- s0 for both entries ----
	psrld	xmm0, 3						; xmm0 = W[I-15] >> 3
	psrld	xmm1, 7						; xmm1 = W[I-15] >> 7 (shifted from its own copy, independent of xmm0)
	psrld	xmm4, 3						; xmm4 = W[I-15+1] >> 3
	psrld	xmm5, 7						; xmm5 = W[I-15+1] >> 7
	pslld	xmm2, 14					; xmm2 = W[I-15] << 14

;	movdqa	xmm5, xmm4					; xmm5 = W[I-15+1] >> 3
;	movdqa	xmm1, xmm0					; xmm1 = W[I-15] >> 3	

	pxor	xmm4, xmm5					; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7)
	pslld	xmm6, 14					; xmm6 = W[I-15+1] << 14

	pxor	xmm0, xmm1					; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7)
	pxor	xmm0, xmm2					; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14)
	psrld	xmm1, 11					; xmm1 = W[I-15] >> 18 (7 + 11)
	psrld	xmm5, 11					; xmm5 = W[I-15+1] >> 18
	pxor	xmm4, xmm6					; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14)
	pxor	xmm4, xmm5					; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18)
	pslld	xmm2, 11					; xmm2 = W[I-15] << 25 (14 + 11)
	pslld	xmm6, 11					; xmm6 = W[I-15+1] << 25
	pxor	xmm4, xmm6					; xmm4 = s0(W[I-15+1])
	pxor	xmm0, xmm1					; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18)
	pxor	xmm0, xmm2					; xmm0 = s0(W[I-15])
	paddd	xmm0, [r11-(16-%1)*16]				; xmm0 = s0(W[I-15]) + W[I-16]
	paddd	xmm4, [r11-(16-(%1+1))*16]			; xmm4 = s0(W[I-15+1]) + W[I-16+1]


;;;;;;;;;;;;;;;;;;
	; ---- s1 for both entries ----

	movdqa	xmm2, xmm3					; xmm2 = W[I-2]
	movdqa	xmm1, xmm3					; xmm1 = W[I-2] (not yet shifted)
	movdqa	xmm6, xmm7					; xmm6 = W[I-2+1]
	movdqa	xmm5, xmm7					; xmm5 = W[I-2+1] (not yet shifted)


	paddd	xmm0, [r11-(7-%1)*16]				; xmm0 = s0(W[I-15]) + W[I-16] + W[I-7]
	paddd	xmm4, [r11-(7-(%1+1))*16]			; xmm4 = s0(W[I-15+1]) + W[I-16+1] + W[I-7+1]
	
	psrld	xmm1, 17					; xmm1 = W[I-2] >> 17
	pslld	xmm2, 13					; xmm2 = W[I-2] << 13
	psrld	xmm3, 10					; xmm3 = W[I-2] >> 10
	psrld	xmm5, 17					; xmm5 = W[I-2+1] >> 17
	pslld	xmm6, 13					; xmm6 = W[I-2+1] << 13
	psrld	xmm7, 10					; xmm7 = W[I-2+1] >> 10

	pxor	xmm3, xmm1					; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17)
	pxor	xmm3, xmm2					; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13)
	pxor	xmm7, xmm5					; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17)
	pxor	xmm7, xmm6					; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13)

	psrld	xmm1, 2						; xmm1 = W[I-2] >> 19
	psrld	xmm5, 2						; xmm5 = W[I-2+1] >> 19
	pslld	xmm2, 2						; xmm2 = W[I-2] << 15
	pslld	xmm6, 2						; xmm6 = W[I-2+1] << 15

	pxor	xmm3, xmm1					; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19)
	pxor	xmm3, xmm2					; xmm3 = s1(W[I-2])
	pxor	xmm7, xmm5					; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19)
	pxor	xmm7, xmm6					; xmm7 = s1(W[I-2+1])
	paddd	xmm0, xmm3					; xmm0 = s0(W[I-15]) + W[I-16] + s1(W[I-2]) + W[I-7]
	paddd	xmm4, xmm7					; xmm4 = s0(W[I-15+1]) + W[I-16+1] + s1(W[I-2+1]) + W[I-7+1]

	movdqa	[r11+((%1+1)*16)], xmm4				; store W[I+1]
	movdqa	[r11+(%1*16)], xmm0				; store W[I]
%endmacro

; Expand lab_calc_blk LAB_CALC_UNROLL times. i is the entry index handed to
; each expansion and advances by LAB_CALC_PARA, the number of entries one
; expansion produces.
%assign i 0
%rep    LAB_CALC_UNROLL
        lab_calc_blk i
%assign i i+LAB_CALC_PARA
%endrep
;	prefetchnta	[rcx]

	add	r11, LAB_CALC_UNROLL*LAB_CALC_PARA*16	; step past the entries just written
	cmp	r11, rcx				; rcx = data+1024, end of the schedule buffer
	jb	LAB_CALC
	prefetchnta	[init+16]			; hint: the starting state is read next
	pop	rcx					; rcx = round-loop bound pushed at LAB_SHA
	xor	eax, eax				; rax = 0: round-loop index (writing eax clears the upper half);
							; flags are not read before the next cmp

; Load the init values of the message into the hash.
; Each 32-bit state word is broadcast to all four lanes of its own register.

	movntdqa	xmm7, [init]		; a, b, c, d
	movntdqa	xmm0, [init+16]		; e, f, g, h
	pshufd	xmm5, xmm7, 0x55		; xmm5 == b
	pshufd	xmm4, xmm7, 0xAA		; xmm4 == c
	pshufd	xmm3, xmm7, 0xFF		; xmm3 == d
	pshufd	xmm7, xmm7, 0			; xmm7 == a (done last: xmm7 is also the source)
	pshufd	xmm8, xmm0, 0x55		; xmm8 == f
	pshufd	xmm9, xmm0, 0xAA		; xmm9 == g
	pshufd	xmm10, xmm0, 0xFF		; xmm10 == h
	pshufd	xmm0, xmm0, 0			; xmm0 == e (done last: xmm0 is also the source)

LAB_LOOP:

;; T t1 = h + (Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)) + ((e & f) ^ AndNot(e, g)) + Expand32<T>(g_sha256_k[j]) + w[j]

; lab_loop_blk
; Emits one SHA-256 round on all four lanes and rotates the working
; variables in place.
; Register roles on entry and on exit:
;   xmm7 = a, xmm5 = b, xmm4 = c, xmm3 = d,
;   xmm0 = e, xmm8 = f, xmm9 = g, xmm10 = h
; In/Out: rax = index of the current round, advanced by 4 per round so that
;         [data+rax*4] and g_4sha256_k[rax*4] step by one 16-byte entry.
; Clobb:  xmm1, xmm2, xmm6 (scratch; xmm6 carries t1)
%macro	lab_loop_blk 0
;	prefetchnta	[rax*4]
	movntdqa	xmm6, [data+rax*4]	; w[j]
	paddd	xmm6, g_4sha256_k[rax*4]	; + k[j]
	add	rax, 4

	paddd	xmm6, xmm10	; +h

	; Ch(e, f, g) = (e & f) ^ (~e & g), interleaved with h = g and g = f
	movdqa	xmm1, xmm0	; e
;	movdqa	xmm2, xmm9	; It's redundant unless xmm9 becomes a destination
	movdqa	xmm10, xmm9	; h = g (reads xmm9 directly instead of a copy in xmm2)
	movdqa	xmm9, xmm8	; g = f
	movdqa	xmm2, xmm8	; f

	pand	xmm2, xmm0	; e & f
	pandn	xmm1, xmm10	; ~e & g (the old g is read from xmm10, since xmm9 already holds f)
	pxor	xmm1, xmm2	; (e & f) ^ (~e & g)
	paddd	xmm6, xmm1	; Ch + h + w[i] + k[i]

	; Sigma1(e): each rotate is built from a right shift and a left shift
	movdqa	xmm1, xmm0
	movdqa	xmm2, xmm0
	movdqa	xmm8, xmm0	; f = e (the three copies of e are taken together)
	psrld	xmm0, 6		; e >> 6
	psrld	xmm2, 11	; e >> 11 (shifted from an unshifted copy, hence 11 rather than 5)
	pslld	xmm1, 7		; e << 7
	pxor	xmm0, xmm1
	pxor	xmm0, xmm2
	pslld	xmm1, 14	; e << 21
	psrld	xmm2, 14	; e >> 25
	pxor	xmm0, xmm1
	pxor	xmm0, xmm2
	pslld	xmm1, 5		; e << 26
	pxor	xmm0, xmm1	; Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)
	paddd	xmm6, xmm0	; xmm6 = t1
	paddd	xmm3, xmm6	; d + t1

	; Maj(a, b, c), interleaved with e = d + t1 and d = c
	movdqa	xmm0, xmm3	; e = d + t1
	movdqa	xmm1, xmm5	; b
	movdqa	xmm2, xmm4	; c
	movdqa	xmm3, xmm2	; d = c
	pand	xmm2, xmm5	; b & c
	pand	xmm4, xmm7	; a & c
	pand	xmm1, xmm7	; a & b
	pxor	xmm1, xmm4
	pxor	xmm1, xmm2	; (a & b) ^ (a & c) ^ (b & c)
	paddd	xmm6, xmm1	; t1 + Maj(a, b, c)

	; Sigma0(a), interleaved with c = b and b = a
	movdqa	xmm4, xmm5	; c = b
	movdqa	xmm5, xmm7	; b = a
	movdqa	xmm2, xmm7
	movdqa	xmm1, xmm7
	psrld	xmm1, 13	; a >> 13
	psrld	xmm7, 2		; a >> 2
	pslld	xmm2, 10	; a << 10
	pxor	xmm7, xmm2
	pxor	xmm7, xmm1
	pslld	xmm2, 9		; a << 19
	psrld	xmm1, 9		; a >> 22
	pxor	xmm7, xmm2
	pxor	xmm7, xmm1
	pslld	xmm2, 11	; a << 30
	pxor	xmm7, xmm2	; Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22)
	paddd	xmm7, xmm6	; a = t1 + (Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22)) + ((a & b) ^ (a & c) ^ (b & c));
%endmacro

; Emit LAB_LOOP_UNROLL rounds back to back, then repeat until the round
; index in rax reaches the bound held in rcx.
%assign i 0
%rep    LAB_LOOP_UNROLL
        lab_loop_blk
%assign i i+1
%endrep

	cmp	rax, rcx
	jb	LAB_LOOP

; All rounds done: add the starting state back into the working variables
; (each starting word broadcast to every lane), then write the result out.
; Both halves of init are read before anything is stored to hash.

	movntdqa	xmm1, [init]		; starting a, b, c, d
	pshufd	xmm2, xmm1, 0x55		; broadcast b
	pshufd	xmm6, xmm1, 0xAA		; broadcast c
	pshufd	xmm11, xmm1, 0xFF		; broadcast d
	pshufd	xmm1, xmm1, 0			; broadcast a (last: xmm1 is the source)
	paddd	xmm7, xmm1			; a += starting a
	paddd	xmm5, xmm2			; b += starting b
	paddd	xmm4, xmm6			; c += starting c
	paddd	xmm3, xmm11			; d += starting d

	movntdqa	xmm1, [init+16]		; starting e, f, g, h
	pshufd	xmm2, xmm1, 0x55		; broadcast f
	pshufd	xmm6, xmm1, 0xAA		; broadcast g
	pshufd	xmm11, xmm1, 0xFF		; broadcast h
	pshufd	xmm1, xmm1, 0			; broadcast e (last: xmm1 is the source)
	paddd	xmm0, xmm1			; e += starting e
	paddd	xmm8, xmm2			; f += starting f
	paddd	xmm9, xmm6			; g += starting g
	paddd	xmm10, xmm11			; h += starting h

	movdqa	[hash], xmm7			; a
	movdqa	[hash+16], xmm5			; b
	movdqa	[hash+32], xmm4			; c
	movdqa	[hash+48], xmm3			; d
	movdqa	[hash+64], xmm0			; e
	movdqa	[hash+80], xmm8			; f
	movdqa	[hash+96], xmm9			; g
	movdqa	[hash+112], xmm10		; h

LAB_RET:
	pop	rbx				; matches the push at function entry
	ret

I've commented out some of the optimizations I've been playing around with so you can see what I've been trying. It seemed like the prefetches actually slowed the code down for me. AMD users might have different results. Here, I've taken the liberty of even supplying the AMD users with the SSE2 code for ease of use. I ended up leaving in the loop modifications I made just because I couldn't tell much difference honestly. But I'm going to bed.

Code:

;; SHA-256 for X86-64 for Linux, based off of:

; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com
; Version 2011
; This software is Public Domain

; Significant re-write/optimisation and reordering by,
; Neil Kettle <mu-b@digit-labs.org>
; Small modifications played around with by,
; Erick Couts II <cryo_rebirth@yahoo.com>
; ~18% performance improvement

; SHA-256 CPU SSE cruncher for Bitcoin Miner

ALIGN 32
BITS 64

; Argument registers, named by the role they play in this routine.
%define hash rdi
%define data rsi
%define init rdx

; Expansion-loop geometry. LAB_CALC walks r11 from data+256 up to data+1024
; in steps of LAB_CALC_UNROLL*LAB_CALC_PARA*16 bytes, so the step has to
; divide the 768-byte span exactly:
; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
; LAB_CALC_PARA   = schedule entries produced by one lab_calc_blk expansion
; LAB_CALC_UNROLL = lab_calc_blk expansions per LAB_CALC pass
%define LAB_CALC_PARA	2
%define LAB_CALC_UNROLL	8

; lab_loop_blk (one round) expansions per LAB_LOOP pass.
%define LAB_LOOP_UNROLL 8

; Table that is added to each schedule entry in the round loop; defined elsewhere.
extern g_4sha256_k

global CalcSha256_x64
;-----------------------------------------------------------------------
;	CalcSha256	hash(rdi), data(rsi), init(rdx)
;
; SHA-256 compression over 16-byte vectors of four 32-bit lanes; every
; lane goes through the same arithmetic. This variant uses plain movdqa
; loads where the _sse4 listings use movntdqa.
; NOTE(review): presumably one candidate nonce per lane - confirm with caller.
;
; In:    rdi = hash : output, eight 16-byte vectors (128 bytes) in order a..h
;        rsi = data : 1024-byte schedule buffer of 64 16-byte entries;
;                     entries 0..15 are read as input, entries 16..63
;                     (bytes 256..1023) are overwritten by this routine
;        rdx = init : 32 bytes, the eight 32-bit starting state words a..h
; Out:   [hash .. hash+127]; nothing is loaded into rax as a return value
; Clobb: rax, rcx, r11, xmm0-xmm11, flags
; Align: data, init and hash are accessed with movdqa, which requires
;        16-byte-aligned addresses.
; The argument registers follow the SysV AMD64 order (rdi, rsi, rdx).
;-----------------------------------------------------------------------
CalcSha256_x64:

	push	rbx						; restored at LAB_RET; rbx is not otherwise touched in this routine

LAB_NEXT_NONCE:

	mov	rcx, 256					; round-loop bound for rax: rax steps by 4 per round, so 256 means 64 rounds
;	mov	rax, 64					; 64 - rax is where we expand to

LAB_SHA:
	push	rcx						; keep the bound; rcx is reused below as the expansion end pointer
	lea	rcx, qword [data+1024]				; rcx = end of the schedule buffer (exclusive)
	lea	r11, qword [data+256]				; r11 = first entry to generate (entry 16)

LAB_CALC:
; lab_calc_blk IDX
; Emits straight-line code that produces two consecutive message-schedule
; entries, W[I] and W[I+1], where W[I] lives at r11 + IDX*16:
;   W[I]  = s0(W[I-15]) + W[I-16] + s1(W[I-2]) + W[I-7]
;   s0(x) = (x >> 3)  ^ (x >> 7)  ^ (x << 25) ^ (x >> 18) ^ (x << 14)
;   s1(x) = (x >> 10) ^ (x >> 17) ^ (x << 15) ^ (x >> 19) ^ (x << 13)
; (each >>/<< pair whose counts sum to 32 forms one 32-bit rotate).
; The two entries are interleaved: xmm0-xmm3 serve W[I], xmm4-xmm7 serve W[I+1].
; In:    r11 = address of the first entry written by the current LAB_CALC pass
;        %1  = IDX, index of W[I] within that pass (0, 2, 4, ...)
; Out:   [r11+IDX*16] = W[I], [r11+(IDX+1)*16] = W[I+1]
; Clobb: xmm0-xmm7
%macro	lab_calc_blk 1
;	prefetcht0	[r11-(15-%1)*16]
;	prefetcht0	[r11-(15-(%1+1))*16]

	; One load per source slot; the extra working copies are register moves.
	movdqa	xmm0, [r11-(15-%1)*16]				; xmm0 = W[I-15]
	movdqa	xmm1, xmm0				; xmm1 = W[I-15]
	movdqa	xmm2, xmm0				; xmm2 = W[I-15]
	movdqa	xmm3, [r11-(2-%1)*16]				; xmm3 = W[I-2]
	movdqa	xmm4, [r11-(15-(%1+1))*16]			; xmm4 = W[I-15+1]
	movdqa	xmm5, xmm4			; xmm5 = W[I-15+1]
	movdqa	xmm6, xmm4			; xmm6 = W[I-15+1]
	movdqa	xmm7, [r11-(2-(%1+1))*16]			; xmm7 = W[I-2+1]

;	movdqa	xmm2, xmm0					; xmm2 = W[I-15]	
;	movdqa	xmm6, xmm4					; xmm6 = W[I-15+1]	

	; ---- s0 for both entries ----
	psrld	xmm0, 3						; xmm0 = W[I-15] >> 3
	psrld	xmm1, 7						; xmm1 = W[I-15] >> 7 (shifted from its own copy, independent of xmm0)
	psrld	xmm4, 3						; xmm4 = W[I-15+1] >> 3
	psrld	xmm5, 7						; xmm5 = W[I-15+1] >> 7
	pslld	xmm2, 14					; xmm2 = W[I-15] << 14

;	movdqa	xmm5, xmm4					; xmm5 = W[I-15+1] >> 3
;	movdqa	xmm1, xmm0					; xmm1 = W[I-15] >> 3	

	pxor	xmm4, xmm5					; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7)
	pslld	xmm6, 14					; xmm6 = W[I-15+1] << 14

	pxor	xmm0, xmm1					; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7)
	pxor	xmm0, xmm2					; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14)
	psrld	xmm1, 11					; xmm1 = W[I-15] >> 18 (7 + 11)
	psrld	xmm5, 11					; xmm5 = W[I-15+1] >> 18
	pxor	xmm4, xmm6					; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14)
	pxor	xmm4, xmm5					; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18)
	pslld	xmm2, 11					; xmm2 = W[I-15] << 25 (14 + 11)
	pslld	xmm6, 11					; xmm6 = W[I-15+1] << 25
	pxor	xmm4, xmm6					; xmm4 = s0(W[I-15+1])
	pxor	xmm0, xmm1					; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18)
	pxor	xmm0, xmm2					; xmm0 = s0(W[I-15])
	paddd	xmm0, [r11-(16-%1)*16]				; xmm0 = s0(W[I-15]) + W[I-16]
	paddd	xmm4, [r11-(16-(%1+1))*16]			; xmm4 = s0(W[I-15+1]) + W[I-16+1]


;;;;;;;;;;;;;;;;;;
	; ---- s1 for both entries ----

	movdqa	xmm2, xmm3					; xmm2 = W[I-2]
	movdqa	xmm1, xmm3					; xmm1 = W[I-2] (not yet shifted)
	movdqa	xmm6, xmm7					; xmm6 = W[I-2+1]
	movdqa	xmm5, xmm7					; xmm5 = W[I-2+1] (not yet shifted)


	paddd	xmm0, [r11-(7-%1)*16]				; xmm0 = s0(W[I-15]) + W[I-16] + W[I-7]
	paddd	xmm4, [r11-(7-(%1+1))*16]			; xmm4 = s0(W[I-15+1]) + W[I-16+1] + W[I-7+1]
	
	psrld	xmm1, 17					; xmm1 = W[I-2] >> 17
	psrld	xmm3, 10					; xmm3 = W[I-2] >> 10
	psrld	xmm5, 17					; xmm5 = W[I-2+1] >> 17
	psrld	xmm7, 10					; xmm7 = W[I-2+1] >> 10
	pslld	xmm2, 13					; xmm2 = W[I-2] << 13
	pslld	xmm6, 13					; xmm6 = W[I-2+1] << 13

	pxor	xmm3, xmm1					; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17)
	pxor	xmm3, xmm2					; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13)
	pxor	xmm7, xmm5					; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17)
	pxor	xmm7, xmm6					; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13)

	psrld	xmm1, 2						; xmm1 = W[I-2] >> 19
	psrld	xmm5, 2						; xmm5 = W[I-2+1] >> 19
	pslld	xmm2, 2						; xmm2 = W[I-2] << 15
	pslld	xmm6, 2						; xmm6 = W[I-2+1] << 15

	pxor	xmm3, xmm1					; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19)
	pxor	xmm3, xmm2					; xmm3 = s1(W[I-2])
	pxor	xmm7, xmm5					; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19)
	pxor	xmm7, xmm6					; xmm7 = s1(W[I-2+1])
	paddd	xmm0, xmm3					; xmm0 = s0(W[I-15]) + W[I-16] + s1(W[I-2]) + W[I-7]
	paddd	xmm4, xmm7					; xmm4 = s0(W[I-15+1]) + W[I-16+1] + s1(W[I-2+1]) + W[I-7+1]

	movdqa	[r11+((%1+1)*16)], xmm4				; store W[I+1]
	movdqa	[r11+(%1*16)], xmm0				; store W[I]
%endmacro

; Expand lab_calc_blk LAB_CALC_UNROLL times. i is the entry index handed to
; each expansion and advances by LAB_CALC_PARA, the number of entries one
; expansion produces.
%assign i 0
%rep    LAB_CALC_UNROLL
        lab_calc_blk i
%assign i i+LAB_CALC_PARA
%endrep
;	prefetchnta	[rcx]

	add	r11, LAB_CALC_UNROLL*LAB_CALC_PARA*16	; step past the entries just written
	cmp	r11, rcx				; rcx = data+1024, end of the schedule buffer
	jb	LAB_CALC
	prefetchnta	[init+16]			; hint: the starting state is read next
	pop	rcx					; rcx = round-loop bound pushed at LAB_SHA
	xor	eax, eax				; rax = 0: round-loop index (writing eax clears the upper half);
							; flags are not read before the next cmp

; Load the init values of the message into the hash.
; Each 32-bit state word is broadcast to all four lanes of its own register.

	movdqa	xmm7, [init]			; a, b, c, d
	movdqa	xmm0, [init+16]			; e, f, g, h
	pshufd	xmm5, xmm7, 0x55		; xmm5 == b
	pshufd	xmm4, xmm7, 0xAA		; xmm4 == c
	pshufd	xmm3, xmm7, 0xFF		; xmm3 == d
	pshufd	xmm7, xmm7, 0			; xmm7 == a (done last: xmm7 is also the source)
	pshufd	xmm8, xmm0, 0x55		; xmm8 == f
	pshufd	xmm9, xmm0, 0xAA		; xmm9 == g
	pshufd	xmm10, xmm0, 0xFF		; xmm10 == h
	pshufd	xmm0, xmm0, 0			; xmm0 == e (done last: xmm0 is also the source)

LAB_LOOP:

;; T t1 = h + (Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)) + ((e & f) ^ AndNot(e, g)) + Expand32<T>(g_sha256_k[j]) + w[j]

; lab_loop_blk
; Emits one SHA-256 round on all four lanes and rotates the working
; variables in place.
; Register roles on entry and on exit:
;   xmm7 = a, xmm5 = b, xmm4 = c, xmm3 = d,
;   xmm0 = e, xmm8 = f, xmm9 = g, xmm10 = h
; In/Out: rax = index of the current round, advanced by 4 per round so that
;         [data+rax*4] and g_4sha256_k[rax*4] step by one 16-byte entry.
; Clobb:  xmm1, xmm2, xmm6 (scratch; xmm6 carries t1)
%macro	lab_loop_blk 0
;	prefetchnta	[rax*4]
	movdqa	xmm6, [data+rax*4]	; w[j]
	paddd	xmm6, g_4sha256_k[rax*4]	; + k[j]
	add	rax, 4

	paddd	xmm6, xmm10	; +h

	; Ch(e, f, g) = (e & f) ^ (~e & g), interleaved with h = g and g = f
	movdqa	xmm1, xmm0	; e
;	movdqa	xmm2, xmm9	; It's redundant unless xmm9 becomes a destination
	movdqa	xmm10, xmm9	; h = g (reads xmm9 directly instead of a copy in xmm2)
	movdqa	xmm9, xmm8	; g = f
	movdqa	xmm2, xmm8	; f

	pand	xmm2, xmm0	; e & f
	pandn	xmm1, xmm10	; ~e & g (the old g is read from xmm10, since xmm9 already holds f)
	pxor	xmm1, xmm2	; (e & f) ^ (~e & g)
	paddd	xmm6, xmm1	; Ch + h + w[i] + k[i]

	; Sigma1(e): each rotate is built from a right shift and a left shift
	movdqa	xmm1, xmm0
	movdqa	xmm2, xmm0
	movdqa	xmm8, xmm0	; f = e (the three copies of e are taken together)
	psrld	xmm0, 6		; e >> 6
	psrld	xmm2, 11	; e >> 11 (shifted from an unshifted copy, hence 11 rather than 5)
	pslld	xmm1, 7		; e << 7
	pxor	xmm0, xmm1
	pxor	xmm0, xmm2
	pslld	xmm1, 14	; e << 21
	psrld	xmm2, 14	; e >> 25
	pxor	xmm0, xmm1
	pxor	xmm0, xmm2
	pslld	xmm1, 5		; e << 26
	pxor	xmm0, xmm1	; Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)
	paddd	xmm6, xmm0	; xmm6 = t1
	paddd	xmm3, xmm6	; d + t1

	; Maj(a, b, c), interleaved with e = d + t1 and d = c
	movdqa	xmm0, xmm3	; e = d + t1
	movdqa	xmm1, xmm5	; b
	movdqa	xmm2, xmm4	; c
	movdqa	xmm3, xmm2	; d = c
	pand	xmm2, xmm5	; b & c
	pand	xmm4, xmm7	; a & c
	pand	xmm1, xmm7	; a & b
	pxor	xmm1, xmm4
	pxor	xmm1, xmm2	; (a & b) ^ (a & c) ^ (b & c)
	paddd	xmm6, xmm1	; t1 + Maj(a, b, c)

	; Sigma0(a), interleaved with c = b and b = a
	movdqa	xmm4, xmm5	; c = b
	movdqa	xmm5, xmm7	; b = a
	movdqa	xmm2, xmm7
	movdqa	xmm1, xmm7
	psrld	xmm1, 13	; a >> 13
	psrld	xmm7, 2		; a >> 2
	pslld	xmm2, 10	; a << 10
	pxor	xmm7, xmm2
	pxor	xmm7, xmm1
	pslld	xmm2, 9		; a << 19
	psrld	xmm1, 9		; a >> 22
	pxor	xmm7, xmm2
	pxor	xmm7, xmm1
	pslld	xmm2, 11	; a << 30
	pxor	xmm7, xmm2	; Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22)
	paddd	xmm7, xmm6	; a = t1 + (Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22)) + ((a & b) ^ (a & c) ^ (b & c));
%endmacro

; Emit LAB_LOOP_UNROLL rounds back to back, then repeat until the round
; index in rax reaches the bound held in rcx.
%assign i 0
%rep    LAB_LOOP_UNROLL
        lab_loop_blk
%assign i i+1
%endrep

	cmp	rax, rcx
	jb	LAB_LOOP

; All rounds done: add the starting state back into the working variables
; (each starting word broadcast to every lane), then write the result out.
; Both halves of init are read before anything is stored to hash.

	movdqa	xmm1, [init]			; starting a, b, c, d
	pshufd	xmm2, xmm1, 0x55		; broadcast b
	pshufd	xmm6, xmm1, 0xAA		; broadcast c
	pshufd	xmm11, xmm1, 0xFF		; broadcast d
	pshufd	xmm1, xmm1, 0			; broadcast a (last: xmm1 is the source)
	paddd	xmm7, xmm1			; a += starting a
	paddd	xmm5, xmm2			; b += starting b
	paddd	xmm4, xmm6			; c += starting c
	paddd	xmm3, xmm11			; d += starting d

	movdqa	xmm1, [init+16]			; starting e, f, g, h
	pshufd	xmm2, xmm1, 0x55		; broadcast f
	pshufd	xmm6, xmm1, 0xAA		; broadcast g
	pshufd	xmm11, xmm1, 0xFF		; broadcast h
	pshufd	xmm1, xmm1, 0			; broadcast e (last: xmm1 is the source)
	paddd	xmm0, xmm1			; e += starting e
	paddd	xmm8, xmm2			; f += starting f
	paddd	xmm9, xmm6			; g += starting g
	paddd	xmm10, xmm11			; h += starting h

	movdqa	[hash], xmm7			; a
	movdqa	[hash+16], xmm5			; b
	movdqa	[hash+32], xmm4			; c
	movdqa	[hash+48], xmm3			; d
	movdqa	[hash+64], xmm0			; e
	movdqa	[hash+80], xmm8			; f
	movdqa	[hash+96], xmm9			; g
	movdqa	[hash+112], xmm10		; h

LAB_RET:
	pop	rbx				; matches the push at function entry
	ret

Funroll_Loops, the theoretically quicker breakfast cereal!
Check out http://www.facebook.com/JupiterICT for all of your computing needs. If you need it, we can get it. We have solutions for your computing conundrums. BTC accepted! 12HWUSguWXRCQKfkPeJygVR1ex5wbg3hAq

zaytsev

Newbie

Offline

Activity: 59
Merit: 0

Re: Official CGMINER thread - CPU/GPU miner in C for linux/windows/osx

July 31, 2011, 12:54:29 PM

#624

The point is that with a fork on github I can see exactly what you have changed as compared to the original files and also easily download your latest changes without tedious copy-pasting from the forum. Also you can easily pull ck's changes into your branch (single command needed for that) and when it will be ready you can just give him the changed files.

A fork on github doesn't mean that you are taking the code away and starting your own project, it's just a way to easily publish your changes for others to test and submit to the original project when ready.

RudeDude

Newbie

Offline

Activity: 11
Merit: 0

Re: Official CGMINER thread - CPU/GPU miner in C for linux/windows/osx

July 31, 2011, 02:29:41 PM

#625

Quote from: d3m0n1q_733rz on July 31, 2011, 12:02:13 PM

Quote from: zaytsev on July 31, 2011, 09:21:26 AM

d3m0n1q_733rz, why wouldn't you create a fork on github? would be easier then copy-pasting and less error-prone.

A) I can't actually program from scratch and most of my changes or just logic based.
B) Almost nobody gives me input on how my changes affect their hashing anyway.
C) People might end up sending me incessant requests for changes that I couldn't keep up with.

Sorry to say but that version dropped performance by about 3-4%. There is definitely some variance in the cgminer speed reporting from moment to moment (from 6.7 to 7.4 Mh/s total for this 2 core setup) but over a ~2-3min period it seems to average out to reliable numbers.

Since I'm providing some feedback I should prolly tell you some hardware & compile details:

Code:

CFLAGS = -O3 -ffast-math -funroll-loops -mtune=native -march=native -msahf

vendor_id       : GenuineIntel
cpu family      : 6
model           : 15
model name      : Intel(R) Xeon(R) CPU            5160  @ 3.00GHz
stepping        : 11
cpu MHz         : 2992.227
cache size      : 4096 KB
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat
pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall lm constant_tsc arch_perfmon
pebs bts rep_good aperfmperf pni dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm
dca lahf_lm tpr_shadow vnmi flexpriority

d3m0n1q_733rz

Sr. Member

Offline

Activity: 378
Merit: 250

Re: Official CGMINER thread - CPU/GPU miner in C for linux/windows/osx

July 31, 2011, 08:59:01 PM

#626

Quote from: RudeDude on July 31, 2011, 02:29:41 PM

Quote from: d3m0n1q_733rz on July 31, 2011, 12:02:13 PM

Quote from: zaytsev on July 31, 2011, 09:21:26 AM

d3m0n1q_733rz, why wouldn't you create a fork on github? would be easier then copy-pasting and less error-prone.

A) I can't actually program from scratch and most of my changes or just logic based.
B) Almost nobody gives me input on how my changes affect their hashing anyway.
C) People might end up sending me incessant requests for changes that I couldn't keep up with.

Sorry to say but that version dropped performance by about 3-4%. There is definitely some variance in the cgminer speed reporting from moment to moment (from 6.7 to 7.4 Mh/s total for this 2 core setup) but over a ~2-3min period it seems to average out to reliable numbers.

Since I'm providing some feedback I should prolly tell you some hardware & compile details:

Code:

CFLAGS = -O3 -ffast-math -funroll-loops -mtune=native -march=native -msahf

vendor_id       : GenuineIntel
cpu family      : 6
model           : 15
model name      : Intel(R) Xeon(R) CPU            5160  @ 3.00GHz
stepping        : 11
cpu MHz         : 2992.227
cache size      : 4096 KB
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat
pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall lm constant_tsc arch_perfmon
pebs bts rep_good aperfmperf pni dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm
dca lahf_lm tpr_shadow vnmi flexpriority

Yeah, the drop in performance is caused by a little taboo I ran into in the long hours of the night. I forgot the 3-1 rule in relation to clock cycles. In other words, I streamed together too many "expensive" commands without giving them time to complete before the next set. I traded one optimization for another that didn't work as well.

Funroll_Loops, the theoretically quicker breakfast cereal!
Check out http://www.facebook.com/JupiterICT for all of your computing needs. If you need it, we can get it. We have solutions for your computing conundrums. BTC accepted! 12HWUSguWXRCQKfkPeJygVR1ex5wbg3hAq

gigica viteazu`

Sr. Member

Offline

Activity: 458
Merit: 250

beast at work

Re: Official CGMINER thread - CPU/GPU miner in C for linux/windows/osx

July 31, 2011, 10:45:22 PM

#627

i hope Ycros is passing by, sees this and give us a 1.5.3 win32 version

d3m0n1q_733rz

Sr. Member

Offline

Activity: 378
Merit: 250

Re: Official CGMINER thread - CPU/GPU miner in C for linux/windows/osx

July 31, 2011, 11:57:04 PM

#628

Quote from: gigica viteazu` on July 31, 2011, 10:45:22 PM

I hope Ycros is passing by, sees this and gives us a 1.5.3 win32 version

I'm hoping someone more versed in asm passes by, sees my crappy attempt at simplifying, optimizing and updating the code and decides to help a guy out. But I don't see that happening anytime soon. : (

Funroll_Loops, the theoretically quicker breakfast cereal!
Check out http://www.facebook.com/JupiterICT for all of your computing needs. If you need it, we can get it. We have solutions for your computing conundrums. BTC accepted! 12HWUSguWXRCQKfkPeJygVR1ex5wbg3hAq

jamesg

VIP
Legendary

Offline

Activity: 1358
Merit: 1000

AKA: gigavps

Re: Official CGMINER thread - CPU/GPU miner in C for linux/windows/osx

August 01, 2011, 01:26:23 AM

#629

WOW!!!!

I moved from guiminer with phoenix to cgminer and can't believe my eyes! My stale rate has dropped from 1.3% to next to nothing and my Mh/s increased by 2-5 Mh/s per card. Thanks for the awesome miner.

Just a 1% savings in stales over time is going to make some pretty good $$$. Thanks for all of the hard work.

BTC to follow.

DBordello

Sr. Member

Offline

Activity: 349
Merit: 250

BTCPak.com - Exchange your Bitcoins for MP!

Re: Official CGMINER thread - CPU/GPU miner in C for linux/windows/osx

August 01, 2011, 04:00:02 AM

#630

Quote from: ckolivas on July 30, 2011, 12:51:51 PM

New release: 1.5.3

Source:
http://ck.kolivas.org/apps/cgminer/cgminer-1.5.3.tar.bz2
http://ck.kolivas.org/apps/cgminer/cgminer-1.5.3.tar.lrz

Linux x86_64 dynamic binary:
http://ck.kolivas.org/apps/cgminer/cgminer-1.5.3-x86_64-built.tar.bz2
http://ck.kolivas.org/apps/cgminer/cgminer-1.5.3-x86_64-built.tar.lrz

Changelog:

- Significant work went into attempting to make the thread restart code robust
to identify sick threads, tag them SICK after 1 minute, then DEAD after 5
minutes of inactivity and try to restart them. Instead of re-initialising the
GPU completely, only a new cl context is created to avoid hanging the rest of
the GPUs should the dead GPU be hung irrevocably. Much thanks to sunbreak for testing various modes of failure on his 66 GPU cluster.
- Use correct application name in syslog.
- Get rid of extra line feeds.
- Use pkg-config to check for libcurl version
- Implement per-thread getwork count with proper accounting to not over-account
queued items when local work replaces it.
- Create a command queue from the program created from source which allows us
to flush the command queue in the hope it will not generate a zero sized binary
any more.
- Be more willing to get work from the backup pools if the work is simply being
queued faster than it is being retrieved.

So I'm leaving tomorrow for 10 days and I still don't have a windows binary post version 1.5.1 since I'm entirely dependent on Ycros for building one for me. If he shows up on these forums and posts a link to a windows build, consider it official. Otherwise, I'm afraid that's all I can do.

P.S. If you're wondering what a .lrz file is, that's my extreme compression format which also has extreme encryption capabilities.
http://lrzip.kolivas.org

1.5.3 appears to be hitting my backup pools WAY more often. About 25% of the blocks appear to be going to a backup pool. With 1.5.2 the backup pools didn't get hit once. Am I seeing more efficient usage? Or am I just spreading my love around?

www.BTCPak.com - Exchange your bitcoins for MP: Secure, Anonymous and Easy!

c_k

Donator
Full Member

Offline

Activity: 242
Merit: 100

Re: Official CGMINER thread - CPU/GPU miner in C for linux/windows/osx

August 01, 2011, 06:15:38 AM

#631

Quote from: d3m0n1q_733rz on July 31, 2011, 03:59:50 AM

Quote from: c_k on July 31, 2011, 01:44:28 AM

Great work ckolivas!

It looks like this will become the miner of choice with all the slick features you are adding.

Could you look at adding an option for monitoring the GPU temperature and backing off when it hits a maximum value and not resuming until it hits another minimum value?

If you included this you would be negating the need to ever use anything else imo

You know, technically, that feature should be maintained by the GPU itself. But I know that ufasoft has implemented it for some reason. It's more of a safeguard against failure of the hardware's throttle. Alternatively, you could try adjusting the fan speed of your card using free software so as to increase the fan speed at higher temps. Could help to not reach that temperature.

Ah, I am guessing you aren't heavily involved in GPU mining - we already have our fans near full speed keeping our cards from within a few degrees of death, so we really need the source of the heat to cease when the affected GPU(s) temperature becomes too high, and then come back into operation once the temperature of the affected GPU(s) has dropped to a lower level, as opposed to something else.

This is what AOCLBF (Windows) and AutoMiner (Linux) do, and it is the most effective way to deal with this.

Things that could cause this to happen are usually only one of a few things:

The ambient room temperature rises due to an abnormal influence (extremely hot day) or something like that.

A miner will usually keep an eye on his machines from time to time to see if the temperature is not staying within reasonable limits in the longer term, however a degree of automation along the lines of self preservation when the machine is unattended is required.

This feature would truly be the icing on the cgminer cake imo, and would give everyone little reason to ever use any other miner

https://rfcpool.com/images/sigs/26.png

gigica viteazu`

Sr. Member

Offline

Activity: 458
Merit: 250

beast at work

Re: Official CGMINER thread - CPU/GPU miner in C for linux/windows/osx

August 01, 2011, 06:44:36 AM

#632

Quote from: c_k on August 01, 2011, 06:15:38 AM

... a degree of automation along the lines of self preservation when the machine is unattended is required.

For this I'm using a batch file which controls ClockTweak monitoring and adjusting stuff.

c_k

Donator
Full Member

Offline

Activity: 242
Merit: 100

Re: Official CGMINER thread - CPU/GPU miner in C for linux/windows/osx

August 01, 2011, 07:36:03 AM

#633

Quote from: gigica viteazu` on August 01, 2011, 06:44:36 AM

Quote from: c_k on August 01, 2011, 06:15:38 AM

... a degree of automation along the lines of self preservation when the machine is unattended is required.

For this I'm using a batch file which controls ClockTweak monitoring and adjusting stuff.

Unfortunately, ClockTweak does not support going beyond the limits of ATI Catalyst Control Center, so it is of no use to those of us who do :(

https://rfcpool.com/images/sigs/26.png

Ali

Member

Offline

Activity: 84
Merit: 10

Re: Official CGMINER thread - CPU/GPU miner in C for linux/windows/osx

August 01, 2011, 09:30:51 AM

#634

Is it somehow possible to use this miner behind a proxy (+firewall which is only open on port 80) which requires authentication?

zaytsev

Newbie

Offline

Activity: 59
Merit: 0

Re: Official CGMINER thread - CPU/GPU miner in C for linux/windows/osx

August 01, 2011, 09:34:40 AM

#635

Yes, anything that curl can do, this miner can do. Try to export the http_proxy variable correctly, it used to work for me.

Ali

Member

Offline

Activity: 84
Merit: 10

Re: Official CGMINER thread - CPU/GPU miner in C for linux/windows/osx

August 01, 2011, 09:36:00 AM

#636

But what about the company's firewall, which blocks access to any port but 80?

zaytsev

Newbie

Offline

Activity: 59
Merit: 0

Re: Official CGMINER thread - CPU/GPU miner in C for linux/windows/osx

August 01, 2011, 09:41:20 AM

#637

So what's the problem? If your proxy listening on port 80 allows for outbound connections to other ports it will work. If not, make your own proxy to listen on port 80 and chain them.

Ali

Member

Offline

Activity: 84
Merit: 10

Re: Official CGMINER thread - CPU/GPU miner in C for linux/windows/osx

August 01, 2011, 09:45:20 AM

#638

Quote from: zaytsev on August 01, 2011, 09:41:20 AM

If not, make your own proxy to listen on port 80 and chain them.

How do I chain them?

burtyb

Newbie

Offline

Activity: 45
Merit: 0

Re: Official CGMINER thread - CPU/GPU miner in C for linux/windows/osx

August 01, 2011, 12:30:46 PM

#639

I tried cgminer on a couple of my boxes last night and woke up to find 3/4 GPU marked as DEAD on one and 2/4 DEAD on another. Trying to restart the DEAD GPU doesn't seem to do anything on either machine (restarting the ones still running did seem to restart them OK). Both boxes have been running for weeks using poclbm without errors. Running the 1.5.3 binary on Ubuntu (3xHD5870 and HD6310).

Anyone else seeing problems with DEAD GPU that won't recover unless cgminer is restarted?

BB.

sirky

Sr. Member

Offline

Activity: 404
Merit: 250

Re: Official CGMINER thread - CPU/GPU miner in C for linux/windows/osx

August 01, 2011, 12:39:00 PM

#640

Quote from: burtyb on August 01, 2011, 12:30:46 PM

I tried cgminer on a couple of my boxes last night and woke up to find 3/4 GPU marked as DEAD on one and 2/4 DEAD on another. Trying to restart the DEAD GPU doesn't seem to do anything on either machine (restarting the ones still running did seem to restart them OK). Both boxes have been running for weeks using poclbm without errors. Running the 1.5.3 binary on Ubuntu (3xHD5870 and HD6310).

Anyone else seeing problems with DEAD GPU that won't recover unless cgminer is restarted?

BB.

I do too, but only on my linux boxes.

	Author	Topic: OFFICIAL CGMINER mining software thread for linux/win/osx/mips/arm/r-pi 4.11.1 (Read 5805215 times)
This is a self-moderated topic. If you do not want to be moderated by the person who started this topic, create a new topic. (3 posts by 1+ user deleted.)