NVIDIA Kepler (K20) from 134MHash/s to 330MHash/s with CUDA

Hi,

Summary of informations from this thread:

Reposetory with my changes for rpcminer-mod (only Cuda): https://github.com/psychocoderHPC/rpcminer-mod
Reposetory with all changes and windows support (CUDA und OpenCL) (administrated by charliemaggot): https://github.com/cdmackie/rpcminer-mod

Known CUDA Errors unter Windows:
- cudart32_50.dll or cudart32_50.dll is missing -> install https://developer.nvidia.com/cuda-downloads to solve this problem
- bitcoinminercuda.cpp:174 crash -> this means that the kernel run longer than windows allow, to solv this add the paramter -gpugrid 256 or other number to the parameters

Original Post:

I have changed the code of rpcminer-mod https://github.com/Ang3lus/rpcminer-mod a little bit thus we get better performance on Kepler GPUs.
First, since cuda 5.0 we have a rotated function inside of the ptx (parallel asm), we must not add this by hand because the compiler find it automaticly.

Example: (((x ) >> bits) | (x << (32 - bits))) is compiled to ptx command shf.l.wrap.b32

Note: The changes are not comatible with the opencl version, I only change the cuda source. In CMAKEList.txt is hard coded that sm_35 (Kepler code) is created.

Run new code with this parameter: -gpu=0 -aggression=8 -gpugrid=2048 -gputhreads=256

To install the patch in your code goto root of the project and run $ patch -p1 < patch.txt

Now you get over 300MHash/s out of a Kepler GPU, I think that GTX Kepler GPUs are faster than K20 HPC version. It can be that you must use sm_30 for GTX Kepler GPUs.

[EDIT:] for 330MHash/s the GPU needs 138 Watt power.

psychocoder

patch.txt

Code:

diff -Naur ./cmake-rpcminer/CMakeLists.txt ../../rpcminer-cuda_svn//cmake-rpcminer/CMakeLists.txt
--- ./cmake-rpcminer/CMakeLists.txt	2013-01-28 19:27:46.000000000 +0100
+++ ../../rpcminer-cuda_svn//cmake-rpcminer/CMakeLists.txt	2013-04-01 11:22:19.000000000 +0200
@@ -32,6 +32,10 @@
 
 IF(BITCOIN_ENABLE_CUDA)
 	ADD_DEFINITIONS(-D_BITCOIN_MINER_CUDA_)
+	#SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_35,code=sm_35)
+	SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode=arch=compute_35,code=sm_35  -Xptxas=-v -Xopencc=-LIST:source=on)
+	#SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode=arch=compute_13,code=sm_13 -Xptxas=-v -Xopencc=-LIST:source=on)
+
 	CUDA_ADD_EXECUTABLE(rpcminer ${BITCOIN_RPC_MINER_SRC} ${BITCOIN_RPC_MINER_CUDA_SRC})
 	# Install generated PTX CUDA module
 	INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/${generated_file_basename}.ptx" DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/rpcminer-cuda" RENAME "bitcoinminercuda.ptx")
diff -Naur ./patch.txt ../../rpcminer-cuda_svn//patch.txt
--- ./patch.txt	2013-04-01 11:30:47.000000000 +0200
+++ ../../rpcminer-cuda_svn//patch.txt	1970-01-01 01:00:00.000000000 +0100
@@ -1,14 +0,0 @@
-diff -Naur ./cmake-rpcminer/CMakeLists.txt ../../rpcminer-cuda_svn//cmake-rpcminer/CMakeLists.txt
---- ./cmake-rpcminer/CMakeLists.txt	2013-01-28 19:27:46.000000000 +0100
-+++ ../../rpcminer-cuda_svn//cmake-rpcminer/CMakeLists.txt	2013-04-01 11:22:19.000000000 +0200
-@@ -32,6 +32,10 @@
- 
- IF(BITCOIN_ENABLE_CUDA)
- 	ADD_DEFINITIONS(-D_BITCOIN_MINER_CUDA_)
-+	#SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_35,code=sm_35)
-+	SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode=arch=compute_35,code=sm_35  -Xptxas=-v -Xopencc=-LIST:source=on)
-+	#SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode=arch=compute_13,code=sm_13 -Xptxas=-v -Xopencc=-LIST:source=on)
-+
- 	CUDA_ADD_EXECUTABLE(rpcminer ${BITCOIN_RPC_MINER_SRC} ${BITCOIN_RPC_MINER_CUDA_SRC})
- 	# Install generated PTX CUDA module
- 	INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/${generated_file_basename}.ptx" DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/rpcminer-cuda" RENAME "bitcoinminercuda.ptx")
diff -Naur ./src/cuda/bitcoinminercuda.cpp ../../rpcminer-cuda_svn//src/cuda/bitcoinminercuda.cpp
--- ./src/cuda/bitcoinminercuda.cpp	2013-01-28 19:27:46.000000000 +0100
+++ ../../rpcminer-cuda_svn//src/cuda/bitcoinminercuda.cpp	2013-04-01 11:22:30.000000000 +0200
@@ -290,6 +290,8 @@
 	{
 		AllocateResources(m_numb,m_numt);
 	}
+	m_out[0].m_bestnonce=0;
+	cuMemcpyHtoD(m_devout,m_out,/*m_numb*m_numt*/sizeof(cuda_out));
 
 	cuMemcpyHtoD(m_devin,m_in,sizeof(cuda_in));
 
@@ -319,11 +321,11 @@
 	cuFuncSetBlockShape(m_function,m_numt,1,1);
 	cuLaunchGrid(m_function,m_numb,1);
 
-	cuMemcpyDtoH(m_out,m_devout,m_numb*m_numt*sizeof(cuda_out));
+	cuMemcpyDtoH(m_out,m_devout,/*m_numb*m_numt*/sizeof(cuda_out));
 
 	// very unlikely that we will find more than 1 hash with H=0
 	// so we'll just return the first one and not even worry about G
-	for(int i=0; i<m_numb*m_numt; i++)
+	for(int i=0; i<1/*m_numb*m_numt*/; i++)
 	{
 		if(m_out[i].m_bestnonce!=0)// && m_out[i].m_bestg<bestg)
 		{
diff -Naur ./src/cuda/bitcoinminercuda.cu ../../rpcminer-cuda_svn//src/cuda/bitcoinminercuda.cu
--- ./src/cuda/bitcoinminercuda.cu	2013-01-28 19:27:46.000000000 +0100
+++ ../../rpcminer-cuda_svn//src/cuda/bitcoinminercuda.cu	2013-04-01 11:22:30.000000000 +0200
@@ -18,20 +18,31 @@
 
 #include "cudashared.h"
 
-#define byteswap(x) (((x>>24) & 0x000000ff) | ((x>>8) & 0x0000ff00) | ((x<<8) & 0x00ff0000) | ((x<<24) & 0xff000000))
-#define rotateright(x,bits) (((x & 0xffffffff) >> bits) | (x << (32 - bits)))
-#define R(x) (work[x] = (rotateright(work[x-2],17)^rotateright(work[x-2],19)^((work[x-2]&0xffffffff)>>10)) + work[x -  7] + (rotateright(work[x-15],7)^rotateright(work[x-15],18)^((work[x-15]&0xffffffff)>>3)) + work[x - 16])
+#define rotateright(x,bits) (((x ) >> bits) | (x << (32 - bits)))
+
+#define R(x) (work[x] = (rotateright(work[x-2],17)^rotateright(work[x-2],19)^((work[x-2])>>10)) + work[x -  7] + (rotateright(work[x-15],7)^rotateright(work[x-15],18)^((work[x-15])>>3)) + work[x - 16])
+
 #define sharound(a,b,c,d,e,f,g,h,x,K) {t1=h+(rotateright(e,6)^rotateright(e,11)^rotateright(e,25))+(g^(e&(f^g)))+K+x; t2=(rotateright(a,2)^rotateright(a,13)^rotateright(a,22))+((a&b)|(c&(a|b))); d+=t1; h=t1+t2;}
 
-extern "C" __global__ void cuda_process(cuda_in *in, cuda_out *out, const unsigned int loops, const unsigned int bits)
+extern "C" __global__ void cuda_process(cuda_in __restrict__ *in, cuda_out __restrict__ *out, const unsigned int loops, const unsigned int bits)
 {
+  /*variable to check if any other block has a solution*/ 
+  __shared__ unsigned int canExit;
+  if(threadIdx.x==0)
+	canExit=out[0].m_bestnonce;
+  __syncthreads();
+  /*exit as fast as posible if one block has finished with solution*/
+  if(canExit!=0) return;
+
 
   unsigned int work[64];
   unsigned int A,B,C,D,E,F,G,H;
 	const unsigned int myid=(blockIdx.x*blockDim.x+threadIdx.x);
 	const unsigned int nonce=in->m_nonce + (myid << bits);
 	unsigned int t1,t2;
-	unsigned int bestnonce=0;
+	//unsigned int bestnonce=0;
+	
+
 	//unsigned int bestg=~0;
 	
 	unsigned int* in_m_AH = in->m_AH;
@@ -40,46 +51,42 @@
   unsigned int in_m_nbits = in->m_nbits;
 	
 	// the first 3 rounds we can do outside the loop because they depend on work[0] through work[2] which won't change
-	unsigned int A1,B1,C1,D1,E1,F1,G1,H1;
-	A1=in_m_AH[0];
-	B1=in_m_AH[1];
-	C1=in_m_AH[2];
-	D1=in_m_AH[3];
-	E1=in_m_AH[4];
-	F1=in_m_AH[5];
-	G1=in_m_AH[6];
-	H1=in_m_AH[7];
-	sharound(A1,B1,C1,D1,E1,F1,G1,H1,in_m_merkle,0x428A2F98);
-	sharound(H1,A1,B1,C1,D1,E1,F1,G1,in_m_ntime,0x71374491);
-	sharound(G1,H1,A1,B1,C1,D1,E1,F1,in_m_nbits,0xB5C0FBCF);
-    
-    #pragma unroll 1
-	for(unsigned int it=0; it<loops; it++)
+	/* move old A1, ... H1 to shared to solve registers
+	 * can also calculated on host and give to kernel, because its se same for all threads and blocks 
+	 */
+	__shared__ unsigned int AH[8];
+	__shared__ unsigned int AH2[8]; //cache for second round
+	if(threadIdx.x<8)
+	{
+	  AH2[threadIdx.x]=AH[threadIdx.x]=in_m_AH[threadIdx.x];
+	}
+	__syncthreads();
+	if(threadIdx.x==0)
+	{
+	sharound(AH[0],AH[1],AH[2],AH[3],AH[4],AH[5],AH[6],AH[7],in_m_merkle,0x428A2F98);
+	sharound(AH[7],AH[0],AH[1],AH[2],AH[3],AH[4],AH[5],AH[6],in_m_ntime,0x71374491);
+	sharound(AH[6],AH[7],AH[0],AH[1],AH[2],AH[3],AH[4],AH[5],in_m_nbits,0xB5C0FBCF);
+	}
+	__syncthreads();
+
+        #pragma unroll 1
+	for(unsigned int it=0; it<loops; ++it)
 	{
-		/*
-		A=in_m_AH[0];
-		B=in_m_AH[1];
-		C=in_m_AH[2];
-		D=in_m_AH[3];
-		E=in_m_AH[4];
-		F=in_m_AH[5];
-		G=in_m_AH[6];
-		H=in_m_AH[7];
-		*/
-		A=A1;
-		B=B1;
-		C=C1;
-		D=D1;
-		E=E1;
-		F=F1;
-		G=G1;
-		H=H1;
+		if(out[0].m_bestnonce!=0) return;
 		
+		A=AH[0];
+		B=AH[1];
+		C=AH[2];
+		D=AH[3];
+		E=AH[4];
+		F=AH[5];
+		G=AH[6];
+		H=AH[7];
 		work[0]=in_m_merkle;
 		work[1]=in_m_ntime;
 		work[2]=in_m_nbits;
 		//work[3]=byteswap(nonce+it);
-		work[3]=nonce + it;
+		work[3]=nonce +it;
 		work[4]=0x80000000;
 		work[5]=0x00000000;
 		work[6]=0x00000000;
@@ -160,14 +167,14 @@
 		
 		// hash the hash now
 		
-		work[0]=in_m_AH[0]+A;
-		work[1]=in_m_AH[1]+B;
-		work[2]=in_m_AH[2]+C;
-		work[3]=in_m_AH[3]+D;
-		work[4]=in_m_AH[4]+E;
-		work[5]=in_m_AH[5]+F;
-		work[6]=in_m_AH[6]+G;
-		work[7]=in_m_AH[7]+H;
+		work[0]=AH2[0]+A;
+		work[1]=AH2[1]+B;
+		work[2]=AH2[2]+C;
+		work[3]=AH2[3]+D;
+		work[4]=AH2[4]+E;
+		work[5]=AH2[5]+F;
+		work[6]=AH2[6]+G;
+		work[7]=AH2[7]+H;
 		work[8]=0x80000000;
 		work[9]=0x00000000;
 		work[10]=0x00000000;
@@ -258,13 +265,15 @@
 
 		if((H==0))// && (G<=bestg))
 		{
-			bestnonce=nonce+it;
+			//bestnonce=nonce+it;
+			atomicExch(&(out[0].m_bestnonce),nonce+it); /*we only need one solution*/
+			
 			//bestg=G;
 		}
 
    }
     
-    out[myid].m_bestnonce=bestnonce;
+    //out[myid].m_bestnonce=bestnonce;
     //out[myid].m_bestg=bestg;
 
 }