Hi,
Summary of informations from this thread:Reposetory with my changes for rpcminermod (only Cuda):
https://github.com/psychocoderHPC/rpcminermodReposetory with all changes and windows support (CUDA und OpenCL) (administrated by charliemaggot):
https://github.com/cdmackie/rpcminermodKnown CUDA Errors unter Windows:
 cudart32_50.dll or cudart32_50.dll is missing > install
https://developer.nvidia.com/cudadownloads to solve this problem
 bitcoinminercuda.cpp:174 crash > this means that the kernel run longer than windows allow, to solv this add the paramter gpugrid 256 or other number to the parameters
Original Post:I have changed the code of rpcminermod
https://github.com/Ang3lus/rpcminermod a little bit thus we get better performance on Kepler GPUs.
First, since cuda 5.0 we have a rotated function inside of the ptx (parallel asm), we must not add this by hand because the compiler find it automaticly.
Example:
(((x ) >> bits)  (x << (32  bits))) is compiled to ptx command
shf.l.wrap.b32Note: The changes are not comatible with the opencl version, I only change the cuda source. In CMAKEList.txt is hard coded that sm_35 (Kepler code) is created.
Run new code with this parameter: gpu=0 aggression=8 gpugrid=2048 gputhreads=256
To install the patch in your code goto root of the project and run $ patch p1 < patch.txt
Now you get over 300MHash/s out of a Kepler GPU, I think that GTX Kepler GPUs are faster than K20 HPC version. It can be that you must use sm_30 for GTX Kepler GPUs.
[EDIT:] for 330MHash/s the GPU needs 138 Watt power.
psychocoder
patch.txt
diff Naur ./cmakerpcminer/CMakeLists.txt ../../rpcminercuda_svn//cmakerpcminer/CMakeLists.txt
 ./cmakerpcminer/CMakeLists.txt 20130128 19:27:46.000000000 +0100
+++ ../../rpcminercuda_svn//cmakerpcminer/CMakeLists.txt 20130401 11:22:19.000000000 +0200
@@ 32,6 +32,10 @@
IF(BITCOIN_ENABLE_CUDA)
ADD_DEFINITIONS(D_BITCOIN_MINER_CUDA_)
+ #SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} gencode=arch=compute_20,code=sm_20 gencode=arch=compute_35,code=sm_35)
+ SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} gencode=arch=compute_35,code=sm_35 Xptxas=v Xopencc=LIST:source=on)
+ #SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} gencode=arch=compute_13,code=sm_13 Xptxas=v Xopencc=LIST:source=on)
+
CUDA_ADD_EXECUTABLE(rpcminer ${BITCOIN_RPC_MINER_SRC} ${BITCOIN_RPC_MINER_CUDA_SRC})
# Install generated PTX CUDA module
INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/${generated_file_basename}.ptx" DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/rpcminercuda" RENAME "bitcoinminercuda.ptx")
diff Naur ./patch.txt ../../rpcminercuda_svn//patch.txt
 ./patch.txt 20130401 11:30:47.000000000 +0200
+++ ../../rpcminercuda_svn//patch.txt 19700101 01:00:00.000000000 +0100
@@ 1,14 +0,0 @@
diff Naur ./cmakerpcminer/CMakeLists.txt ../../rpcminercuda_svn//cmakerpcminer/CMakeLists.txt
 ./cmakerpcminer/CMakeLists.txt 20130128 19:27:46.000000000 +0100
+++ ../../rpcminercuda_svn//cmakerpcminer/CMakeLists.txt 20130401 11:22:19.000000000 +0200
@@ 32,6 +32,10 @@

 IF(BITCOIN_ENABLE_CUDA)
 ADD_DEFINITIONS(D_BITCOIN_MINER_CUDA_)
+ #SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} gencode=arch=compute_20,code=sm_20 gencode=arch=compute_35,code=sm_35)
+ SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} gencode=arch=compute_35,code=sm_35 Xptxas=v Xopencc=LIST:source=on)
+ #SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} gencode=arch=compute_13,code=sm_13 Xptxas=v Xopencc=LIST:source=on)
+
 CUDA_ADD_EXECUTABLE(rpcminer ${BITCOIN_RPC_MINER_SRC} ${BITCOIN_RPC_MINER_CUDA_SRC})
 # Install generated PTX CUDA module
 INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/${generated_file_basename}.ptx" DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/rpcminercuda" RENAME "bitcoinminercuda.ptx")
diff Naur ./src/cuda/bitcoinminercuda.cpp ../../rpcminercuda_svn//src/cuda/bitcoinminercuda.cpp
 ./src/cuda/bitcoinminercuda.cpp 20130128 19:27:46.000000000 +0100
+++ ../../rpcminercuda_svn//src/cuda/bitcoinminercuda.cpp 20130401 11:22:30.000000000 +0200
@@ 290,6 +290,8 @@
{
AllocateResources(m_numb,m_numt);
}
+ m_out[0].m_bestnonce=0;
+ cuMemcpyHtoD(m_devout,m_out,/*m_numb*m_numt*/sizeof(cuda_out));
cuMemcpyHtoD(m_devin,m_in,sizeof(cuda_in));
@@ 319,11 +321,11 @@
cuFuncSetBlockShape(m_function,m_numt,1,1);
cuLaunchGrid(m_function,m_numb,1);
 cuMemcpyDtoH(m_out,m_devout,m_numb*m_numt*sizeof(cuda_out));
+ cuMemcpyDtoH(m_out,m_devout,/*m_numb*m_numt*/sizeof(cuda_out));
// very unlikely that we will find more than 1 hash with H=0
// so we'll just return the first one and not even worry about G
 for(int i=0; i<m_numb*m_numt; i++)
+ for(int i=0; i<1/*m_numb*m_numt*/; i++)
{
if(m_out[i].m_bestnonce!=0)// && m_out[i].m_bestg<bestg)
{
diff Naur ./src/cuda/bitcoinminercuda.cu ../../rpcminercuda_svn//src/cuda/bitcoinminercuda.cu
 ./src/cuda/bitcoinminercuda.cu 20130128 19:27:46.000000000 +0100
+++ ../../rpcminercuda_svn//src/cuda/bitcoinminercuda.cu 20130401 11:22:30.000000000 +0200
@@ 18,20 +18,31 @@
#include "cudashared.h"
#define byteswap(x) (((x>>24) & 0x000000ff)  ((x>>8) & 0x0000ff00)  ((x<<8) & 0x00ff0000)  ((x<<24) & 0xff000000))
#define rotateright(x,bits) (((x & 0xffffffff) >> bits)  (x << (32  bits)))
#define R(x) (work[x] = (rotateright(work[x2],17)^rotateright(work[x2],19)^((work[x2]&0xffffffff)>>10)) + work[x  7] + (rotateright(work[x15],7)^rotateright(work[x15],18)^((work[x15]&0xffffffff)>>3)) + work[x  16])
+#define rotateright(x,bits) (((x ) >> bits)  (x << (32  bits)))
+
+#define R(x) (work[x] = (rotateright(work[x2],17)^rotateright(work[x2],19)^((work[x2])>>10)) + work[x  7] + (rotateright(work[x15],7)^rotateright(work[x15],18)^((work[x15])>>3)) + work[x  16])
+
#define sharound(a,b,c,d,e,f,g,h,x,K) {t1=h+(rotateright(e,6)^rotateright(e,11)^rotateright(e,25))+(g^(e&(f^g)))+K+x; t2=(rotateright(a,2)^rotateright(a,13)^rotateright(a,22))+((a&b)(c&(ab))); d+=t1; h=t1+t2;}
extern "C" __global__ void cuda_process(cuda_in *in, cuda_out *out, const unsigned int loops, const unsigned int bits)
+extern "C" __global__ void cuda_process(cuda_in __restrict__ *in, cuda_out __restrict__ *out, const unsigned int loops, const unsigned int bits)
{
+ /*variable to check if any other block has a solution*/
+ __shared__ unsigned int canExit;
+ if(threadIdx.x==0)
+ canExit=out[0].m_bestnonce;
+ __syncthreads();
+ /*exit as fast as posible if one block has finished with solution*/
+ if(canExit!=0) return;
+
unsigned int work[64];
unsigned int A,B,C,D,E,F,G,H;
const unsigned int myid=(blockIdx.x*blockDim.x+threadIdx.x);
const unsigned int nonce=in>m_nonce + (myid << bits);
unsigned int t1,t2;
 unsigned int bestnonce=0;
+ //unsigned int bestnonce=0;
+
+
//unsigned int bestg=~0;
unsigned int* in_m_AH = in>m_AH;
@@ 40,46 +51,42 @@
unsigned int in_m_nbits = in>m_nbits;
// the first 3 rounds we can do outside the loop because they depend on work[0] through work[2] which won't change
 unsigned int A1,B1,C1,D1,E1,F1,G1,H1;
 A1=in_m_AH[0];
 B1=in_m_AH[1];
 C1=in_m_AH[2];
 D1=in_m_AH[3];
 E1=in_m_AH[4];
 F1=in_m_AH[5];
 G1=in_m_AH[6];
 H1=in_m_AH[7];
 sharound(A1,B1,C1,D1,E1,F1,G1,H1,in_m_merkle,0x428A2F98);
 sharound(H1,A1,B1,C1,D1,E1,F1,G1,in_m_ntime,0x71374491);
 sharound(G1,H1,A1,B1,C1,D1,E1,F1,in_m_nbits,0xB5C0FBCF);

 #pragma unroll 1
 for(unsigned int it=0; it<loops; it++)
+ /* move old A1, ... H1 to shared to solve registers
+ * can also calculated on host and give to kernel, because its se same for all threads and blocks
+ */
+ __shared__ unsigned int AH[8];
+ __shared__ unsigned int AH2[8]; //cache for second round
+ if(threadIdx.x<8)
+ {
+ AH2[threadIdx.x]=AH[threadIdx.x]=in_m_AH[threadIdx.x];
+ }
+ __syncthreads();
+ if(threadIdx.x==0)
+ {
+ sharound(AH[0],AH[1],AH[2],AH[3],AH[4],AH[5],AH[6],AH[7],in_m_merkle,0x428A2F98);
+ sharound(AH[7],AH[0],AH[1],AH[2],AH[3],AH[4],AH[5],AH[6],in_m_ntime,0x71374491);
+ sharound(AH[6],AH[7],AH[0],AH[1],AH[2],AH[3],AH[4],AH[5],in_m_nbits,0xB5C0FBCF);
+ }
+ __syncthreads();
+
+ #pragma unroll 1
+ for(unsigned int it=0; it<loops; ++it)
{
 /*
 A=in_m_AH[0];
 B=in_m_AH[1];
 C=in_m_AH[2];
 D=in_m_AH[3];
 E=in_m_AH[4];
 F=in_m_AH[5];
 G=in_m_AH[6];
 H=in_m_AH[7];
 */
 A=A1;
 B=B1;
 C=C1;
 D=D1;
 E=E1;
 F=F1;
 G=G1;
 H=H1;
+ if(out[0].m_bestnonce!=0) return;
+ A=AH[0];
+ B=AH[1];
+ C=AH[2];
+ D=AH[3];
+ E=AH[4];
+ F=AH[5];
+ G=AH[6];
+ H=AH[7];
work[0]=in_m_merkle;
work[1]=in_m_ntime;
work[2]=in_m_nbits;
//work[3]=byteswap(nonce+it);
 work[3]=nonce + it;
+ work[3]=nonce +it;
work[4]=0x80000000;
work[5]=0x00000000;
work[6]=0x00000000;
@@ 160,14 +167,14 @@
// hash the hash now
 work[0]=in_m_AH[0]+A;
 work[1]=in_m_AH[1]+B;
 work[2]=in_m_AH[2]+C;
 work[3]=in_m_AH[3]+D;
 work[4]=in_m_AH[4]+E;
 work[5]=in_m_AH[5]+F;
 work[6]=in_m_AH[6]+G;
 work[7]=in_m_AH[7]+H;
+ work[0]=AH2[0]+A;
+ work[1]=AH2[1]+B;
+ work[2]=AH2[2]+C;
+ work[3]=AH2[3]+D;
+ work[4]=AH2[4]+E;
+ work[5]=AH2[5]+F;
+ work[6]=AH2[6]+G;
+ work[7]=AH2[7]+H;
work[8]=0x80000000;
work[9]=0x00000000;
work[10]=0x00000000;
@@ 258,13 +265,15 @@
if((H==0))// && (G<=bestg))
{
 bestnonce=nonce+it;
+ //bestnonce=nonce+it;
+ atomicExch(&(out[0].m_bestnonce),nonce+it); /*we only need one solution*/
+
//bestg=G;
}
}
 out[myid].m_bestnonce=bestnonce;
+ //out[myid].m_bestnonce=bestnonce;
//out[myid].m_bestg=bestg;
}