psychocoder (OP)
Newbie
Offline
Activity: 49
Merit: 0
|
|
April 01, 2013, 10:08:44 AM Last edit: July 30, 2013, 12:59:20 PM by psychocoder |
|
Hi, Summary of informations from this thread:Reposetory with my changes for rpcminer-mod (only Cuda): https://github.com/psychocoderHPC/rpcminer-modReposetory with all changes and windows support (CUDA und OpenCL) (administrated by charliemaggot): https://github.com/cdmackie/rpcminer-modKnown CUDA Errors unter Windows: - cudart32_50.dll or cudart32_50.dll is missing -> install https://developer.nvidia.com/cuda-downloads to solve this problem - bitcoinminercuda.cpp:174 crash -> this means that the kernel run longer than windows allow, to solv this add the paramter -gpugrid 256 or other number to the parameters Original Post:I have changed the code of rpcminer-mod https://github.com/Ang3lus/rpcminer-mod a little bit thus we get better performance on Kepler GPUs. First, since cuda 5.0 we have a rotated function inside of the ptx (parallel asm), we must not add this by hand because the compiler find it automaticly. Example: (((x ) >> bits) | (x << (32 - bits))) is compiled to ptx command shf.l.wrap.b32Note: The changes are not comatible with the opencl version, I only change the cuda source. In CMAKEList.txt is hard coded that sm_35 (Kepler code) is created. Run new code with this parameter: -gpu=0 -aggression=8 -gpugrid=2048 -gputhreads=256 To install the patch in your code goto root of the project and run $ patch -p1 < patch.txt Now you get over 300MHash/s out of a Kepler GPU, I think that GTX Kepler GPUs are faster than K20 HPC version. It can be that you must use sm_30 for GTX Kepler GPUs. [EDIT:] for 330MHash/s the GPU needs 138 Watt power. psychocoder patch.txt diff -Naur ./cmake-rpcminer/CMakeLists.txt ../../rpcminer-cuda_svn//cmake-rpcminer/CMakeLists.txt --- ./cmake-rpcminer/CMakeLists.txt 2013-01-28 19:27:46.000000000 +0100 +++ ../../rpcminer-cuda_svn//cmake-rpcminer/CMakeLists.txt 2013-04-01 11:22:19.000000000 +0200 @@ -32,6 +32,10 @@ IF(BITCOIN_ENABLE_CUDA) ADD_DEFINITIONS(-D_BITCOIN_MINER_CUDA_) + #SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_35,code=sm_35) + SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode=arch=compute_35,code=sm_35 -Xptxas=-v -Xopencc=-LIST:source=on) + #SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode=arch=compute_13,code=sm_13 -Xptxas=-v -Xopencc=-LIST:source=on) + CUDA_ADD_EXECUTABLE(rpcminer ${BITCOIN_RPC_MINER_SRC} ${BITCOIN_RPC_MINER_CUDA_SRC}) # Install generated PTX CUDA module INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/${generated_file_basename}.ptx" DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/rpcminer-cuda" RENAME "bitcoinminercuda.ptx") diff -Naur ./patch.txt ../../rpcminer-cuda_svn//patch.txt --- ./patch.txt 2013-04-01 11:30:47.000000000 +0200 +++ ../../rpcminer-cuda_svn//patch.txt 1970-01-01 01:00:00.000000000 +0100 @@ -1,14 +0,0 @@ -diff -Naur ./cmake-rpcminer/CMakeLists.txt ../../rpcminer-cuda_svn//cmake-rpcminer/CMakeLists.txt ---- ./cmake-rpcminer/CMakeLists.txt 2013-01-28 19:27:46.000000000 +0100 -+++ ../../rpcminer-cuda_svn//cmake-rpcminer/CMakeLists.txt 2013-04-01 11:22:19.000000000 +0200 -@@ -32,6 +32,10 @@ - - IF(BITCOIN_ENABLE_CUDA) - ADD_DEFINITIONS(-D_BITCOIN_MINER_CUDA_) -+ #SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_35,code=sm_35) -+ SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode=arch=compute_35,code=sm_35 -Xptxas=-v -Xopencc=-LIST:source=on) -+ #SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode=arch=compute_13,code=sm_13 -Xptxas=-v -Xopencc=-LIST:source=on) -+ - CUDA_ADD_EXECUTABLE(rpcminer ${BITCOIN_RPC_MINER_SRC} ${BITCOIN_RPC_MINER_CUDA_SRC}) - # Install generated PTX CUDA module - INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/${generated_file_basename}.ptx" DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/rpcminer-cuda" RENAME "bitcoinminercuda.ptx") diff -Naur ./src/cuda/bitcoinminercuda.cpp ../../rpcminer-cuda_svn//src/cuda/bitcoinminercuda.cpp --- ./src/cuda/bitcoinminercuda.cpp 2013-01-28 19:27:46.000000000 +0100 +++ ../../rpcminer-cuda_svn//src/cuda/bitcoinminercuda.cpp 2013-04-01 11:22:30.000000000 +0200 @@ -290,6 +290,8 @@ { AllocateResources(m_numb,m_numt); } + m_out[0].m_bestnonce=0; + cuMemcpyHtoD(m_devout,m_out,/*m_numb*m_numt*/sizeof(cuda_out)); cuMemcpyHtoD(m_devin,m_in,sizeof(cuda_in)); @@ -319,11 +321,11 @@ cuFuncSetBlockShape(m_function,m_numt,1,1); cuLaunchGrid(m_function,m_numb,1); - cuMemcpyDtoH(m_out,m_devout,m_numb*m_numt*sizeof(cuda_out)); + cuMemcpyDtoH(m_out,m_devout,/*m_numb*m_numt*/sizeof(cuda_out)); // very unlikely that we will find more than 1 hash with H=0 // so we'll just return the first one and not even worry about G - for(int i=0; i<m_numb*m_numt; i++) + for(int i=0; i<1/*m_numb*m_numt*/; i++) { if(m_out[i].m_bestnonce!=0)// && m_out[i].m_bestg<bestg) { diff -Naur ./src/cuda/bitcoinminercuda.cu ../../rpcminer-cuda_svn//src/cuda/bitcoinminercuda.cu --- ./src/cuda/bitcoinminercuda.cu 2013-01-28 19:27:46.000000000 +0100 +++ ../../rpcminer-cuda_svn//src/cuda/bitcoinminercuda.cu 2013-04-01 11:22:30.000000000 +0200 @@ -18,20 +18,31 @@ #include "cudashared.h" -#define byteswap(x) (((x>>24) & 0x000000ff) | ((x>>8) & 0x0000ff00) | ((x<<8) & 0x00ff0000) | ((x<<24) & 0xff000000)) -#define rotateright(x,bits) (((x & 0xffffffff) >> bits) | (x << (32 - bits))) -#define R(x) (work[x] = (rotateright(work[x-2],17)^rotateright(work[x-2],19)^((work[x-2]&0xffffffff)>>10)) + work[x - 7] + (rotateright(work[x-15],7)^rotateright(work[x-15],18)^((work[x-15]&0xffffffff)>>3)) + work[x - 16]) +#define rotateright(x,bits) (((x ) >> bits) | (x << (32 - bits))) + +#define R(x) (work[x] = (rotateright(work[x-2],17)^rotateright(work[x-2],19)^((work[x-2])>>10)) + work[x - 7] + (rotateright(work[x-15],7)^rotateright(work[x-15],18)^((work[x-15])>>3)) + work[x - 16]) + #define sharound(a,b,c,d,e,f,g,h,x,K) {t1=h+(rotateright(e,6)^rotateright(e,11)^rotateright(e,25))+(g^(e&(f^g)))+K+x; t2=(rotateright(a,2)^rotateright(a,13)^rotateright(a,22))+((a&b)|(c&(a|b))); d+=t1; h=t1+t2;} -extern "C" __global__ void cuda_process(cuda_in *in, cuda_out *out, const unsigned int loops, const unsigned int bits) +extern "C" __global__ void cuda_process(cuda_in __restrict__ *in, cuda_out __restrict__ *out, const unsigned int loops, const unsigned int bits) { + /*variable to check if any other block has a solution*/ + __shared__ unsigned int canExit; + if(threadIdx.x==0) + canExit=out[0].m_bestnonce; + __syncthreads(); + /*exit as fast as posible if one block has finished with solution*/ + if(canExit!=0) return; + unsigned int work[64]; unsigned int A,B,C,D,E,F,G,H; const unsigned int myid=(blockIdx.x*blockDim.x+threadIdx.x); const unsigned int nonce=in->m_nonce + (myid << bits); unsigned int t1,t2; - unsigned int bestnonce=0; + //unsigned int bestnonce=0; + + //unsigned int bestg=~0; unsigned int* in_m_AH = in->m_AH; @@ -40,46 +51,42 @@ unsigned int in_m_nbits = in->m_nbits; // the first 3 rounds we can do outside the loop because they depend on work[0] through work[2] which won't change - unsigned int A1,B1,C1,D1,E1,F1,G1,H1; - A1=in_m_AH[0]; - B1=in_m_AH[1]; - C1=in_m_AH[2]; - D1=in_m_AH[3]; - E1=in_m_AH[4]; - F1=in_m_AH[5]; - G1=in_m_AH[6]; - H1=in_m_AH[7]; - sharound(A1,B1,C1,D1,E1,F1,G1,H1,in_m_merkle,0x428A2F98); - sharound(H1,A1,B1,C1,D1,E1,F1,G1,in_m_ntime,0x71374491); - sharound(G1,H1,A1,B1,C1,D1,E1,F1,in_m_nbits,0xB5C0FBCF); - - #pragma unroll 1 - for(unsigned int it=0; it<loops; it++) + /* move old A1, ... H1 to shared to solve registers + * can also calculated on host and give to kernel, because its se same for all threads and blocks + */ + __shared__ unsigned int AH[8]; + __shared__ unsigned int AH2[8]; //cache for second round + if(threadIdx.x<8) + { + AH2[threadIdx.x]=AH[threadIdx.x]=in_m_AH[threadIdx.x]; + } + __syncthreads(); + if(threadIdx.x==0) + { + sharound(AH[0],AH[1],AH[2],AH[3],AH[4],AH[5],AH[6],AH[7],in_m_merkle,0x428A2F98); + sharound(AH[7],AH[0],AH[1],AH[2],AH[3],AH[4],AH[5],AH[6],in_m_ntime,0x71374491); + sharound(AH[6],AH[7],AH[0],AH[1],AH[2],AH[3],AH[4],AH[5],in_m_nbits,0xB5C0FBCF); + } + __syncthreads(); + + #pragma unroll 1 + for(unsigned int it=0; it<loops; ++it) { - /* - A=in_m_AH[0]; - B=in_m_AH[1]; - C=in_m_AH[2]; - D=in_m_AH[3]; - E=in_m_AH[4]; - F=in_m_AH[5]; - G=in_m_AH[6]; - H=in_m_AH[7]; - */ - A=A1; - B=B1; - C=C1; - D=D1; - E=E1; - F=F1; - G=G1; - H=H1; + if(out[0].m_bestnonce!=0) return; + A=AH[0]; + B=AH[1]; + C=AH[2]; + D=AH[3]; + E=AH[4]; + F=AH[5]; + G=AH[6]; + H=AH[7]; work[0]=in_m_merkle; work[1]=in_m_ntime; work[2]=in_m_nbits; //work[3]=byteswap(nonce+it); - work[3]=nonce + it; + work[3]=nonce +it; work[4]=0x80000000; work[5]=0x00000000; work[6]=0x00000000; @@ -160,14 +167,14 @@ // hash the hash now - work[0]=in_m_AH[0]+A; - work[1]=in_m_AH[1]+B; - work[2]=in_m_AH[2]+C; - work[3]=in_m_AH[3]+D; - work[4]=in_m_AH[4]+E; - work[5]=in_m_AH[5]+F; - work[6]=in_m_AH[6]+G; - work[7]=in_m_AH[7]+H; + work[0]=AH2[0]+A; + work[1]=AH2[1]+B; + work[2]=AH2[2]+C; + work[3]=AH2[3]+D; + work[4]=AH2[4]+E; + work[5]=AH2[5]+F; + work[6]=AH2[6]+G; + work[7]=AH2[7]+H; work[8]=0x80000000; work[9]=0x00000000; work[10]=0x00000000; @@ -258,13 +265,15 @@ if((H==0))// && (G<=bestg)) { - bestnonce=nonce+it; + //bestnonce=nonce+it; + atomicExch(&(out[0].m_bestnonce),nonce+it); /*we only need one solution*/ + //bestg=G; } } - out[myid].m_bestnonce=bestnonce; + //out[myid].m_bestnonce=bestnonce; //out[myid].m_bestg=bestg; }
|