CUDA "pointer is not to start of memory block"
I'm trying to set up some scientific calculations on CUDA. My code is
originally written if Fortran, so I taken piece of the interface block
from FORTCuda project interface http://sourceforge.net/projects/fortcuda/
to call CUDA C functions:
module cuda_runtime
use, intrinsic :: ISO_C_BINDING
use cuda_unknowns
enum, bind(C) !:: cudaError
enumerator :: cudaSuccess=0
enumerator :: cudaErrorMissingConfiguration=1
enumerator :: cudaErrorMemoryAllocation=2
enumerator :: cudaErrorInitializationError=3
enumerator :: cudaErrorLaunchFailure=4
enumerator :: cudaErrorPriorLaunchFailure=5
enumerator :: cudaErrorLaunchTimeout=6
enumerator :: cudaErrorLaunchOutOfResources=7
enumerator :: cudaErrorInvalidDeviceFunction=8
enumerator :: cudaErrorInvalidConfiguration=9
enumerator :: cudaErrorInvalidDevice=10
enumerator :: cudaErrorInvalidValue=11
enumerator :: cudaErrorInvalidPitchValue=12
enumerator :: cudaErrorInvalidSymbol=13
enumerator :: cudaErrorMapBufferObjectFailed=14
enumerator :: cudaErrorUnmapBufferObjectFailed=15
enumerator :: cudaErrorInvalidHostPointer=16
enumerator :: cudaErrorInvalidDevicePointer=17
enumerator :: cudaErrorInvalidTexture=18
enumerator :: cudaErrorInvalidTextureBinding=19
enumerator :: cudaErrorInvalidChannelDescriptor=20
enumerator :: cudaErrorInvalidMemcpyDirection=21
enumerator :: cudaErrorAddressOfConstant=22
enumerator :: cudaErrorTextureFetchFailed=23
enumerator :: cudaErrorTextureNotBound=24
enumerator :: cudaErrorSynchronizationError=25
enumerator :: cudaErrorInvalidFilterSetting=26
enumerator :: cudaErrorInvalidNormSetting=27
enumerator :: cudaErrorMixedDeviceExecution=28
enumerator :: cudaErrorCudartUnloading=29
enumerator :: cudaErrorUnknown=30
enumerator :: cudaErrorNotYetImplemented=31
enumerator :: cudaErrorMemoryValueTooLarge=32
enumerator :: cudaErrorInvalidResourceHandle=33
enumerator :: cudaErrorNotReady=34
enumerator :: cudaErrorInsufficientDriver=35
enumerator :: cudaErrorSetOnActiveProcess=36
enumerator :: cudaErrorInvalidSurface=37
enumerator :: cudaErrorNoDevice=38
enumerator :: cudaErrorECCUncorrectable=39
enumerator :: cudaErrorSharedObjectSymbolNotFound=40
enumerator :: cudaErrorSharedObjectInitFailed=41
enumerator :: cudaErrorUnsupportedLimit=42
enumerator :: cudaErrorDuplicateVariableName=43
enumerator :: cudaErrorDuplicateTextureName=44
enumerator :: cudaErrorDuplicateSurfaceName=45
enumerator :: cudaErrorDevicesUnavailable=46
enumerator :: cudaErrorStartupFailure=127
enumerator :: cudaErrorApiFailureBase=10000
end enum ! cudaError
enum, bind(C) !:: cudaMemcpyKind
enumerator :: cudaMemcpyHostToHost=0
enumerator :: cudaMemcpyHostToDevice=1
enumerator :: cudaMemcpyDeviceToHost=2
enumerator :: cudaMemcpyDeviceToDevice=3
end enum ! cudaMemcpyKind
interface ! [['cudaError_t', None], 'cudaMemcpy', [['void', '*',
'dst'], ['const', 'void', '*', 'src'], ['size_t', None, 'count'],
['enum', 'cudaMemcpyKind', None, 'kind']]]
function cudaMemcpy(dst,src,count,kind_arg) result( res ) bind(C,
name="cudaMemcpy")
use, intrinsic :: ISO_C_BINDING
import cudaSuccess
import cudaMemcpyHostToHost
implicit none
type(c_ptr), value :: dst
type(c_ptr), value :: src
integer(c_int), value :: count
integer (KIND(cudaMemcpyHostToHost)), value :: kind_arg
integer (KIND(cudaSuccess)) :: res
end function cudaMemcpy
end interface
interface ! [['cudaError_t', None], 'cudaMalloc', [['void', '**',
'devPtr'], ['size_t', None, 'size']]]
function cudaMalloc(devPtr,size) result( res ) bind(C,
name="cudaMalloc")
use, intrinsic :: ISO_C_BINDING
import cudaSuccess
implicit none
type(c_ptr) :: devPtr
integer(c_int), value :: size
integer (KIND(cudaSuccess)) :: res
end function cudaMalloc
end interface
interface ! [['cudaError_t', None], 'cudaFree', [['void', '*',
'devPtr']]]
function cudaFree(devPtr) result( res ) bind(C, name="cudaFree")
use, intrinsic :: ISO_C_BINDING
import cudaSuccess
implicit none
type(c_ptr), value :: devPtr
integer (KIND(cudaSuccess)) :: res
end function cudaFree
end interface
interface ! [['cudaError_t', None], 'cudaSetDevice', [['int', None,
'device']]]
function cudaSetDevice(device) result( res ) bind(C,
name="cudaSetDevice")
use, intrinsic :: ISO_C_BINDING
import cudaSuccess
implicit none
integer(c_int), value :: device
integer (KIND(cudaSuccess)) :: res
end function cudaSetDevice
end interface
end module
For calling CUDA C procedures and calculation kernel I use this subroutine
in Fortran:
SUBROUTINE CUDA_SUB(cut_X, cut_size, KL, constant1,constant2,constant3,
index1, index2, variant)
use, intrinsic :: ISO_C_BINDING
use cuda_runtime
integer, parameter :: fp_kind = kind(0.0d0) ! Double precision
integer cut_size
real(fp_kind), dimension(cut_size), target :: cut_X
integer(c_int) :: KL, index1, index2, variant
real(fp_kind) :: constant1,constant2,constant3
type (c_ptr) :: cptr_X
real(fp_kind), allocatable, target :: d_X(:)
real(fp_kind) ,pointer, dimension (:) :: fptr_X
integer(c_int) :: buf_size, cells_num
integer :: err
cells_num = cut_size/KL + 1
allocate(d_X(cut_size))
buf_size = (cut_size+KL)*fp_kind
err = cudaSetDevice(0)
err = cudaMalloc(cptr_X, buf_size)
call c_f_pointer(cptr_X,fptr_X,(/N/))
err = cudaMemcpy(cptr_X, c_loc(cut_X), buf_size, cudaMemCpyHostToDevice)
call cudacalc(fptr_X, cells_num,
KL,index1,index2,constant1,constant2,constant3,variant)
err = cudaMemcpy(c_loc(cut_X), cptr_X, buf_size, cudaMemCpyDeviceToHost)
err = cudaFree(cptr_X)
END
And for calculations this kind of kernel:
#include "stdio.h"
__global__ void Estep1_3(double *X, int Cells, int KL, int index1, int
index2,const double constant1, const double constant2, const double
constant3)
{
int index = threadIdx.x+blockDim.x*blockIdx.x;
int IJ = (index+1)*KL ;
double SP1,SP2,SU1,SU2,S1,S2 ;
if(index<Cells-2) {
SP1=double(0.5)*(X[IJ+6+KL]+X[IJ+6])+X[IJ+10] ;
SP2=double(0.5)*(X[IJ+6-KL]+X[IJ+6])+X[IJ+10-KL] ;
SU1=double(0.5)*(X[IJ+8+KL]+X[IJ+8]) ;
SU2=double(0.5)*(X[IJ+8-KL]+X[IJ+8]) ;
S1=X[IJ+11]-constant1*(SP1*SU1-SP2*SU2)/X[IJ+5] ;
S2=constant1*(X[IJ+index1]*SU1-X[IJ+index1-KL]*SU2)/X[IJ+5] ;
X[IJ+12]=S1+S2 ;
}
}
extern "C" void cudacalc_(double *a, int* N1, int* N2, int* N3, int* N4,
double* N5, double* N6, double* N7, int* N8)
{
int Cells = *N1;
int KL = *N2;
int index1 = *N3;
int index2 = *N4;
double constant1 = *N5;
double constant2 = *N6;
double constant3 = *N7;
int variant = *N8;
switch(variant)
{
case 1:
Estep1_3<<<1,Cells>>>(a,Cells,KL,index1,index2,constant1,constant2,constant3);
break ;
}
}
Then I'm trying to debug it with Allinea DDT tool I have a memory error
"pointer is not to start of memory block" then trying to deallocate memory
for cptr_X (err = cudaFree(cptr_X)). For debugging I use Acer laptop with
gpuocelot emulator and sm_20 architecture specification flag while
compiling. Maybe you can help me with this deallocation issue ? I'm not
familiar with CUDA development, and this program is quite straight forward
implementation of that I normally do in Fortran loops.
Thank you in advance for your help !
No comments:
Post a Comment