Sunday, 18 August 2013

CUDA "pointer is not to start of memory block"

CUDA "pointer is not to start of memory block"

I'm trying to set up some scientific calculations on CUDA. My code is
originally written if Fortran, so I taken piece of the interface block
from FORTCuda project interface http://sourceforge.net/projects/fortcuda/
to call CUDA C functions:
module cuda_runtime
use, intrinsic :: ISO_C_BINDING
use cuda_unknowns
enum, bind(C) !:: cudaError
enumerator :: cudaSuccess=0
enumerator :: cudaErrorMissingConfiguration=1
enumerator :: cudaErrorMemoryAllocation=2
enumerator :: cudaErrorInitializationError=3
enumerator :: cudaErrorLaunchFailure=4
enumerator :: cudaErrorPriorLaunchFailure=5
enumerator :: cudaErrorLaunchTimeout=6
enumerator :: cudaErrorLaunchOutOfResources=7
enumerator :: cudaErrorInvalidDeviceFunction=8
enumerator :: cudaErrorInvalidConfiguration=9
enumerator :: cudaErrorInvalidDevice=10
enumerator :: cudaErrorInvalidValue=11
enumerator :: cudaErrorInvalidPitchValue=12
enumerator :: cudaErrorInvalidSymbol=13
enumerator :: cudaErrorMapBufferObjectFailed=14
enumerator :: cudaErrorUnmapBufferObjectFailed=15
enumerator :: cudaErrorInvalidHostPointer=16
enumerator :: cudaErrorInvalidDevicePointer=17
enumerator :: cudaErrorInvalidTexture=18
enumerator :: cudaErrorInvalidTextureBinding=19
enumerator :: cudaErrorInvalidChannelDescriptor=20
enumerator :: cudaErrorInvalidMemcpyDirection=21
enumerator :: cudaErrorAddressOfConstant=22
enumerator :: cudaErrorTextureFetchFailed=23
enumerator :: cudaErrorTextureNotBound=24
enumerator :: cudaErrorSynchronizationError=25
enumerator :: cudaErrorInvalidFilterSetting=26
enumerator :: cudaErrorInvalidNormSetting=27
enumerator :: cudaErrorMixedDeviceExecution=28
enumerator :: cudaErrorCudartUnloading=29
enumerator :: cudaErrorUnknown=30
enumerator :: cudaErrorNotYetImplemented=31
enumerator :: cudaErrorMemoryValueTooLarge=32
enumerator :: cudaErrorInvalidResourceHandle=33
enumerator :: cudaErrorNotReady=34
enumerator :: cudaErrorInsufficientDriver=35
enumerator :: cudaErrorSetOnActiveProcess=36
enumerator :: cudaErrorInvalidSurface=37
enumerator :: cudaErrorNoDevice=38
enumerator :: cudaErrorECCUncorrectable=39
enumerator :: cudaErrorSharedObjectSymbolNotFound=40
enumerator :: cudaErrorSharedObjectInitFailed=41
enumerator :: cudaErrorUnsupportedLimit=42
enumerator :: cudaErrorDuplicateVariableName=43
enumerator :: cudaErrorDuplicateTextureName=44
enumerator :: cudaErrorDuplicateSurfaceName=45
enumerator :: cudaErrorDevicesUnavailable=46
enumerator :: cudaErrorStartupFailure=127
enumerator :: cudaErrorApiFailureBase=10000
end enum ! cudaError
enum, bind(C) !:: cudaMemcpyKind
enumerator :: cudaMemcpyHostToHost=0
enumerator :: cudaMemcpyHostToDevice=1
enumerator :: cudaMemcpyDeviceToHost=2
enumerator :: cudaMemcpyDeviceToDevice=3
end enum ! cudaMemcpyKind
interface ! [['cudaError_t', None], 'cudaMemcpy', [['void', '*',
'dst'], ['const', 'void', '*', 'src'], ['size_t', None, 'count'],
['enum', 'cudaMemcpyKind', None, 'kind']]]
function cudaMemcpy(dst,src,count,kind_arg) result( res ) bind(C,
name="cudaMemcpy")
use, intrinsic :: ISO_C_BINDING
import cudaSuccess
import cudaMemcpyHostToHost
implicit none
type(c_ptr), value :: dst
type(c_ptr), value :: src
integer(c_int), value :: count
integer (KIND(cudaMemcpyHostToHost)), value :: kind_arg
integer (KIND(cudaSuccess)) :: res
end function cudaMemcpy
end interface
interface ! [['cudaError_t', None], 'cudaMalloc', [['void', '**',
'devPtr'], ['size_t', None, 'size']]]
function cudaMalloc(devPtr,size) result( res ) bind(C,
name="cudaMalloc")
use, intrinsic :: ISO_C_BINDING
import cudaSuccess
implicit none
type(c_ptr) :: devPtr
integer(c_int), value :: size
integer (KIND(cudaSuccess)) :: res
end function cudaMalloc
end interface
interface ! [['cudaError_t', None], 'cudaFree', [['void', '*',
'devPtr']]]
function cudaFree(devPtr) result( res ) bind(C, name="cudaFree")
use, intrinsic :: ISO_C_BINDING
import cudaSuccess
implicit none
type(c_ptr), value :: devPtr
integer (KIND(cudaSuccess)) :: res
end function cudaFree
end interface
interface ! [['cudaError_t', None], 'cudaSetDevice', [['int', None,
'device']]]
function cudaSetDevice(device) result( res ) bind(C,
name="cudaSetDevice")
use, intrinsic :: ISO_C_BINDING
import cudaSuccess
implicit none
integer(c_int), value :: device
integer (KIND(cudaSuccess)) :: res
end function cudaSetDevice
end interface
end module
For calling CUDA C procedures and calculation kernel I use this subroutine
in Fortran:
SUBROUTINE CUDA_SUB(cut_X, cut_size, KL, constant1,constant2,constant3,
index1, index2, variant)
use, intrinsic :: ISO_C_BINDING
use cuda_runtime
integer, parameter :: fp_kind = kind(0.0d0) ! Double precision
integer cut_size
real(fp_kind), dimension(cut_size), target :: cut_X
integer(c_int) :: KL, index1, index2, variant
real(fp_kind) :: constant1,constant2,constant3
type (c_ptr) :: cptr_X
real(fp_kind), allocatable, target :: d_X(:)
real(fp_kind) ,pointer, dimension (:) :: fptr_X
integer(c_int) :: buf_size, cells_num
integer :: err
cells_num = cut_size/KL + 1
allocate(d_X(cut_size))
buf_size = (cut_size+KL)*fp_kind
err = cudaSetDevice(0)
err = cudaMalloc(cptr_X, buf_size)
call c_f_pointer(cptr_X,fptr_X,(/N/))
err = cudaMemcpy(cptr_X, c_loc(cut_X), buf_size, cudaMemCpyHostToDevice)
call cudacalc(fptr_X, cells_num,
KL,index1,index2,constant1,constant2,constant3,variant)
err = cudaMemcpy(c_loc(cut_X), cptr_X, buf_size, cudaMemCpyDeviceToHost)
err = cudaFree(cptr_X)
END
And for calculations this kind of kernel:
#include "stdio.h"
__global__ void Estep1_3(double *X, int Cells, int KL, int index1, int
index2,const double constant1, const double constant2, const double
constant3)
{
int index = threadIdx.x+blockDim.x*blockIdx.x;
int IJ = (index+1)*KL ;
double SP1,SP2,SU1,SU2,S1,S2 ;
if(index<Cells-2) {
SP1=double(0.5)*(X[IJ+6+KL]+X[IJ+6])+X[IJ+10] ;
SP2=double(0.5)*(X[IJ+6-KL]+X[IJ+6])+X[IJ+10-KL] ;
SU1=double(0.5)*(X[IJ+8+KL]+X[IJ+8]) ;
SU2=double(0.5)*(X[IJ+8-KL]+X[IJ+8]) ;
S1=X[IJ+11]-constant1*(SP1*SU1-SP2*SU2)/X[IJ+5] ;
S2=constant1*(X[IJ+index1]*SU1-X[IJ+index1-KL]*SU2)/X[IJ+5] ;
X[IJ+12]=S1+S2 ;
}
}
extern "C" void cudacalc_(double *a, int* N1, int* N2, int* N3, int* N4,
double* N5, double* N6, double* N7, int* N8)
{
int Cells = *N1;
int KL = *N2;
int index1 = *N3;
int index2 = *N4;
double constant1 = *N5;
double constant2 = *N6;
double constant3 = *N7;
int variant = *N8;
switch(variant)
{
case 1:
Estep1_3<<<1,Cells>>>(a,Cells,KL,index1,index2,constant1,constant2,constant3);
break ;
}
}
Then I'm trying to debug it with Allinea DDT tool I have a memory error
"pointer is not to start of memory block" then trying to deallocate memory
for cptr_X (err = cudaFree(cptr_X)). For debugging I use Acer laptop with
gpuocelot emulator and sm_20 architecture specification flag while
compiling. Maybe you can help me with this deallocation issue ? I'm not
familiar with CUDA development, and this program is quite straight forward
implementation of that I normally do in Fortran loops.
Thank you in advance for your help !

No comments:

Post a Comment