blob: dd2be9426534529406344c7a6d07fff01e52fe4d [file] [log] [blame] [raw]
//
// Generated by LLVM NVPTX Back-End
//
.version 6.1
.target sm_35
.address_size 64
// .globl cudaMalloc
.global .align 1 .b8 blockIdx[1];
.global .align 1 .b8 blockDim[1];
.global .align 1 .b8 threadIdx[1];
.visible .func (.param .b32 func_retval0) cudaMalloc(
.param .b64 cudaMalloc_param_0,
.param .b64 cudaMalloc_param_1
)
{
.local .align 8 .b8 __local_depot0[16];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .b32 %r<2>;
.reg .b64 %rd<3>;
mov.u64 %SPL, __local_depot0;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd2, [cudaMalloc_param_1];
ld.param.u64 %rd1, [cudaMalloc_param_0];
st.u64 [%SP+0], %rd1;
st.u64 [%SP+8], %rd2;
mov.u32 %r1, 30;
st.param.b32 [func_retval0+0], %r1;
ret;
}
// .globl cudaFuncGetAttributes
.visible .func (.param .b32 func_retval0) cudaFuncGetAttributes(
.param .b64 cudaFuncGetAttributes_param_0,
.param .b64 cudaFuncGetAttributes_param_1
)
{
.local .align 8 .b8 __local_depot1[16];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .b32 %r<2>;
.reg .b64 %rd<3>;
mov.u64 %SPL, __local_depot1;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd2, [cudaFuncGetAttributes_param_1];
ld.param.u64 %rd1, [cudaFuncGetAttributes_param_0];
st.u64 [%SP+0], %rd1;
st.u64 [%SP+8], %rd2;
mov.u32 %r1, 30;
st.param.b32 [func_retval0+0], %r1;
ret;
}
// .globl cudaDeviceGetAttribute
.visible .func (.param .b32 func_retval0) cudaDeviceGetAttribute(
.param .b64 cudaDeviceGetAttribute_param_0,
.param .b32 cudaDeviceGetAttribute_param_1,
.param .b32 cudaDeviceGetAttribute_param_2
)
{
.local .align 8 .b8 __local_depot2[16];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .b32 %r<4>;
.reg .b64 %rd<2>;
mov.u64 %SPL, __local_depot2;
cvta.local.u64 %SP, %SPL;
ld.param.u32 %r2, [cudaDeviceGetAttribute_param_2];
ld.param.u32 %r1, [cudaDeviceGetAttribute_param_1];
ld.param.u64 %rd1, [cudaDeviceGetAttribute_param_0];
st.u64 [%SP+0], %rd1;
st.u32 [%SP+8], %r1;
st.u32 [%SP+12], %r2;
mov.u32 %r3, 30;
st.param.b32 [func_retval0+0], %r3;
ret;
}
// .globl cudaGetDevice
.visible .func (.param .b32 func_retval0) cudaGetDevice(
.param .b64 cudaGetDevice_param_0
)
{
.local .align 8 .b8 __local_depot3[8];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .b32 %r<2>;
.reg .b64 %rd<2>;
mov.u64 %SPL, __local_depot3;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd1, [cudaGetDevice_param_0];
st.u64 [%SP+0], %rd1;
mov.u32 %r1, 30;
st.param.b32 [func_retval0+0], %r1;
ret;
}
// .globl cudaOccupancyMaxActiveBlocksPerMultiprocessor
.visible .func (.param .b32 func_retval0) cudaOccupancyMaxActiveBlocksPerMultiprocessor(
.param .b64 cudaOccupancyMaxActiveBlocksPerMultiprocessor_param_0,
.param .b64 cudaOccupancyMaxActiveBlocksPerMultiprocessor_param_1,
.param .b32 cudaOccupancyMaxActiveBlocksPerMultiprocessor_param_2,
.param .b64 cudaOccupancyMaxActiveBlocksPerMultiprocessor_param_3
)
{
.local .align 8 .b8 __local_depot4[32];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .b32 %r<3>;
.reg .b64 %rd<4>;
mov.u64 %SPL, __local_depot4;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd3, [cudaOccupancyMaxActiveBlocksPerMultiprocessor_param_3];
ld.param.u32 %r1, [cudaOccupancyMaxActiveBlocksPerMultiprocessor_param_2];
ld.param.u64 %rd2, [cudaOccupancyMaxActiveBlocksPerMultiprocessor_param_1];
ld.param.u64 %rd1, [cudaOccupancyMaxActiveBlocksPerMultiprocessor_param_0];
st.u64 [%SP+0], %rd1;
st.u64 [%SP+8], %rd2;
st.u32 [%SP+16], %r1;
st.u64 [%SP+24], %rd3;
mov.u32 %r2, 30;
st.param.b32 [func_retval0+0], %r2;
ret;
}
// .globl cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
.visible .func (.param .b32 func_retval0) cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
.param .b64 cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_param_0,
.param .b64 cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_param_1,
.param .b32 cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_param_2,
.param .b64 cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_param_3,
.param .b32 cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_param_4
)
{
.local .align 8 .b8 __local_depot5[40];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .b32 %r<4>;
.reg .b64 %rd<4>;
mov.u64 %SPL, __local_depot5;
cvta.local.u64 %SP, %SPL;
ld.param.u32 %r2, [cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_param_4];
ld.param.u64 %rd3, [cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_param_3];
ld.param.u32 %r1, [cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_param_2];
ld.param.u64 %rd2, [cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_param_1];
ld.param.u64 %rd1, [cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_param_0];
st.u64 [%SP+0], %rd1;
st.u64 [%SP+8], %rd2;
st.u32 [%SP+16], %r1;
st.u64 [%SP+24], %rd3;
st.u32 [%SP+32], %r2;
mov.u32 %r3, 30;
st.param.b32 [func_retval0+0], %r3;
ret;
}
// .globl _Z6vecAddPfS_S_i
.visible .entry _Z6vecAddPfS_S_i(
.param .u64 _Z6vecAddPfS_S_i_param_0,
.param .u64 _Z6vecAddPfS_S_i_param_1,
.param .u64 _Z6vecAddPfS_S_i_param_2,
.param .u32 _Z6vecAddPfS_S_i_param_3
)
{
.local .align 8 .b8 __local_depot6[32];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<2>;
.reg .f32 %f<4>;
.reg .b32 %r<9>;
.reg .b64 %rd<18>;
mov.u64 %SPL, __local_depot6;
cvta.local.u64 %SP, %SPL;
ld.param.u32 %r1, [_Z6vecAddPfS_S_i_param_3];
ld.param.u64 %rd3, [_Z6vecAddPfS_S_i_param_2];
ld.param.u64 %rd2, [_Z6vecAddPfS_S_i_param_1];
ld.param.u64 %rd1, [_Z6vecAddPfS_S_i_param_0];
cvta.to.global.u64 %rd4, %rd3;
cvta.global.u64 %rd5, %rd4;
cvta.to.global.u64 %rd6, %rd2;
cvta.global.u64 %rd7, %rd6;
cvta.to.global.u64 %rd8, %rd1;
cvta.global.u64 %rd9, %rd8;
st.u64 [%SP+0], %rd9;
st.u64 [%SP+8], %rd7;
st.u64 [%SP+16], %rd5;
st.u32 [%SP+24], %r1;
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %ntid.x;
mul.lo.s32 %r4, %r2, %r3;
mov.u32 %r5, %tid.x;
add.s32 %r6, %r4, %r5;
st.u32 [%SP+28], %r6;
ld.u32 %r7, [%SP+28];
ld.u32 %r8, [%SP+24];
setp.ge.s32 %p1, %r7, %r8;
@%p1 bra LBB6_2;
bra.uni LBB6_1;
LBB6_1:
ld.u64 %rd10, [%SP+0];
ld.s32 %rd11, [%SP+28];
shl.b64 %rd12, %rd11, 2;
add.s64 %rd13, %rd10, %rd12;
ld.f32 %f1, [%rd13];
ld.u64 %rd14, [%SP+8];
add.s64 %rd15, %rd14, %rd12;
ld.f32 %f2, [%rd15];
add.rn.f32 %f3, %f1, %f2;
ld.u64 %rd16, [%SP+16];
add.s64 %rd17, %rd16, %rd12;
st.f32 [%rd17], %f3;
bra.uni LBB6_2;
LBB6_2:
ret;
}