o
    j9:jR                     @   s  d dl Z d dlZd dlmZ d dlZz
d dlmZ dZW n e	y)   dZdZY nw d dl
mZ de jfdd	Zde jfd
dZde jfddZdeddfddZdedefddZde jfddZde jfddZde jfddZdee fddZ				d0dedededB dedB dedB d edeeef fd!d"ZG d#d$ d$ZG d%d& d&Z	d1d'eeB d(ee dB dee ed&f B fd)d*Z!	d2d+ed,ed-edefd.d/ZdS )3    N)Any)runtimeTF)_get_device_indexreturnc               	   C   s   zdd l } tt| dd }W n# ttfy5   tjdkr.tdt	j
jd  d}ntd}Y nw |j|_|j|_|j|_|j|_|j|_|S )Nr   amdhip64win32	amdhip64_.dllzlibamdhip64.so)rocm_sdkctypesCDLLstrfind_librariesImportError
IndexErrorsysplatformtorchversionhiphipGetErrorStringcuGetErrorStringhipModuleLoadDatacuModuleLoadDatahipModuleGetFunctioncuModuleGetFunctionhipModuleLaunchKernelcuLaunchKernelhipFuncSetAttributecuFuncSetAttribute)r
   lib r!   X/home/nk/hobo-godmode/plappi-mvp/.venv/lib/python3.10/site-packages/torch/cuda/_utils.py_get_hip_runtime_library   s   

r#   c                   C   s   t jdkr
tdS tdS )Nr   z
nvcuda.dllzlibcuda.so.1)r   r   r   r   r!   r!   r!   r"   _get_cuda_library-   s   


r$   c                   C      t jjrt S t S N)r   r   r   r#   r$   r!   r!   r!   r"   _get_gpu_runtime_library5   s   r'   resultc                 C   sR   | dkrd S t  }t }|| t | |jd ur |j nd}td| )Nr   Unknown CUDA errorCUDA error: )r   c_char_pr'   r   byrefvaluedecodeRuntimeError)r(   err_strlibcudaerror_messager!   r!   r"   _check_cuda=   s   r3   c                 C   s~   t std| ^}}|tjjkr+t|\}}t|tr | }td| d| dt	|dkr3dS t	|dkr=|d S |S )a  Check a cuda.bindings (cuda-python) call result for errors.

    All cuda.bindings runtime calls return ``(error, *outputs)``.  This
    helper unpacks the tuple, raises on non-success, and returns the
    outputs (``None`` for zero outputs, scalar for one, tuple otherwise).
    zcuda.bindings is not availabler*   z ()r   N   )
_HAS_CUDA_BINDINGSr/   _cuda_bindings_runtimecudaError_tcudaSuccesscudaGetErrorString
isinstancebytesr.   len)r(   errout_r0   r!   r!   r"   _check_cuda_bindingsI   s$   
rA   c               
   C   s   zdd l } tt| dd }W n0 ttfyB   tjdkr;d	dt
jjd dt
jjd g}td| d}ntd}Y nw |j|_|j|_|j|_|j|_|j|_|j|_|j|_|j|_|j|_|j|_ |S )	Nr   hiprtcr    0   r	   zlibhiprtc.so)!r
   r   r   r   r   r   r   r   r   joinr   r   r   hiprtcGetErrorStringnvrtcGetErrorStringhiprtcCreateProgramnvrtcCreateProgramhiprtcDestroyProgramnvrtcDestroyProgramhiprtcCompileProgramnvrtcCompileProgramhiprtcGetCodeSizenvrtcGetCUBINSizehiprtcGetCodenvrtcGetCUBINhiprtcGetProgramLogSizenvrtcGetProgramLogSizehiprtcGetProgramLognvrtcGetProgramLoghiprtcAddNameExpressionnvrtcAddNameExpressionhiprtcGetLoweredNamenvrtcGetLoweredName)r
   r    version_strr!   r!   r"   _get_hiprtc_libraryf   s.   


r\   c               	   C   sr   t tjjdd } tjdkrd|  dg}nd|  dg}|D ]}zt|W   S  t	y4   Y q!w t	d)	N.r   r   nvrtc64_z0_0.dllzlibnvrtc.so.zlibnvrtc.soz Could not find any NVRTC library)
intr   r   cudasplitr   r   r   r   OSError)major_version
nvrtc_libslib_namer!   r!   r"   _get_nvrtc_library   s   

rf   c                   C   r%   r&   )r   r   r   r\   rf   r!   r!   r!   r"   _get_gpu_rtc_library   s   rg   c                     s>   ddl m} m} dh  fdd|D }tjjr||  |S )z
    Get HIPCC/NVCC flags that are compatible with NVRTC compilation.

    Returns:
        List of HIPCC/NVCC flags that can be safely used with NVRTC.
    r   )COMMON_HIPCC_FLAGSCOMMON_NVCC_FLAGSz--expt-relaxed-constexprc                    s   g | ]}| vr|qS r!   r!   .0flagnvrtc_unsupported_flagsr!   r"   
<listcomp>   s    z1_get_gpu_rtc_compatible_flags.<locals>.<listcomp>)torch.utils.cpp_extensionrh   ri   r   r   r   extend)rh   ri   compatible_flagsr!   rm   r"   _get_gpu_rtc_compatible_flags   s   

rs   kernel_sourcekernel_namecompute_capabilitycuda_include_dirsnvcc_optionsauto_pchc              	      s  ddl }t d dtddf fdd}| d}|du r8|j|j }	|jjr0|	j	 }n|	j
 |	j }g }
|jjrI|
d|   n
|
d	|   dd
lm} |d}|D ]}|
d|   q_|r}|D ]}|
d|   qp|rt|jjdk rtd|jj |du rg }|d |r|D ]
}|
|d qt }|
dd |D  t|
}tj| |
 }t }|t||| d ddd |d}||| |||}| krt }|t| t|j}|| t d|j!  t }|"|t| t|j}|#|| t }|$||t| |jdurO|j! }nd}%t| |j&|fS )a  
    Compiles a CUDA kernel using NVRTC and returns the PTX code.

    Args:
        kernel_source (str): The CUDA kernel source code as a string
        kernel_name (str): The name of the kernel function to compile
        compute_capability (str, None): The compute capability to target (e.g., "86").
                                           If None, will detect from current device.
        cuda_include_dirs (list, None): List of directories containing CUDA headers
        nvcc_options (list, None): Additional options to pass to NVRTC
        auto_pch (bool): Enable automatic precompiled headers (CUDA 12.8+)

    Returns:
        Tuple[bytes, str]: The compiled PTX code and mangled kernel name
    r   Nr(   r   c                    sL   |  kr$t  }| t | |jd ur|j nd}td| d S )Nr)   r*   )r   r+   rH   r,   r-   r.   r/   )r(   r0   r2   NVRTC_SUCCESSlibnvrtcr!   r"   check_nvrtc   s   

z#_nvrtc_compile.<locals>.check_nvrtcutf-8z--offload-arch=z--gpu-architecture=sm_)include_pathsr`   z-Iz12.8zPCH requires CUDA 12.8+, got z--pchc                 S   s   g | ]}| d qS )r~   )encoderj   r!   r!   r"   ro     s    z"_nvrtc_compile.<locals>.<listcomp>z.cuzKernel compilation failed:
rC   )'
torch.cudarg   r_   r   r`   get_device_propertiescurrent_devicer   r   gcnArchNamemajorminorappendrp   r   r   AssertionErrorrs   rq   r=   r   r+   c_void_prJ   r,   rX   rN   c_size_trT   create_string_bufferr-   rV   r/   r.   rP   rR   rZ   rL   raw)rt   ru   rv   rw   rx   ry   r   r}   source_bytespropsoptionsr   cuda_include_paths	cuda_path	directoryoptionnvrtc_compatible_flagsnum_optionsoptions_arrayprogc_kernel_namereslog_sizelogbinary_sizebinaryc_mangled_namemangled_namer!   rz   r"   _nvrtc_compile   s   





r   c                   @   s2   e Zd ZdejddfddZdeddfdd	ZdS )
_CudaModulemoduler   Nc                 C   s   || _ i | _d S r&   )_module_kernels)selfr   r!   r!   r"   __init__J  s   
z_CudaModule.__init__name_CudaKernelc              
   C   s   || j v r
| j | S ddlm} | }t }zt|t|| j|	d t
|| j}|| j |< |W S  tyJ } z	td| d|d }~ww )Nr   )r'   r~   zNo kernel named 'z' in this module)r   torch.cuda._utilsr'   r   r   r3   r   r,   r   r   r   r/   AttributeError)r   r   r'   r1   funckernelr>   r!   r!   r"   __getattr__N  s$   


z_CudaModule.__getattr__)__name__
__module____qualname__r   r   r   r   r   r!   r!   r!   r"   r   I  s    r   c                   @   s   e Zd ZdZdejdejddfddZ						dd
eeeef deeeef de	dB dede
dB ddfddZdeddfddZdS )r   zT
    Represents a compiled CUDA kernel that can be called with PyTorch tensors.
    r   r   r   Nc                 C   s   || _ || _d| _d S )Nr   )r   r   _max_shared_mem_bytes)r   r   r   r!   r!   r"   r   l  s   
z_CudaKernel.__init__r5   r5   r5   r   gridblockargs
shared_memstreamc                 C   s  ddl }|jj }|sg }g }g }	|D ]Y}
t|
|jr?|
js*|
jr&|
 s*t	dt
|
 }|| |	t
| qt|
trRt
|
}|	t
| qt|
tret
|
}|	t
| qtdt|
 t
jt|	  }t|	D ]\}}
t
|
t
j||< qz|du rddl}|j }|dkr| jdks|| jkr| jdkrdnd| j d}td	| d
| dt|| j|d |d |d |d |d |d ||j|d dS )a  
        Call the compiled CUDA kernel

        Args:
            grid (tuple): Grid dimensions (grid_x, grid_y, grid_z)
            block (tuple): Block dimensions (block_x, block_y, block_z)
            args (list): List of arguments to pass to the kernel.
                         PyTorch tensor arguments will be automatically converted to pointers.
            shared_mem (int): Shared memory size in bytes
            stream (torch.cuda.Stream): CUDA stream to use. If None, uses current stream.
        r   Nz?All tensor arguments must be CUDA tensors or pinned CPU tensorszUnsupported argument type:    znot configuredzonly z bytes configuredzKernel requires z' bytes of shared memory (>= 48KB), but ze. Call kernel.set_shared_memory_config(shared_mem) after compilation and before launching the kernel.r5   rE   ) r   r`   _utilsr'   r;   Tensoris_cudais_cpu	is_pinned
ValueErrorr   r   data_ptrr   r,   r_   c_intfloatc_double	TypeErrortyper=   	enumeratecastr   current_streamr   r/   r3   r   r   _as_parameter_)r   r   r   r   r   r   r   r1   processed_argsc_argsargptrr   r   c_args_arrayiconfigured_msgr!   r!   r"   __call__q  sl   






z_CudaKernel.__call__shared_mem_bytesc                 C   s   |dk r	|| _ d S t }tj }tjjr|jdkrdnd}nt|dd}||kr4t	d| d| dd	}t
|| j|| || _ d S )
Nr   gfx950i   i  shared_memory_per_block_optinzRequested shared memory (z bytes) exceeds device limit (z= bytes). Consider reducing block size or shared memory usage.   )r   r'   r   r`   r   r   r   r   getattrr/   r3   r   r   )r   r   r1   device_propsmax_shared_mem+cudaFuncAttributeMaxDynamicSharedMemorySizer!   r!   r"   set_shared_memory_config  s4   

z$_CudaKernel.set_shared_memory_config)r   r   Nr   N)r   r   r   __doc__r   r   r   tupler_   listr   r   r   r!   r!   r!   r"   r   g  s,    
ar   ptxkernel_namesc           	   	   C   s   ddl }t }t| tr| d} t }|j }| t	|
t||  W d   n1 s2w   Y  |s=t|S i }|D ]}t }t	|t|||d t||||< qA|S )a,  
    Loads a CUDA module from PTX code and returns a module object that can access kernels.

    Args:
        ptx (bytes or str): The PTX code to load
        kernel_names (list, optional): List of kernel names to extract from the module.
                                      If None, will return a module object with __getattr__.

    Returns:
        object: If kernel_names is None, returns a module object with __getattr__ to access kernels.
               If kernel_names is provided, returns a dict mapping kernel names to _CudaKernel objects.
    r   Nr~   )r   r'   r;   r   r   r   r   r`   r   r3   r   r,   r   r   r   )	r   r   r   r1   r   r   kernelsr   r   r!   r!   r"   _cuda_load_module  s*   


r   deviceoptional	allow_cpuc                 C   s   t | tr| S t | trt| } t | tjr2|r&| jdvr%td|  n| jdkr2td|  tj sAt | tj	jrA| j
S t| ||S )a  Get the device index from :attr:`device`, which can be a torch.device object, a Python integer, or ``None``.

    If :attr:`device` is a torch.device object, returns the device index if it
    is a CUDA device. Note that for a CUDA device without a specified index,
    i.e., ``torch.device('cuda')``, this will return the current default CUDA
    device if :attr:`optional` is ``True``. If :attr:`allow_cpu` is ``True``,
    CPU devices will be accepted and ``-1`` will be returned in this case.

    If :attr:`device` is a Python integer, it is returned as is.

    If :attr:`device` is ``None``, this will return the current default CUDA
    device if :attr:`optional` is ``True``.
    )r`   cpuz(Expected a cuda or cpu device, but got: r`   z!Expected a cuda device, but got: )r;   r_   r   r   r   r   r   jitis_scriptingr`   idx_torch_get_device_index)r   r   r   r!   r!   r"   r   -  s   





r   )NNNFr&   )FF)"r   r   typingr   r   cuda.bindingsr   r7   r6   r   torch._utilsr   r   r   r#   r$   r'   r_   r3   rA   r\   rf   rg   r   r   rs   boolr   r<   r   r   r   dictr   r!   r!   r!   r"   <module>   s|    	

  

1