o
    m9:j                  	   @   s  d dl Z d dlmZ d dlmZ d dlmZ d dlZd dlm	Z	m
Z
 d dlmZmZmZmZmZmZmZmZmZmZmZmZmZ g dZedd	ZG d
d dejZdejfdejdededefddZ G dd deZ!G dd deZ"da#dd Z$da%dd Z&dS )    N)
namedtuple)Callable)Any))sparse_semi_structured_from_dense_cutlass'sparse_semi_structured_to_dense_cutlass)fallback_dispatchersemi_sparse_addmmsemi_sparse_clonesemi_sparse_detachsemi_sparse_indicessemi_sparse_linearsemi_sparse_mmsemi_sparse_scaled_mmsemi_sparse_tsemi_sparse_tosemi_sparse_to_copysemi_sparse_valuessemi_sparse_view)SparseSemiStructuredTensor!SparseSemiStructuredTensorCUTLASS$SparseSemiStructuredTensorCUSPARSELTto_sparse_semi_structured_SEMI_STRUCTURED_SPARSE_CONFIGz=sparse_min_rows sparse_min_cols dense_min_rows dense_min_colsc                   @   s  e Zd ZU dZdZeed< eej	e
f ed< dZeed< dZeed< dZeed< eed	< eeef ed
< ejdB ed< ejdB ed< ejdB ed< ejdB ed< ejdB ed< eed< eed< g dZe			d1dejdejdB dejdB dejdB dejdB dejdB dededefddZdefddZdeee eejeeef f fddZedeejeeef dejfddZejjZedefd d!Z ed2d3d"d#Z!ed$ejddfd%d&Z"d'd( Z#eefd$ejd)edd fd*d+Z$dd,d-ejd.ejdB dejfd/d0Z%dS )4r   a  
    This class implements semi-structured sparsity as a Tensor subclass.

    Semi-structured sparsity describes a sparsity pattern where n in every 2n elements are sparse,
    depending on the datatype. It is also referred to as 2:4 sparsity or fine-grained
    structured sparsity.

    There are two backends available for semi_structred sparsity, either cuSPARSELt or CUTLASS.
    This class is meant to serve as a base class for both implementations. SparseSemiStructuredCUTLASS
    and SparseSemiStructuredCUSPARSELT both inherit from this class and define three backend-specific items.
    Note that as such, this class cannot be instantiated directly.

    -`_DTYPE_SHAPE_CONSTRAINTS` - A dictionary holding backend specific dense/sparse min shape constraints
    - `def from_dense()` - backend specific compression routines
    - `def _mm()` - backend specific mm op (either torch._cslt_sparse_mm or torch._sparse_semi_structured_(mm|addmm))
    r   _DEFAULT_ALG_ID_DTYPE_SHAPE_CONSTRAINTSF_FORCE_CUTLASS_FUSE_TRANSPOSE_PROTOTYPE_WARNING_SHOWNBACKENDSPARSE_DISPATCHNpackedmetapacked_tmeta_tcompressed_swizzled_bitmaskfuse_transpose_cusparseltalg_id_cusparselt)r    r!   r"   r#   r$   shaperequires_gradc
                 C   s   | j stjdtdd d| _ |   tj|  |dur|}
n|dur&|}
ntdtj	j
| ||
j|
j|
j|	d}||_||_||_||_||_||_||_|S )a0  
        Create a new instance of the tensor subclass from the compressed sparse representation.

        We have the option to create the subclass with the compressed representations of both X and X', for training.
        For inference, we only need a single representation (either X or X'), while the corresponding other set will be None.

        Depending on the backend selected, certain fields will be set to None. (CUSPARSELT vs CUTLASS)

        Args:
            shape: The shape of the original dense tensor
            packed: The compressed representation of the original dense tensor
            meta: The metadata of the original dense tensor, if it is stored separately
            packed_t: The compressed representation of the transposed original dense tensor
            meta_t: The metadata of the transposed original dense tensor, if it is stored separately
            compressed_swizzled_bitmask: The masks used by the CUTLASS backend to determine which threads should
                                         participate in the computation. Used for pointwise ops.
            fuse_transpose_cusparselt: When running with cuSPARSELt, we have the option to fuse a transposition
                                       with a matmul, which is useful in the case of 2:4 sparse training.
            alg_id_cusparselt: The algorithm id to use when using cuSPARSELT, will have effect on performance

        Returns:
            torch.Tensor: A torch.Tensor wrapper subclass.

        Raises:
            ValueError: If all of the tensor arguments are None.
        zThe PyTorch API of SparseSemiStructuredTensor is in prototype stage and will change in the near future. Please open a Github issue for features requests and see our documentation on the torch.sparse module for further information about the project.   
stacklevelTNz3At least one of packed or packed_t must be provided)devicedtypelayoutr(   )r   warningswarnUserWarning_load_dispatch_tabletorch_dynamoallow_in_graph
ValueErrorTensor_make_wrapper_subclassr,   r-   r.   r    r!   r"   r#   r$   r%   r&   )clsr'   r    r!   r"   r#   r$   r%   r&   r(   previous_tensortensor r<   c/home/nk/hobo-godmode/plappi-mvp/.venv/lib/python3.10/site-packages/torch/sparse/semi_structured.py__new__O   s<   '
	z"SparseSemiStructuredTensor.__new__returnc                 C   s(   t | ds	td| jj d| j dS )Nr'   ztensor has no shape attributez(shape=))hasattrAssertionError	__class____name__r'   selfr<   r<   r=   __repr__   s   
z#SparseSemiStructuredTensor.__repr__c                    s4   t t fdd j} j j j jf}||fS )Nc                    s   t  | d uS N)getattr)xrE   r<   r=   <lambda>   s    z?SparseSemiStructuredTensor.__tensor_flatten__.<locals>.<lambda>)listfilter	__slots__r'   r%   r&   r(   )rF   inner_tensorstensor_metar<   rE   r=   __tensor_flatten__   s   z-SparseSemiStructuredTensor.__tensor_flatten__rP   c           	      C   sN   |\}}}}| || dd | dd | dd | dd | dd |||d	S )Nr    r!   r"   r#   r$   	r'   r    r!   r"   r#   r$   r%   r&   r(   )get)	r9   rO   rP   
outer_sizeouter_strider'   r%   r&   r(   r<   r<   r=   __tensor_unflatten__   s   



z/SparseSemiStructuredTensor.__tensor_unflatten__c                 C   s:   |j | jvrt| j d|j d| j|j  ||||S )NzI only supports a specific set of operations, can't perform requested op (r@   )_overloadpacketr   NotImplementedErrorrD   )r9   functypesargskwargsr<   r<   r=   __torch_dispatch__   s   z-SparseSemiStructuredTensor.__torch_dispatch__c                 C   s   t | dddu rbtjjjttjjjttjjjt	tjjj
t	tjjjttjjjttjjjttjjjttjjjttjjjttjjjttjjjttjjjttjjjttjjjti| _ |durd| j !| dS dS dS )zT
        Loads the op overload sparse dispatch table for the current class.
        r   N)"rI   r3   opsatenvaluesr   indicesr   is_same_sizer   detach_detachr
   tr   viewr   mmr   matmuladdmmr   linearr   _to_copyr   
_scaled_mmr   cloner	   tor   r   update)r9   custom_dispatch_tabler<   r<   r=   r2      s*   














z/SparseSemiStructuredTensor._load_dispatch_tableoriginal_tensorc                 C   s   |j std|j d| dkrtd|  d| s$td|j| jvr6td|j d|  d	|j\}}| j|j j}| j|j j	}||k sY|| sY||k sY|| rhtd
|j d| d| ddS )z_
        Assert that the given tensor is valid for semi-structured sparse compression.
        zError original_tensor.device= z= is not supported! Only CUDA tensors are currently supported.r)   zError original_tensor.dim = z; is not supported! Only 2d tensors are currently supported.zXError original_tensor is not contiguous!Only contiguous tensors are currently supported.zError original_tensor.dtype z is not a supported dtype for !zError original_tensor.shape zS is not supported! Both dimensions must be larger or equal than and a multiple of (z, r@   N)
is_cudaRuntimeErrorr,   dimis_contiguousr-   r   r'   sparse_min_rowssparse_min_cols)r9   rq   mnmin_rowsmin_colsr<   r<   r=    _validate_device_dim_dtype_shape   s8   
 
z;SparseSemiStructuredTensor._validate_device_dim_dtype_shapec                 C   s&   | j d }t| tj|| j| jdS )Nr-   r,   )r'   r3   rg   eyer-   r,   )rF   colr<   r<   r=   to_dense  s   
z#SparseSemiStructuredTensor.to_densealg_idc                 C      t rH   rX   r9   rq   r   r<   r<   r=   
from_dense#  s   z%SparseSemiStructuredTensor.from_dense)biasBr   c                K   r   rH   r   )rF   r   r   r\   r<   r<   r=   _mm+  s   zSparseSemiStructuredTensor._mm)Fr   FrH   )r?   N)&rD   
__module____qualname____doc__r   int__annotations__dictr3   r-   r   r   boolr   r   strr   r7   rN   staticmethodSizer>   rG   tuplerL   rQ   classmethodrV   _C_disabled_torch_function_impl__torch_function__r   r]   r2   r}   r   r   r   r<   r<   r<   r=   r   *   s   
 		
T
*r   Frq   
transposedr   r?   c                 C   s8   |r
t jdtdd tjrtjjntjj}|j	| |dS )a	  
    This function converts a dense tensor into a sparse semi-structured tensor.
    It will return a SparseSemiStructuredTensor, a subclass of torch.Tensor.

    This function will check to ensure the dense tensor has the right dtype, size, dims, and device.
    We currently only support semi-structured sparse tensors for 2d CUDA tensors.
    Additionally, your tensor must be a positive multiple of the minimum sparse block size, given in
    `_DTYPE_TO_SHAPE_CONSTRAINTS` for each dtype (float32, float16, bfloat16, int8).

    Args:
        original_tensor (Tensor): the dense tensor to convert
        transposed (bool, optional): deprecated arg to be removed in another release. Do not use.
        alg_id (int, optional): the algorithm id to use for cuSPARSELt matmul. Defaults to 0.
            Can be obtained via ``torch._cslt_sparse_mm_search``.
    Returns:
        SparseSemiStructuredTensor: A sparse semi-structured tensor created from the given original_tensor
    Raises:
        None
    Example:
        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
        >>> A = torch.Tensor([0, 0, 1, 1]).tile((128, 32)).half().cuda()
        tensor([[0., 0., 1.,  ..., 0., 1., 1.],
                [0., 0., 1.,  ..., 0., 1., 1.],
                [0., 0., 1.,  ..., 0., 1., 1.],
                ...,
                [0., 0., 1.,  ..., 0., 1., 1.],
                [0., 0., 1.,  ..., 0., 1., 1.],
                [0., 0., 1.,  ..., 0., 1., 1.]], device='cuda:0', dtype=torch.float16)
        >>> A_sparse = to_sparse_semi_structured(A)
        SparseSemiStructuredTensor(shape=torch.Size([128, 128]))
        >>> A_sparse.values()
        tensor([[1., 1., 1.,  ..., 1., 1., 1.],
                [1., 1., 1.,  ..., 1., 1., 1.],
                [1., 1., 1.,  ..., 1., 1., 1.],
                ...,
                [1., 1., 1.,  ..., 1., 1., 1.],
                [1., 1., 1.,  ..., 1., 1., 1.],
                [1., 1., 1.,  ..., 1., 1., 1.]], device='cuda:0', dtype=torch.float16),
        >>> A_sparse.indices()
        tensor([[-4370, -4370, -4370,  ..., -4370, -4370, -4370],
                [-4370, -4370, -4370,  ..., -4370, -4370, -4370],
                [-4370, -4370, -4370,  ..., -4370, -4370, -4370],
                ...,
                [-4370, -4370, -4370,  ..., -4370, -4370, -4370],
                [-4370, -4370, -4370,  ..., -4370, -4370, -4370],
                [-4370, -4370, -4370,  ..., -4370, -4370, -4370]], device='cuda:0', dtype=torch.int16))
    zSetting transpose from `to_sparse_semi_structured` is deprecated and will be removed in a future release. `SparseSemiStructuredTensor` only support contiguous input tensors.r)   r*   )r   )
r/   r0   FutureWarningr   r   r3   sparser   r   r   )rq   r   r   SPARSE_SUBCLASSr<   r<   r=   r   5  s   4
r   c                       s   e Zd ZdZdZejeddddejeddddej	eddddej
eddddiZeejfd	ejd
edd fddZ fddZe	dd	ejddfddZddddejdejdB dedejfddZ  ZS )r   a  
    This class implements semi-structured sparsity for the CUTLASS backend.


    In this implementation, the specified elements and metadata are stored separately,
    in packed and meta respectively.

    When _FORCE_CUTLASS is set, or when cuSPARSELt is not available, this subclass calls into _sparse_semi_structured_(mm|addmm) and
    sparse_semi_structured_from_dense for conversion to the compressed format.
    cutlass          @         rq   r   r?   c              	   C   s0   |  | t|\}}| |j||d d d |jdS )Nr    r!   r"   r#   r$   r(   )r}   r   r'   r(   )r9   rq   r   sparse_tensor_cutlassmeta_tensor_cutlassr<   r<   r=   r     s   
z,SparseSemiStructuredTensorCUTLASS.from_densec                    s@   | j d u s
| jd u rtd| j jdkrt| j| j S t  S )Nz meta and packed must not be Noner)   )r!   r    rB   ndimr   superr   rE   rC   r<   r=   r     s   z*SparseSemiStructuredTensorCUTLASS.to_dense r   c              	   C   s2   t j||dd\}}}}}| |j|||||ddS )a~	  
        This function takes in a unpruned dense tensor and runs a (branchless) static sort across a 4x4 tile.

        It greedily picks the largest values in the tile, upholding the 2:4 sparsity constraint across both rows and columns.
        The algorithm used to prune the matrix is implemented in `_sparse_semi_structured_tile`.

        Then it creates the packed and meta tensors for the compressed sparse representation of the pruned dense tensor.
        It also calculates the packed_t and meta_t tensors for the compressed sparse representation of the transposed
        pruned dense tensor.
        Since we cannot transpose the compressed representations, we store both for the fw/bw pass respectively.

        Finally, this function also computes a compressed swizzled bitmask that encodes the sparsity pattern
        This can be used in the backward pass to mask the gradients.

        [9 1 7 4]                       [9 0 7 0]
        [1 2 3 0]                       [0 2 0 0]
        [8 3 5 4] -> prune 4x4 tile  -> [8 0 0 4] -> pack to CUTLASS semi-structured -> packed
        [1 2 6 2]                       [0 0 6 2]                                    -> metadata

                                                  -> pack to transposed CUTLASS      -> packed_t
                                                     semi-structured representation  -> metadata_t

                                                  -> compute swizzled bitmask        -> compressed_swizzled_bitmask


        The equivalent PyTorch code to create the same five outputs from the dense tensor can be found below:
        ```
        from torch.sparse import SparseSemiStructuredTensorCUTLASS
        from torch.sparse._semi_structured_conversions import (
            _sparse_semi_structured_tile,
            _compute_compressed_swizzled_bitmask,
        )

        pruned = _sparse_semi_structured_tile(dense)
        packed_cutlass, meta_cutlass = sparse_semi_structured_from_dense_cutlass(pruned)
        packed_t_cutlass, meta_t_cutlass = sparse_semi_structured_from_dense_cutlass(
            pruned.t().contiguous()
        )
        bitmask = _compute_compressed_swizzled_bitmask(pruned)

        SparseSemiStructuredTensorCUTLASS(
            dense.shape,
            packed_cutlass,
            meta_cutlass,
            packed_t_cutlass,
            meta_t_cutlass,
            bitmask,
        )
        ```
        T	algorithmuse_cutlassFr   )r3   _sparse_semi_structured_tiler'   r9   rq   r   r    r!   r"   r#   r$   r<   r<   r=   prune_dense_static_sort  s$   =z9SparseSemiStructuredTensorCUTLASS.prune_dense_static_sortNFr   should_transpose_denser   r   r   c             
   K   s   t |tr	td| jj}| jdks|jdkrtd| d| jd u s)| jd u r1td| dt	  | j
|j }tjj|| j| j|| jd |j|j|S )NZ`SparseSemiStructuredTensor @ SparseSemiStructuredTensor` is not supported by the hardwarer)   `)` matmul: Broadcasting is not implemented$` matmul: operation is not supportedr   )
isinstancer   r6   rC   rD   r   rX   r    r!   _ensure_cutlass_mm_registeredr   r-   r3   r^   semi_structured
cutlass_mmr'   dense_min_rowsdense_min_cols)rF   r   r   r   r\   cls_nameconstraintsr<   r<   r=   r     s2   


z%SparseSemiStructuredTensorCUTLASS._mmr   )rD   r   r   r   r   r3   int8r   float16bfloat16float32r   r   r   r   r7   r   r   r   r   r   r   __classcell__r<   r<   r   r=   r   |  sH    Or   c                   @   s   e Zd ZdZdZejeddddejeddddej	eddddej
eddddiZeejfdejdedd fd	d
Ze	ddejddfddZddddejdejdB dedejfddZdS )r   a  
    The cuSPARSELt backend expects the specified elements and the metadata to be stored in a single tensor:
    packed = [ specified elements of original tensor | metadata ]
    For an original tensor of size (m, k) we expect the first m * k // 2 elements to be the kept elements
    The rest of the tensor is metadata. Since there is only one tensor, we only use the packed and packed_t
    attributes respectively.

    cuSPARSELt also supports transposition fusion, which is necessary for performant 2:4 sparse training, as well
    as specifying alg_id, a config that affects the performance of the matmul depending on matmul sizes.
    
cusparseltr   r   r   rq   r   r?   c                 C   s0   |  | | |jt|d d d d tj||jd	S )NrR   )r}   r'   r3   _cslt_compressr   r   r(   r   r<   r<   r=   r   6  s   
z/SparseSemiStructuredTensorCUSPARSELT.from_denser   r   c              	   C   sV   t j||dd\}}}}}||jd d}||jd d}| |j|||||ddS )a=  
        This function does the same thing as described in SparseSemiStructuredCUTLASS, but uses the cuSPARSELt metadata
        layout and sparse matmul.

        The only functional difference is that cuSPARSELt stores `metadata` and `packed` together into a single tensor.

        [9 1 7 4]                       [9 0 7 0]
        [1 2 3 0]                       [0 2 0 0]
        [8 3 5 4] -> prune 4x4 tile  -> [8 0 0 4] -> pack to cuSPARSELT semi-structured -> packed
        [1 2 6 2]                       [0 0 6 2]

                                                  -> pack to transposed cuSPARSELt      -> packed_t
                                                     semi-structured representation

                                                  -> compute swizzled bitmask           -> compressed_swizzled_bitmask


        The equivalent PyTorch code to create the same three outputs from the dense tensor can be found below:
        ```
        from torch.sparse import SparseSemiStructuredTensorCUSPARSELT
        from torch.sparse._semi_structured_conversions import (
            _sparse_semi_structured_tile,
            _compute_compressed_swizzled_bitmask,
        )

        pruned = _sparse_semi_structured_tile(dense)
        packed_cusparselt = torch._cslt_compress(pruned)
        packed_t_cusparselt = torch._cslt_compress(pruned.t().contiguous())
        bitmask = _compute_compressed_swizzled_bitmask(pruned)

        SparseSemiStructuredTensorCUSPARSELT(
            dense.shape, packed_cutlass, None, packed_t_cutlass, None, bitmask
        )
        ```
        Fr   r   r~      r   )r3   r   rf   r'   r   r<   r<   r=   r   J  s(   -z<SparseSemiStructuredTensorCUSPARSELT.prune_dense_static_sortNFr   r   r   r   c                K   sp  t |tr	td| jdks|jdkrtd| jj d|j| jkrAtd| jj dt| j	 dt|j	 d| j d|j d	|d uri|j| jkritd| jj dt| j	 dt|j	 d
| j d|j d| jt
jkrtd| jj dt| j	 dt|j	 d| j d	| jd u rtd| jj dt  | j|j }t
jj|| j|| j	d |j|j| j| j|	S )Nr   r)   r   r   z` matmul: trying to do `A=z @ B=z`, with A.dtype=z and B.dtype=zH. This operation is only supported when A and B have the same data type.z + C`, with A.dtype=B.dtype=z and C.dtype=zK. This operation is only supported when A, B and C have the same data type.z`, with A.dtype=B.dtype=zO. mm is not supported for float8_e4m3fn, please use `torch._scaled_mm` instead.r   r   )r   r   r6   r   rX   rC   rD   r-   r   r'   r3   float8_e4m3fnr     _ensure_cusparselt_mm_registeredr   r^   r   cusparselt_mmr   r   r%   r&   )rF   r   r   r   r\   r   r<   r<   r=   r     s^   
$$$
z(SparseSemiStructuredTensorCUSPARSELT._mmr   )rD   r   r   r   r   r3   r   r   r   r   r   r   r   r   r   r7   r   r   r   r   r   r<   r<   r<   r=   r   "  sF    Dr   c                  C   s   t rdS da ddlm}  | ddddtjd	tjd
tjdtjdB dtdtdtdtdtjfdd}|jdtjd	tjd
tjdtjdB dtdtdtdtdtjfdd}dS )zLazily register the cutlass_mm custom op.

    Registration is deferred to avoid importing torch.library at module load
    time, since torch.sparse is imported early during ``import torch``.
    NTr   	custom_opzsemi_structured::cutlass_mmr<   mutates_argsdenser    r!   r   out_featuresr{   r|   r   r?   c                 S   s   | j \}}	| | }
|	 | }|
dkp|dk}| }|r'tjj| d|d|
f}|r-| n|}|d u r;t|||}nt||||}|r[|rI|n|	}|d | dd|j	tj
dS | S )Nr   r   memory_format)r'   r3   nn
functionalpadre   _sparse_semi_structured_mm_sparse_semi_structured_addmmnarrowrm   contiguous_format
contiguous)r   r    r!   r   r   r{   r|   r   ry   rz   to_pad_mto_pad_nneed_paddense_paddedmm_inputresout_colsr<   r<   r=   r     s&   




z1_ensure_cutlass_mm_registered.<locals>.cutlass_mmtranspose_densec           	      S   s.   |r| j d n| j d }tj||| j| jdS Nr   r   r   r'   r3   emptyr-   r,   )	r   r    r!   r   r   r{   r|   r   r   r<   r<   r=   _cutlass_mm_fake  s   z7_ensure_cutlass_mm_registered.<locals>._cutlass_mm_fake)_cutlass_mm_registeredtorch.libraryr   r3   r7   r   r   register_fake)r   r   r   r<   r<   r=   r     sX   
		r   c                  C   s   t rdS da ddlm}  | ddd	dd	tjd
tjdtjdB dtdtdtdtdtdtdtjfdd}|jd	tjd
tjdtjdB dtdtdtdtdtdtdtjfdd}dS )z,Lazily register the cusparselt_mm custom op.NTr   r   zsemi_structured::cusparselt_mmr<   r   Fr   r    r   r   r{   r|   fuse_transposer   r   r?   c	                 S   s   | j \}	}
|	 | }|
 | }|dkp|dk}| }|r'tjj| d|d|f}|r-| n|}tj|||||d}|r?| }|rS|rE|	n|
}|dd|jtj	dS |
 S )Nr   )r   transpose_resultr   r   r   )r'   r3   r   r   r   re   _cslt_sparse_mmr   rm   r   r   )r   r    r   r   r{   r|   r   r   r   ry   rz   r   r   r   r   r   r   r   r<   r<   r=   r     s.   


z7_ensure_cusparselt_mm_registered.<locals>.cusparselt_mmc	           
      S   s.   |r| j d n| j d }	tj||	| j| jdS r   r   )
r   r    r   r   r{   r|   r   r   r   r   r<   r<   r=   _cusparselt_mm_fake6  s   z=_ensure_cusparselt_mm_registered.<locals>._cusparselt_mm_fake)F)_cusparselt_mm_registeredr   r   r3   r7   r   r   r   )r   r   r   r<   r<   r=   r   	  sb   

	
#	
r   )'r/   collectionsr   collections.abcr   typingr   r3   )torch.sparse._semi_structured_conversionsr   r   !torch.sparse._semi_structured_opsr   r   r	   r
   r   r   r   r   r   r   r   r   r   __all__r   r7   r   r   r   r   r   r   r   r   r   r   r   r<   r<   r<   r=   <module>   sD   <  
G ' !A