o
    m9:jX                  $   @   s  d Z ddlZddlmZ ddlmZmZ ddlZee	Z
g dZdee dB dee fdd	Zed
ddedefddZG dd deZejjdi d							dFdejdejdejdejdejdB dededededB dee dB dedejdB dejdB d edB deejejejf fd!d"Zej							dFdejdejdejdejdejdB dededededB dee dB dedejdB dejdB d edB deejejejf fd#d$Zddd%ddddd&dejdejdejdejdejdB deded'edB dedB deeef dedejdB dejdB d edB dejeejejf B fd(d)Zejjd*d+hd							dFd+ejdejdejdejdejdejdB dededededB dee dB dedejdB dejdB d edB dejf d,d-Zej							dFd+ejdejdejdejdejdejdB dededededB dee dB dedejdB dejdB d edB dejf d.d/Zddd%ddddd&d+ejdejdejdejdejdejdB deded'edB dedB deeef dedejdB dejdB d edB dejeejejf B f d0d1Zd2ed3eed4f d5eddfd6d7Zejjd8i d		dGd9ejdejdejdejd+ejd:ejdejdejdededed;ejdedB dee dB deejejejf fd<d=Zej		dGd9ejdejdejdejd+ejd:ejdejdejdededed;ejdedB dee dB deejejejf fd>d?Z d2ed9ejd@ejdAejdeejdB d4f f
dBdCZ!ej"e!edD ej#$ej%j&j' ddEl(m)Z)m*Z*m+Z+m,Z, e*e,ej%j-j< e+e,ej%j-j< e)e,ej%j-j< dS )Hz
Variable-length attention implementation using Flash Attention.

This module provides a high-level Python interface for variable-length attention
that calls into the optimized Flash Attention kernels.
    N)	lru_cache)Any
NamedTuple)varlen_attnvarlen_attn_out
AuxRequestwindow_sizereturnc                 C   s2   | d u rddg} t | dkrtdt |  | S )N   z$window_size must have length 2, got )len
ValueError)r    r   `/home/nk/hobo-godmode/plappi-mvp/.venv/lib/python3.10/site-packages/torch/nn/attention/varlen.py_normalize_window_size   s
   r      )maxsizedevice_indexc                 C   s   dS )z;Cache device capability check to avoid repeated CUDA calls.Fr   )r   r   r   r   _should_use_cudnn   s   r   c                   @   s   e Zd ZU dZdZeed< dS )r   z
    Request which auxiliary outputs to compute from varlen_attn.

    Each field is a boolean indicating whether that auxiliary output should be computed.
    FlseN)__name__
__module____qualname____doc__r   bool__annotations__r   r   r   r   r   #   s   
 r   ztorch_attn::_varlen_attn)mutates_argsFquerykeyvaluecu_seq_qcu_seq_kmax_qmax_k	is_causalscale
enable_gqa	seqused_kblock_table
num_splitsc                 C   s*  t |	}	| jot| jj}|ratd |
rtd|dur"td|	d dks.|	d dkr2td|dus:|dur>td	tj	j
j| ||d||||d
d|d|d}|d |d |d }}}n%td tj	j
j| ||||||d|d||	d |	d |||d\}}}}}tjdtj| jd}|||fS )z
    Private custom op for variable-length attention.

    This is the internal implementation. Users should use the public varlen_attn function instead.
    #Using cuDNN backend for varlen_attnz,GQA is not supported with the cuDNN backend.Nz3num_splits is not supported with the cuDNN backend.r   r
      TcuDNN backend does not support window attention. Please use Flash Attention backend.zBseqused_k/block_table is not yet supported with the cuDNN backend.T        Fr%      -Using Flash Attention backend for varlen_attn)return_debug_maskr%   window_size_leftwindow_size_rightr'   r(   r)   r   dtypedevice)r   is_cudar   r7   indexloginfoRuntimeErrortorchopsaten_cudnn_attention_forward_flash_attention_forwardzerosuint64)r   r   r   r    r!   r"   r#   r$   r%   r   r&   r'   r(   r)   	use_cudnnresultoutputsoftmax_lse	rng_state_
rng_state_r   r   r   _varlen_attn-   sn   



rK   c                 C   s   t |	}	t| }| d}| d}tj||ftj| jd}tjjrCtj	
 }|tj	jjkrC|dd }tj|||ftj| jd}tjdtj| jd}|||fS )z
    Fake implementation for meta tensor computation and tracing.

    Based on the 3D varlen path from meta__flash_attention_forward:
    - query shape: (total, num_heads, head_dim)
    - logsumexp shape: (num_heads, total_q)
    r   r+   r5   r4   )r   r=   
empty_likesizeemptyfloatr7   versionhip_C_get_rocm_fa_preferred_backend_ROCmFABackendAOTritonrC   )r   r   r   r    r!   r"   r#   r$   r%   r   r&   r'   r(   r)   rF   total_q	num_heads	logsumexp	preferred
batch_sizerH   r   r   r   _varlen_attn_fake   s    




r[   )r
   r
   )
return_auxr%   r   r&   r'   r(   r)   r\   c                C   s   |  d}|dur| dn| d}|
s$||kr$td| d| d|
r7|| dkr7td| d| d	|	d
k}tjj| ||||||||t|	|
|||\}}}|dur_|jr_||fS |S )a  Compute variable-length attention using Flash Attention.

    This function is similar to scaled_dot_product_attention but optimized for
    variable-length sequences using cumulative sequence position tensors.

    Args:
        query (Tensor): Query tensor; shape :math:`(T_q, H_q, D)`
        key (Tensor): Key tensor; shape :math:`(T_k, H_{kv}, D)`, or
            :math:`(\text{total\_pages}, \text{page\_size}, H_{kv}, D)` when ``block_table`` is provided.
        value (Tensor): Value tensor; shape :math:`(T_k, H_{kv}, D)`, or
            :math:`(\text{total\_pages}, \text{page\_size}, H_{kv}, D)` when ``block_table`` is provided.
        cu_seq_q (Tensor): Cumulative sequence positions for queries; shape :math:`(N+1,)`
        cu_seq_k (Tensor): Cumulative sequence positions for keys/values; shape :math:`(N+1,)`
        max_q (int): Maximum query sequence length in the batch.
        max_k (int): Maximum key/value sequence length in the batch.
        return_aux (Optional[AuxRequest]): If not None and ``return_aux.lse`` is True, also returns the logsumexp tensor.
        scale (float, optional): Scaling factor for attention scores
        window_size (tuple[int, int], optional): Window size for sliding window attention as (left, right).
            Use (-1, -1) for full attention (default), (-1, 0) for causal attention,
            or (W, 0) for causal attention with sliding window of size W.
        enable_gqa (bool): If set to True, enables Grouped Query Attention (GQA)
            and allows key/value to have fewer heads than query.
            Each KV head is shared by a group of :math:`H_q / H_{kv}` query heads,
            so :math:`H_q` must be divisible by :math:`H_{kv}`.
            Default is False.
        seqused_k (Tensor, optional): Number of valid KV tokens per batch element; shape :math:`(N,)`.
            When set, only the first ``seqused_k[i]`` tokens in the key/value sequence for batch
            element *i* participate in attention. Useful for KV-cache decoding where the cache slot
            is larger than the actual sequence. Inference-only (not supported in backward).
        block_table (Tensor, optional): Block table for paged KV cache; shape
            :math:`(N, \text{max\_pages\_per\_seq})`, dtype ``int32``.
            Requires ``seqused_k``. Inference-only (not supported in backward).

            When ``block_table`` is provided, ``key`` and ``value`` are a "pool" of
            pages of tokens of KV data and the pages belong to any sequence/order.
            The ``block_table`` is what maps each sequence's logical chunks
            back to physical pages in this pool.

            ``seqused_k[i]`` tells the kernel how many tokens in sequence *i* are
            actually valid, since the last page is typically only partially filled.
        num_splits (int, optional): Number of splits for split-KV. Set to ``1``
            to disable split-KV which enables batch invariance. Split-KV
            parallelizes the key/value sequence dimension across multiple thread
            blocks and combines partial results. The split decision depends
            on ``max_k`` (the longest sequence in the batch), so different batch
            compositions can change the reduction order and produce different
            floating-point results for the same sequence. When this is disabled,
            bitwise identical outputs are guaranteed for a given sequence
            regardless of what other sequences are in the batch, at the
            cost of lower GPU utilization when there are few queries. When
            ``None`` (default), the kernel chooses automatically.

    Returns:
        output (Tensor): Output tensor from attention computation; shape :math:`(T_q, H_q, D)`.

        If ``return_aux`` is not None and ``return_aux.lse`` is True:
            lse (Tensor): Log-sum-exp of attention scores; shape :math:`(T_q, H_q)`.

    Shape legend:
        - :math:`N`: Batch size
        - :math:`T_q`: Total number of query tokens in the batch (sum of all query sequence lengths)
        - :math:`T_k`: Total number of key/value tokens in the batch (sum of all key/value sequence lengths)
        - :math:`H_q`: Number of query attention heads
        - :math:`H_{kv}`: Number of key/value attention heads (equal to :math:`H_q` unless GQA is enabled)
        - :math:`D`: Head dimension

    Example::

        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
        >>> batch_size, max_seq_len, embed_dim, num_heads = 2, 512, 1024, 16
        >>> head_dim = embed_dim // num_heads
        >>> seq_lengths = []
        >>> for _ in range(batch_size):
        ...     length = torch.randint(1, max_seq_len // 64 + 1, (1,)).item() * 64
        ...     seq_lengths.append(min(length, max_seq_len))
        >>> seq_lengths = torch.tensor(seq_lengths, device="cuda")
        >>> total_tokens = seq_lengths.sum().item()
        >>>
        >>> # Create packed query, key, value tensors
        >>> query = torch.randn(
        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
        ... )
        >>> key = torch.randn(
        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
        ... )
        >>> value = torch.randn(
        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
        ... )
        >>>
        >>> # Build cumulative sequence tensor
        >>> cu_seq = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32)
        >>> cu_seq[1:] = seq_lengths.cumsum(0)
        >>> max_len = seq_lengths.max().item()
        >>>
        >>> # Call varlen_attn
        >>> output = varlen_attn(
        ...     query, key, value, cu_seq, cu_seq, max_len, max_len
        ... )
    r+   Nr   GExpect query and key/value to have the same number of heads but got Hq=	 and Hkv=&. Try setting enable_gqa=True for GQA.r   MExpect number of query heads to be a multiple of kv heads for GQA but got Hq=.r
   r   )rM   r   r=   r>   
torch_attnrK   listr   )r   r   r   r    r!   r"   r#   r\   r%   r   r&   r'   r(   r)   num_heads_qnum_heads_kr$   outr   rI   r   r   r   r      sL   
u
r   ztorch_attn::_varlen_attn_outrg   c                 C   sl   t |
}
|jot|jj}|rtdtd tj	j
j| |||||||d|d|	|
d |
d |||d}|S )z
    Private custom op for variable-length attention with pre-allocated output.
    Same as _varlen_attn but writes the attention output into the provided out tensor.
    z+cuDNN backend does not support out variant.z1Using Flash Attention backend for varlen_attn_outr-   Fr   r+   )r%   r2   r3   r'   r(   r)   )r   r8   r   r7   r9   r<   r:   r;   r=   r>   r?   +_flash_attention_forward_no_dropout_inplace)rg   r   r   r   r    r!   r"   r#   r$   r%   r   r&   r'   r(   r)   rD   rG   r   r   r   _varlen_attn_outR  s2   
ri   c                 C   sx   | d}| d}tj||ftj|jd}tjjr:tj }|tjj	j
kr:| dd }tj|||ftj|jd}|S )F
    Fake implementation for meta tensor computation and tracing.
    r   r+   r5   )rM   r=   rN   rO   r7   rP   rQ   rR   rS   rT   rU   )rg   r   r   r   r    r!   r"   r#   r$   r%   r   r&   r'   r(   r)   rV   rW   rX   rY   rZ   r   r   r   _varlen_attn_out_fake  s   


rk   c                C   s   | d}|dur| dn| d}|s$||kr$td| d| d|r7|| dkr7td| d| d	|
d
k}tjj| |||||||||	t|
||||}|dur]|jr]| |fS | S )zCompute variable-length attention using Flash Attention with a pre-allocated output tensor.

    Same as :func:`varlen_attn` but writes the attention output into the provided ``out`` tensor
    instead of allocating a new one.

    r+   Nr   r]   r^   r_   r   r`   ra   rb   )rM   r   r=   r>   rc   ri   rd   r   )rg   r   r   r   r    r!   r"   r#   r\   r%   r   r&   r'   r(   r)   re   rf   r$   r   r   r   r   r     sN   
r   ctxinputs.rF   c                 C   s   |\}}}}}}}	}
}}}}}}|\}}}|d urt d|d ur%t d| |||||||| || _|	| _|
| _|| _|| _d S )Nz)seqused_k is an inference-only parameter.z+block_table is an inference-only parameter.)r<   save_for_backwardr"   r#   r$   r%   r   )rl   rm   rF   r   r   r   r    r!   r"   r#   r$   r%   r   r&   r'   r(   r)   rg   r   rH   r   r   r   _setup_context  s6   

ro   z!torch_attn::_varlen_attn_backwardgrad_outr   rH   c                 C   s   t |}tjd|jd}|jot|jj}|rFtd |d dks(|d dkr,t	dtj
jj| |||||||||	d|
|||d\}}}n$td	 tj
jj| |||||||||	d|
||||d |d d
\}}}|||fS )Nr   )r7   r*   r
   r+   r,   r-   r.   r0   )r%   r2   r3   )r   r=   rN   r7   r8   r   r9   r:   r;   r<   r>   r?   _cudnn_attention_backward_flash_attention_backward)rp   r   r   r   rg   r   r    r!   r"   r#   r$   rH   r%   r   unusedrD   dqdkdvr   r   r   _varlen_attn_backward  s^   


rw   c                 C   s0   t |}t|}t|}t|}|||fS )rj   )r   r=   rL   )rp   r   r   r   rg   r   r    r!   r"   r#   r$   rH   r%   r   
grad_querygrad_key
grad_valuer   r   r   _varlen_attn_backward_fakeQ  s
   



r{   grad_lsegrad_rngc                 C   sz   | j \}}}}}}	}
}| j}| j}| j}| j}| j}tjj	|||||	|
||||||||\}}}d}|||gd| R S )N   )N)
saved_tensorsr"   r#   r$   r%   r   r=   r>   rc   rw   )rl   rp   r|   r}   r   r   r   r    r!   rg   r   rH   r"   r#   r$   r%   r   rt   ru   rv   
num_paramsr   r   r   	_backwardn  s0   
r   )setup_context)_varlen_attn_backward_flop_varlen_attn_forward_flop_varlen_attn_out_flopflop_registry)FNNFNNN)NN).r   logging	functoolsr   typingr   r   r=   	getLoggerr   r:   __all__rd   intr   r   r   r   library	custom_opTensorrO   tuplerK   register_faker[   r   ri   rk   r   ro   rw   r{   r   register_autograd_dynamodisallow_in_graphr>   r?   rh   torch.utils.flop_counterr   r   r   r   rc   r   r   r   r   <module>   s   
	
		

Y		

:	


 
	

5
	

/


"=!	

D	


!