o
    m9:jR                     @  s  U d Z ddlmZ ddlZddlZddlmZ erddlmZ ddl	m
Z
 ddlmZ ddlmZmZ ddlZdd	lmZ d
dlmZ dgZdaded< daded< e
G dd dZedyddZ	dzd{ddZd|ddZd}d d!Zd~d.d/Zdd6d7Zdd>d?Z ed@Z!ddCdDZ"ddFdGZ#						dddRdSZ$	TdddYdZZ%			ddd[d[ddddd\dd]	dd`daZ&dd[d[dddddbddcddZ'dd[d[ddddddeddfdgZ(ddddhddkdlZ)				m	T	TdddnddodpZ*	m	T	TdddnddqdrZ+ddnddudvZ,ej-dwedx dS )z
PROTOTYPE!
Flash Attention 3 implementation.
For fp8: only supports forward pass right now.
For fp16/bf16: supports forward and backward pass.
    )annotationsN)TYPE_CHECKING)Callable)	dataclass)cache)TypeVarTupleUnpack)Library   )	_registryregister_flash_attention_fa3zCallable | None_FA3_CUDA_FWD_FA3_CUDA_BWDc                   @  s    e Zd ZU ded< dddZdS )	
_FA3HandlezLibrary | NonelibraryreturnNonec                 C  s   d | _ tjd d S )NF)r   torch_C_set_sdp_use_fa3)self r   ^/home/nk/hobo-godmode/plappi-mvp/.venv/lib/python3.10/site-packages/torch/nn/attention/_fa3.pyremove*   s   z_FA3Handle.removeN)r   r   )__name__
__module____qualname____annotations__r   r   r   r   r   r   &   s   
 r   devicetorch.devicer   intc                 C  s   t j| \}}|S N)r   cudaget_device_capability)r   major_r   r   r   _get_device_major0   s   r&   flash_attn_interfacemodule_pathstrc                 C  s   t |  tjd tt S )z
    Register FA3 flash attention kernels with the PyTorch dispatcher.

    Args:
        module_path: Python module path to the FA3 implementation.
    T)_fa3_import_moduler   r   r   r   _fa3_register_kernelsr(   r   r   r   r   6   s   	
r   c                 C  sz   t |  ttjdstd|  dttjjds"td|  dttjjds1td|  dtjjjatjjj	a
d S )Nflash_attn_3zModule 'z' does not expose FA3 kernelsfwdz%' does not expose FA3 forward kernelsbwdz&' does not expose FA3 backward kernels)	importlibimport_modulehasattrr   opsRuntimeErrorr-   r.   r   r/   r   r,   r   r   r   r*   G   s   



r*   r	   c                  C  sr   t ddd} | dtd | dtd | dtd | dtd | dtd | d	td | d
td | S )NatenIMPLCUDAz"_flash_attention_forward.quantizedz-_scaled_dot_product_flash_attention.quantized_flash_attention_forward+_flash_attention_forward_no_dropout_inplace#_scaled_dot_product_flash_attention_flash_attention_backward,_scaled_dot_product_flash_attention_backward)	r	   impl!_fa3_flash_attention_forward_impl4_fa3_scaled_dot_product_flash_attention_forward_impl)_fa3_flash_attention_forward_impl_default4_fa3_flash_attention_forward_no_dropout_inplace_impl<_fa3_scaled_dot_product_flash_attention_forward_impl_default"_fa3_flash_attention_backward_impl5_fa3_scaled_dot_product_flash_attention_backward_impl)libr   r   r   r+   X   s:   r+   querytorch.Tensortensorstuple[torch.Tensor, ...]	dropout_pfloat	cum_seq_qtorch.Tensor | None	q_descale	k_descale	v_descale
str | Nonec                 C  s   |dkrdS t dd |D sdS tdd |D dkrd	S | jtjkr6|d u s0|d u s0|d u r6td
t |d u rB|  dkrBdS |d urN|  dkrNdS tj	
 sUdS t| jdkr^dS d S )N        zdropout_p must be 0c                 s  s    | ]}|j V  qd S r!   )is_cuda.0tr   r   r   	<genexpr>   s    z,_fa3_common_support_error.<locals>.<genexpr>zinputs must be CUDA tensorsc                 S     h | ]}|j qS r   )r   rT   r   r   r   	<setcomp>       z,_fa3_common_support_error.<locals>.<setcomp>r
   inputs must share devicezWhen using SDPA with fp8, descale tensor should always be used for accurate dequantization. Please use _scaled_dot_product_attention_quantized and provide the descale tensors.   zdense query must be 4D   zragged query must be 3DzCUDA not available	   z#FA3 requires compute capability 9.0)alllendtyper   float8_e4m3fnwarningswarnUserWarningdimr"   is_availabler&   r   )rF   rH   rJ   rL   rN   rO   rP   r   r   r   _fa3_common_support_errory   s*   	
rh   keyvaluereturn_debug_maskboolalibi_slopes	seqused_kc                   s   |rdS |d ur
dS |d ur|j tjkrdS |jsdS tjtjtjf t fdd| ||hD s6d  S tdd	 | ||hD d
krFdS t	| | ||f||||	|
}|d ur_|dkr]dS |S d S )Nzreturn_debug_mask must be Falsezalibi_slopes not supportedzseqused_k must be int32zseqused_k must be CUDAc                 3      | ]}|j  v V  qd S r!   ra   rT   supported_dtypesr   r   rW          z-_fa3_forward_support_error.<locals>.<genexpr>inputs must be one of c                 S  rX   r   rp   rT   r   r   r   rY      rZ   z-_fa3_forward_support_error.<locals>.<setcomp>r
   #all inputs must have the same dtyper[   z(query, key, value must be on same device)
ra   r   int32rS   rb   float16bfloat16r_   r`   rh   )rF   ri   rj   rJ   rk   rm   rn   rL   rN   rO   rP   errorr   rq   r   _fa3_forward_support_error   s8   
	rz   grad_outout	logsumexpwindow_size_left
int | Nonewindow_size_rightc
                   s   |j tjkr		 dS |j tjkrdS tjtjf t fdd| ||||hD s,d  S tdd | ||||hD dkr>d	S t|| |||||f||d d d }
|
d urT|
S d S )
NzHFA3 backward does not support fp8 - use inference only (torch.no_grad())zlogsumexp dtype must be float32c                 3  ro   r!   rp   rT   rq   r   r   rW      rs   z._fa3_backward_support_error.<locals>.<genexpr>rt   c                 S  rX   r   rp   rT   r   r   r   rY      rZ   z._fa3_backward_support_error.<locals>.<setcomp>r
   ru   )	ra   r   rb   float32rw   rx   r_   r`   rh   )r{   rF   ri   rj   r|   r}   rJ   rL   r~   r   ry   r   rq   r   _fa3_backward_support_error   s,    
 	r   Ts
Unpack[Ts]tuple[Unpack[Ts]]c                  G  s   t dd | D S )Nc                 s  s    | ]	}| d dV  qdS )r
      N)	transposerT   r   r   r   rW      s    z#_transpose_dense.<locals>.<genexpr>)tuple)rH   r   r   r   _transpose_dense   s   r   xc                 C  s"   | dur|  ddkr|  S | S )z2Ensure tensor is contiguous in the last dimension.Nr
   )stride
contiguous)r   r   r   r   _maybe_contiguous   s   "r   cu_seq_qcu_seq_kmax_qmax_kscalefloat | None	is_causalblock_table
num_splits!tuple[torch.Tensor, torch.Tensor]c                 C  sP  t du rtdt| }t|}|jtjkr(|ddkr(|ddkr(| nt|}t|}t|}t|}t|}t g |||ddd|||dd||||ddddd||||||	durv|	nd|
dur~|
nddddd|pt rdnddtj	
 pdR  \}}}}|| fS )	zF
    Run the FA3 forward pass by calling the C++ kernel directly.
    NFA3 not registeredr   r
   r   rR   T)r   r4   r   ra   r   rb   r   r   $are_deterministic_algorithms_enabledr   _get_sm_carveout_experimental)rF   ri   rj   r   r   r   r   r   r   r~   r   rn   r|   rN   rO   rP   r   r   qkvcu_seqlens_qcu_seqlens_ksoftmax_lse	out_accumsoftmax_lse_accumr   r   r   _fa3_run_forward   s   	
 "#%r   Fmax_seqlen_qmax_seqlen_kdeterministic/tuple[torch.Tensor, torch.Tensor, torch.Tensor]c                 C  s   t d u rtdt| }|ddkr| n|}|ddkr$| n|}|ddkr1| n|}t|}t|}t|}t|}t|}t |||||||||||d d ||	|
|||d|tj pfd |||fS )Nr   r   r
   rR   r   )	r   r4   r   r   r   r   
empty_liker   r   )r{   rF   ri   rj   r|   r}   r   r   r   r   r   r   r~   r   r   doutr   r   r   olsedqdkdvr   r   r   _fa3_run_backwardL  sH   



r   r   T	r   r~   r   rn   rm   r|   r   compute_auxiliaryr   	cum_seq_kr   c       	         C  s   t | ||||	||||
||}|d urtd| t| |||||||||||||
||||\}}|rQtjdtj| jd}tjdtj| jd}tjd| j| jd}nd }d }d }|||||fS )Nz)FA3 flash_attention forward unsupported: )r   )ra   r   r   r   )	rz   r4   r   r   zerosuint64r   emptyra   )rF   ri   rj   rL   r   r   r   rJ   r   rk   rN   rO   rP   r   r~   r   rn   rm   r|   r   r   r   ry   r   	rng_statephilox_offset
debug_maskr   r   r   r>     sV   r>   )r   r~   r   rn   rm   r   r   c                C  sF   t |||||||||	|
d d d f|||||| |d|d	\}}}}}|S )NFr   r>   )r|   rF   ri   rj   rL   r   r   r   rJ   r   rk   r   r~   r   rn   rm   r   r   r%   r   r   r   r   rA     s4   rA   )r   r~   r   rn   rm   r   r|   r   c
                C  s2   t | |||||||||	d d d |
|||||||dS )N)r   r~   r   rn   rm   r|   r   r   r   )rF   ri   rj   rL   r   r   r   rJ   r   rk   r   r~   r   rn   rm   r   r|   r   r   r   r   r@     s.   r@   )r   r~   r   r   unusedc                C  s   t | ||||||
|||
}|durtd| t }t| |||||||||	|||dur/|nd|dur6|nd|\}}}|||fS )z0FA3 implementation of _flash_attention_backward.Nz*FA3 flash_attention backward unsupported: r   )r   r4   r   r   r   )r{   rF   ri   rj   r|   r}   rL   r   r   r   rJ   r   r   r   r   r~   r   ry   r   r   r   r   r   r   r   rC   ,  sB   

rC   rR   r   c	                C  s   t | ||||d d d |||}
|
d urtd|
 t| ||\}}}| jtjkr+tjn| j}tj| |d}|dd}|	d}|	d}t
|||d d ||||||	||||d\}}}}}| 	d}|	d}||d d |||||f	S )NzFA3 SDPA forward unsupported: rp   r
   r   )r   r|   rN   rO   rP   )rz   r4   r   ra   r   rb   rx   r   r   sizer>   )rF   ri   rj   rN   rO   rP   rJ   r   rk   r   ry   r   r   r   	out_dtypeout_bhsdout_bshdmax_q_flashmax_k_flashr%   r   r   r   r   r   r   r   r   r   r?   g  sd   



r?   c                C  s   t | ||d d d ||||d
S )Nr   )r?   )rF   ri   rj   rJ   r   rk   r   r   r   r   rB     s   
rB   philox_seedr   c                C  s   t | ||||||
ddd
}|durtd| t| ||||\}}}}}t||||||dd||	|
||||d\}}}t|||\}}}|||fS )zCFA3 implementation of _scaled_dot_product_flash_attention_backward.NzFA3 SDPA backward unsupported: r   )r   r4   r   rC   )r{   rF   ri   rj   r|   r}   rL   r   r   r   rJ   r   r   r   r   ry   
grad_out_tq_tk_tv_tout_tr   r   r   dq_outdk_outdv_outr   r   r   rD     s6   

rD   FA3)register_fn)r   r   r   r    )r'   )r(   r)   r   r   )r(   r)   r   r   )r   r	   )rF   rG   rH   rI   rJ   rK   rL   rM   rN   rM   rO   rM   rP   rM   r   rQ   )rF   rG   ri   rG   rj   rG   rJ   rK   rk   rl   rm   rM   rn   rM   rL   rM   rN   rM   rO   rM   rP   rM   r   rQ   )r{   rG   rF   rG   ri   rG   rj   rG   r|   rG   r}   rG   rJ   rK   rL   rM   r~   r   r   r   r   rQ   )rH   r   r   r   )r   rM   r   rM   )NNNNNN)&rF   rG   ri   rG   rj   rG   r   rM   r   rM   r   r    r   r    r   r   r   rl   r~   r   r   r   rn   rM   r|   rM   rN   rM   rO   rM   rP   rM   r   rM   r   r   r   r   )F) r{   rG   rF   rG   ri   rG   rj   rG   r|   rG   r}   rG   r   rM   r   rM   r   r   r   r   r   r   r   rl   r~   r    r   r    r   rl   r   r   )NNN),rF   rG   ri   rG   rj   rG   rL   rM   r   rM   r   r    r   r    rJ   rK   r   rl   rk   rl   rN   rM   rO   rM   rP   rM   r   r   r~   r    r   r    rn   rM   rm   rM   r|   rM   r   rM   r   rl   r   r   )$r|   rG   rF   rG   ri   rG   rj   rG   rL   rM   r   rM   r   r    r   r    rJ   rK   r   rl   rk   rl   r   r   r~   r    r   r    rn   rM   rm   rM   r   rM   r   r   )$rF   rG   ri   rG   rj   rG   rL   rM   r   rM   r   r    r   r    rJ   rK   r   rl   rk   rl   r   r   r~   r    r   r    rn   rM   rm   rM   r   rM   r|   rM   r   r   )"r{   rG   rF   rG   ri   rG   rj   rG   r|   rG   r}   rG   rL   rM   r   rM   r   r    r   r    rJ   rK   r   rl   r   rG   r   rG   r   r   r~   r   r   r   )NNNrR   FF)rF   rG   ri   rG   rj   rG   rN   rM   rO   rM   rP   rM   rJ   rK   r   rl   rk   rl   r   r   )rR   FF)rF   rG   ri   rG   rj   rG   rJ   rK   r   rl   rk   rl   r   r   )r{   rG   rF   rG   ri   rG   rj   rG   r|   rG   r}   rG   rL   rM   r   rM   r   r    r   r    rJ   rK   r   rl   r   rG   r   rG   r   r   ).__doc__
__future__r   r0   rc   typingr   collections.abcr   dataclassesr   	functoolsr   typing_extensionsr   r   r   torch.libraryr	    r   __all__r   r   r   r   r&   r   r*   r+   rh   rz   r   r   r   r   r   r   r>   rA   r@   rC   r?   rB   rD   register_flash_attention_implr   r   r   r   <module>   s    	


!
%
+&

`FT<>?K(5