o
    m9:jA9                  
   @  s  U d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	m
Z
 ddlmZmZ dd	lmZ e
r7dd
lmZ ddlZddlmZ dgZdaded< eG dd dZedoddZ	dpdqddZedrddZdsdd Z	!dtdud*d+Z		dvdwd7d8Zdxd<d=Zdyd?d@Z edAZ!dzdDdEZ"			d{d|dPdQZ#	Rd}d~dUdVZ$ddddddddWddX	dd[d\Z%dddddddd]dd^d_Z&dddd`ddcddZ'	e	R	RdddfddgdhZ(ddfddkdlZ)ej*dmedn dS )zUBER PROTOTYPE!!!    )annotationsN)	dataclass)cache)AnyTYPE_CHECKING)TypeVarTupleUnpack   )	_registry)
ModuleType)Libraryregister_flash_attention_fa4
str | None_FA4_MODULE_PATHc                   @  s    e Zd ZU ded< dddZdS )	
_FA4HandlezLibrary | NonelibraryreturnNonec                 C  s
   d | _ d S N)r   )self r   ^/home/nk/hobo-godmode/plappi-mvp/.venv/lib/python3.10/site-packages/torch/nn/attention/_fa4.pyremove"   s   
z_FA4Handle.removeN)r   r   )__name__
__module____qualname____annotations__r   r   r   r   r   r      s   
 r   devicetorch.devicer   intc                 C  s   t j| \}}|S r   )torchcudaget_device_capability)r   major_r   r   r   _get_device_major&   s   r%   flash_attn.cute.interfacemodule_pathstrc                 C  s   t | }| att S )z
    Register FA4 flash attention kernels with the PyTorch dispatcher.

    Args:
        module_path: Python module path to the FA4 implementation.
    )_fa4_import_moduler   r   _fa4_register_kernels)r'   r$   r   r   r   r   ,   s   

r   c                 C  s2   t | }t|drt|dstd|  d|S )N_flash_attn_fwd_flash_attn_bwdzModule 'z' does not expose FA4 kernels)	importlibimport_modulehasattrRuntimeError)r'   moduler   r   r   r)   ;   s   
r)   r   c                  C  sV   t ddd} | dtd | dtd | dtd | dtd | dtd | S )	NatenIMPLCUDA_flash_attention_forward+_flash_attention_forward_no_dropout_inplace_flash_attention_backward#_scaled_dot_product_flash_attention,_scaled_dot_product_flash_attention_backward)r   impl!_fa4_flash_attention_forward_impl4_fa4_flash_attention_forward_no_dropout_inplace_impl"_fa4_flash_attention_backward_impl4_fa4_scaled_dot_product_flash_attention_forward_impl5_fa4_scaled_dot_product_flash_attention_backward_impl)libr   r   r   r*   C   s&   r*   r   querytorch.Tensortensorstuple[torch.Tensor, ...]	cum_seq_qtorch.Tensor | Nonerequire_fp32$tuple[tuple[str, torch.Tensor], ...]c                 C  s   t dd |D sdS tdd |D dkrdS | jtjtjfvr#dS |D ]\}}|jtjkr6| d	  S q%|d u rC|  d
krCdS |d urO|  dkrOdS tj	 sVdS t
| jdvr_dS d S )Nc                 s  s    | ]}|j V  qd S r   )is_cuda.0tr   r   r   	<genexpr>_   s    z,_fa4_common_support_error.<locals>.<genexpr>zinputs must be CUDA tensorsc                 S  s   h | ]}|j qS r   )r   rJ   r   r   r   	<setcomp>a   s    z,_fa4_common_support_error.<locals>.<setcomp>r	   inputs must share devicez'query dtype must be float16 or bfloat16z dtype must be float32   zdense query must be 4D   zragged query must be 3DzCUDA not available)	   
   z+FA4 requires compute capability 9.0 or 10.0)alllendtyper    float16bfloat16float32dimr!   is_availabler%   r   )rA   rC   rE   rG   nametensorr   r   r   _fa4_common_support_errorY   s&   
r^   keyvalue	dropout_pfloatreturn_debug_maskboolalibi_slopes	seqused_kblock_table
num_splits
int | Nonec
                 C  s   |dkrdS |r
dS |d urdS |d ur!|j tjkrdS |js!dS t| j}
|d ur4|
dkr4d|
 d	S |	d urF|	d
krF|
dkrFd|
 d	S t| | ||f|}|d ur[|dkrYdS |S d S )N        dropout_p must be 0zreturn_debug_mask must be Falsezalibi_slopes not supportedzseqused_k must be int32zseqused_k must be CUDArS   z+paged KV (block_table) not supported on SM 0r	   z-SplitKV (num_splits > 1) not supported on SM rO   z(query, key, value must be on same device)rV   r    int32rI   r%   r   r^   )rA   r_   r`   ra   rc   re   rf   rE   rg   rh   r#   errorr   r   r   _fa4_forward_support_errors   s4   
ro   grad_outout	logsumexpc           	      C  s>   |dkrdS t || |||||f|d|ffd}|d ur|S d S )Nrj   rk   rr   )rG   )r^   )	rp   rA   r_   r`   rq   rr   ra   rE   rn   r   r   r   _fa4_backward_support_error   s   
rs   valc                 C  s   | dkrdS | S )z"need to convert -1 to None for FA4Nr   )rt   r   r   r   _aten_to_fa4_window_size   s   rv   Ts
Unpack[Ts]tuple[Unpack[Ts]]c                  G  s   t dd | D S )Nc                 s  s    | ]	}| d dV  qdS )r	      N)	transposerJ   r   r   r   rM      s    z#_transpose_dense.<locals>.<genexpr>)tuple)rC   r   r   r   _transpose_dense   s   r}   cu_seq_qcu_seq_kmax_qmax_kscalefloat | None	is_causalwindow_size_leftwindow_size_right!tuple[torch.Tensor, torch.Tensor]c                 C  sz   t d u rtdtt }||t|	t|
d|||||d ur!| nd ||p&d|d}|j| ||fi |\}}|| fS )NFA4 not registeredTr	   )softmax_scalecausalr   r   
return_lsecu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_krf   
page_tablerh   rq   )r   r0   r)   rv   
contiguousr+   )rA   r_   r`   r~   r   r   r   r   r   r   r   rf   rq   rg   rh   r1   kwargslser   r   r   _fa4_run_forward   s&   r   Fdeterministic/tuple[torch.Tensor, torch.Tensor, torch.Tensor]c                 C  sX   t d u rtdtt }|j||||| | ||	t|
t||||d\}}}|||fS )Nr   )r   r   r   r   r   r   r   )r   r0   r)   r,   r   rv   )rp   rA   r_   r`   rq   rr   r~   r   r   r   r   r   r   r1   dqdkdvr   r   r   _fa4_run_backward   s&   
r   T	r   r   r   rf   re   rq   rg   compute_auxiliaryrh   	cum_seq_kr   c
       	         C  s   t | ||||	|||||
}|d urtd| t| |||||||
|||||||\}}|rMtjdtj| jd}tjdtj| jd}tjd| j| jd}nd }d }d }|||||fS )Nz)FA4 flash_attention forward unsupported: )rz   )rV   r   r   r   )	ro   r0   r   r    zerosuint64r   emptyrV   )rA   r_   r`   rE   r   r   r   ra   r   rc   r   r   r   rf   re   rq   rg   r   rh   rn   r   	rng_statephilox_offset
debug_maskr   r   r   r;   
  sN   r;   )r   r   r   rf   re   rg   rh   c                C  s<   t |||||||||	|
|||||| |d|d\}}}}}|S )NFr   )r;   )rq   rA   r_   r`   rE   r   r   r   ra   r   rc   r   r   r   rf   re   rg   rh   r$   r   r   r   r   r<   J  s,   r<   )r   r   r   r   unusedc                C  sd   t | ||||||
|}|d urtd| t }t| ||||||||||||\}}}|||fS )Nz*FA4 flash_attention backward unsupported: )rs   r0   r    $are_deterministic_algorithms_enabledr   )rp   rA   r_   r`   rq   rr   rE   r   r   r   ra   r   r   r   r   r   r   rn   r   r   r   r   r   r   r   r=   w  s:   


r=   rj   r   c                C  s   t | ||||d d d }|d urtd| t| ||\}}	}
t| }|dd}|d}|	d}t||	|
d d |||||||d\}}}}}| d}|d}||d d |||||f	S )NzFA4 SDPA forward unsupported: r	   rz   )r   rq   )ro   r0   r}   r    
empty_liker{   sizer;   )rA   r_   r`   ra   r   rc   r   rn   qkvout_bhsdout_bshdmax_q_flashmax_k_flashr$   r   r   r   r   r   r   r   r   r   r>     sV   






r>   philox_seedr   c                C  s   t | ||||||
d }|d urtd| t||||| \}}}}}|d}|d}	t||||||d d ||	|
||||d\}}}t|||\}}}|||fS )NzFA4 SDPA backward unsupported: rz   r   )rs   r0   r}   r   r=   )rp   rA   r_   r`   rq   rr   rE   r   r   r   ra   r   r   r   r   rn   r   r   r   ogor   r   r   r   r   r   r?     sD   



r?   FA4)register_fn)r   r   r   r   )r&   )r'   r(   r   r   )r'   r(   r   r   )r   r   )r   )
rA   rB   rC   rD   rE   rF   rG   rH   r   r   )NN)rA   rB   r_   rB   r`   rB   ra   rb   rc   rd   re   rF   rf   rF   rE   rF   rg   rF   rh   ri   r   r   )rp   rB   rA   rB   r_   rB   r`   rB   rq   rB   rr   rB   ra   rb   rE   rF   r   r   )rt   ri   r   ri   )rC   rx   r   ry   )NNN) rA   rB   r_   rB   r`   rB   r~   rF   r   rF   r   ri   r   ri   r   r   r   rd   r   ri   r   ri   rf   rF   rq   rF   rg   rF   rh   ri   r   r   )F)rp   rB   rA   rB   r_   rB   r`   rB   rq   rB   rr   rB   r~   rF   r   rF   r   r   r   rd   r   ri   r   ri   r   rd   r   r   )&rA   rB   r_   rB   r`   rB   rE   rF   r   rF   r   r   r   r   ra   rb   r   rd   rc   rd   r   r   r   ri   r   ri   rf   rF   re   rF   rq   rF   rg   rF   r   rd   rh   ri   )$rq   rB   rA   rB   r_   rB   r`   rB   rE   rF   r   rF   r   r   r   r   ra   rb   r   rd   rc   rd   r   r   r   ri   r   ri   rf   rF   re   rF   rg   rF   rh   ri   )"rp   rB   rA   rB   r_   rB   r`   rB   rq   rB   rr   rB   rE   rF   r   rF   r   r   r   r   ra   rb   r   rd   r   rB   r   rB   r   r   r   ri   r   ri   )rj   FF)rA   rB   r_   rB   r`   rB   ra   rb   r   rd   rc   rd   r   r   )rp   rB   rA   rB   r_   rB   r`   rB   rq   rB   rr   rB   rE   rF   r   rF   r   r   r   r   ra   rb   r   rd   r   rB   r   rB   r   r   )+__doc__
__future__r   r-   dataclassesr   	functoolsr   typingr   r   typing_extensionsr   r    r
   typesr   r    torch.libraryr   __all__r   r   r   r%   r   r)   r*   r^   ro   rs   rv   rw   r}   r   r   r;   r<   r=   r>   r?   register_flash_attention_implr   r   r   r   <module>   s    
#
(

50M=7M6