o
    j9:j'                     @   s  d Z ddlZddlmZ ddlmZmZ ddlmZ ddl	Z	ddl
Z	ddlmZ ddlmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZmZmZmZmZ ddlmZ ddl m!Z! de	j"j#de$e% fddZ&de	j"j'de(e	j)e	j"j*f fddZ+de	j"j'de%de,dB fddZ-de	j"j'de%de,dB fddZ.de	j"j'de%fddZ/de	j"j'de0e,dB  fddZ1de	j"j'd ee defd!d"Z2G d#d$ d$Z3e!d"e3 d% 	&	&d.d'ed(ef d)ee d*e4d+e4ded(ee f f
d,d-Z5dS )/a  
This module implements CUDA graphs support for TorchDynamo backends.

CUDA graphs allow for capturing and replaying GPU operations, which can significantly
reduce CPU overhead in GPU-accelerated PyTorch models. This module provides:

- CUDA graph creation and management for both forward and backward passes
- Input mutation detection and handling
- Device compatibility checking
- Stack trace management for debugging
- Integration with TorchInductor's cudagraph trees

The backend supports two main modes:
1. cudagraphs: Full CUDA graph support with both forward and backward pass optimization
2. cudagraphs_inner: Lower-level CUDA graph implementation used for benchmarking

Key components:
- CudagraphsBackend: Main backend class for CUDA graph integration
- Mutation detection utilities to ensure graph safety
- Device mapping and compatibility checks
- Stack trace collection for debugging
    N)defaultdict)CallableSequence)Any)config)aot_autograd)	boxed_nop)BoxedDeviceIndex'check_multiple_devices_or_any_cpu_nodesformat_default_skip_messageget_mutation_stack_traceget_placeholder_info#log_cudagraph_skip_and_bump_counter)	BoxedBoolcount_tangents%get_first_incompatible_cudagraph_nodenum_fw_fixed_argumentsoutput_node)StorageWeakRef   )register_backendgreturnc                 C   s  dt ttf dtfdd}tt}d}t }| jD ]q}|jdkr<t||jt	j
r7|t||j  | |d7 }q|jdkrt|jd	sHq|jj}t|jD ]8\}}|t|jk rb|j| }	n|j|jvriqQ|j|j }	d
}
|jrz|jjrzd}
|
r||t||	j  O }qQq|S )Nmetar   c                 S   s   d| v r| d S | d S )Nvalfake_result )r   r   r   h/home/nk/hobo-godmode/plappi-mvp/.venv/lib/python3.10/site-packages/torch/_dynamo/backends/cudagraphs.pymeta_fk7   s   z%find_input_mutations.<locals>.meta_fkr   placeholderr   call_function_schemaFT)dictstrr   r   setnodesop
isinstancer   torchTensorr   _typed_storageaddhasattrtargetr!   	enumerate	argumentslenargsnamekwargs
alias_infois_write)r   r   inputs	input_idxmutated_inputsnschemaiargargumentmut_argr   r   r   find_input_mutations6   s:   



r?   gmc                 C   sD   i }| j jD ]}|jdd }t|tjr|j|vr|||j< q|S )Nr   )graphr%   r   getr'   r(   r)   device)r@   device_node_mappingr9   tr   r   r   get_device_node_mapping]   s   
rF   	aot_model	num_fixedc                 C   s2   t | jtt| }|sd S t| j}t||S N)r?   rA   r$   ranger   r   )rG   rH   mutation_indicesplaceholdersr   r   r   3check_for_mutation_ignore_cuda_graph_managed_tensorh   s
   

rM   c                 C   sN   t jst| | }r|S tt|  }r|S t|  }r%td|j dS d S )Nzincompatible op ())r   (cudagraph_backend_support_input_mutationrM   r
   rF   r   r   r2   )rG   rH   mut_skipskipnoder   r   r   check_for_skips   s   rS   c                 C   s$   t tt| }|jdksJ |jS )Ncuda)nextiterrF   typeindex)r@   rC   r   r   r   get_device_index   s   rY   c                 C   s@   t | }t|jdksJ |jd }t|dsg S dd |D S )Nr   r   __iter__c                 S   s&   g | ]}t |tjjjr|jnd qS rI   )r'   r(   fxrR   Nodestack_trace).0r<   r   r   r   
<listcomp>   s    z$get_stack_traces.<locals>.<listcomp>)r   r0   r1   r,   )r@   outputr1   r   r   r   get_stack_traces   s   

ra   dynamo_modeldynamo_inputsc              	      s   ddl m tdtd  	ddtjjdtt dt	dtf fd	d
}dtjjdtt dtf fdd}t
||tj|ddtjjjd}|| S )Nr   )cudagraphify_implTFrG   
aot_inputsis_inferencer   c                    s   t | |}ttt|}t| | }r#t td|  |S  t|  ||t	| j
d|t| t| jt| jd	}d|_|S )Nskipping cudagraphs due to Fdevice_indexis_backwardrf   stack_tracesrL   mutated_input_idxsT)r   r   r0   rS   r   disabler   r$   rY   rJ   valuera   r   rA   r?   _boxed_call)rG   re   rf   interpfixedskip_msgoutboxed_device_indexrd   do_cudagraphsrc   r   r   forward_cudagraphs   s,   

z&cudagraphs.<locals>.forward_cudagraphsc                    s   t  |}s	 S t }t | }rFtd|  j}|d u r$d}tjjj|ddd us3J dt	t
 dt
f fdd}d	|_|S ||t|t d	dt t jt jd
	}d	|_|S )Nrg   r   F)create_if_none_existsr6   r   c                    s       | S rI   )set_to_running_backward)r6   rG   managerr   r   fn   s   z3cudagraphs.<locals>.backward_cudagraphs.<locals>.fnTrh   )r   r   rS   r   rn   r(   	_inductorcudagraph_treesget_managerlistr   ro   rJ   rY   ra   r   rA   r?   )rG   re   rp   rq   rr   
device_idxr|   rs   )ru   rd   rv   rz   r   backward_cudagraphs   s>   
z'cudagraphs.<locals>.backward_cudagraphs)rf   )fw_compilerbw_compilerinference_compilerkeep_inference_input_mutations)F)torch._inductor.cudagraph_treesrd   r   r	   r(   r[   GraphModuler   r   boolr   	functoolspartial_dynamor   %cudagraph_backend_keep_input_mutation)rb   rc   rw   r   aot_cudagraphsr   rt   r   
cudagraphs   s6   ,
r   c                   @   s@   e Zd ZdZed
ddZedejjde	e
 de
fdd	ZdS )CudagraphsBackendr   r   Nc                  C   s   ddl m}  |   d S )Nr   reset_cudagraph_trees)r   r   r   r   r   r   reset   s   
zCudagraphsBackend.resetmodelr6   c                 C   s
   t | |S rI   )r   )r   r6   r   r   r   __call__   s   
zCudagraphsBackend.__call__)r   N)__name__
__module____qualname__compiler_namestaticmethodr   r(   r[   r   r   r   r   r   r   r   r   r      s    $r   )r2   compiler_fnTr   .r6   copy_outputscopy_inputsc                    s$  t |ttfs	J  rdd |D nt|tj  tj }|tj  tj	| | |  W d   n1 s>w   Y  |  tj | tj  tj
 tjj|d |  W d   n1 spw   Y  t ttfsfdtdtt f fdd}|S )	zBThis isn't registered as a backend, but is used in some benchmarksc                 S   s   g | ]}t |qS r   )r(   
zeros_liker^   xr   r   r   r_     s    z$cudagraphs_inner.<locals>.<listcomp>N)stream
new_inputsr   c                     sT   t t | ks
J  rt| D ]	\}}|| q  r(dd D S S )Nc                 S   s   g | ]}|  qS r   )cloner   r   r   r   r_   '  s    z1cudagraphs_inner.<locals>.run.<locals>.<listcomp>)r0   zipcopy_replay)r   dstsrcr   r   rA   static_inputsstatic_outputsr   r   run   s   zcudagraphs_inner.<locals>.run)r'   r   tupler(   rT   synchronizeStreamwait_streamcurrent_streamr   	CUDAGraphrA   r   r   )r   r6   r   r   r   r   r   r   r   cudagraphs_inner  s*   





"r   )TT)6__doc__r   collectionsr   collections.abcr   r   typingr   r(   torch.fxtorch._dynamor   torch._dynamo.backends.commonr    torch._dynamo.backends.debuggingr   torch._inductor.cudagraph_utilsr	   r
   r   r   r   r   torch._inductor.utilsr   r   r   r   r    torch.multiprocessing.reductionsr   registryr   r[   Graphr$   intr?   r   r"   rC   r\   rF   r#   rM   rS   rY   r   ra   r   r   r   r   r   r   r   r   <module>   s\     '

X
