o
    j9:j_                    @  s  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlmZmZ d dlmZmZ d dlmZmZmZmZmZ d dlmZ d dlmZ dd	lmZm Z  erd d
l!m"Z"m#Z#m$Z$ d dl%m&Z& d dl'm(Z( ddl)m*Z* d dl+Z+d dl,Z,d dl-Z,d dl.m/  m0Z1 d dl2m3Z3m4Z4 d dl5m6Z6 d dl7m8Z8m9Z9 d dl:m;Z; d dl<m=Z=m>Z> d dl?m@Z@ d dlAmBZB d dlCmDZDmEZEmFZF d dlGmHZH ddlImJZJmKZKmLZLmMZMmZmNZN ddlOmPZP ddlQmRZRmSZSmTZT ddlUmVZVmWZW ddlMmXZXmYZYmZZZm[Z[ ddl\m]Z]m^Z^ ddl_m`Z` ddlmaZambZbmcZcmdZdmeZemfZf ddlgmhZh dd limjZjmkZk dd!llmmZmmnZn dd"lompZpmqZq dd#lrmsZs dd$l/mtZtmuZumvZvmwZwmxZxmyZymzZzm{Z{m|Z|m}Z}m~Z~mZmZmZmZmZmZmZ dd%lmZ eeZe,jed&Ze,jed'Ze,jed(Ze,jed)Zed* Zd+ed,< ed-Zed.ZejG d/d0 d0ZejG d1d2 d2ZG d3d4 d4ZejG d5d6 d6ZejG d7d8 d8eZG d9d* d*Zejdd<d=Zdd@dAZddCdDZddFdGZejdHdIG dJdK dKZddNdOZG dPdQ dQZddXdYZG dZd[ d[eZG d\d] d]eZG d^d_ d_eZddbdcZddhdiZG djdk dkeZG dldm dmeZG dndo doeZG dpdq dqeZG drds dseZ	tddd|d}ZdddZdddZdddZdddZejG dd dZe ZdddZdddZdddZdddZdddZdddZdddZdddZdddZG dde deZG dd dZdS )    )annotationsN)Counterdefaultdict)as_completedFuture)AnyGenericTYPE_CHECKING	TypeAliasTypeVar)	ParamSpec
OrderedSet   )ComputedBuffer	Pointwise)CallableIteratorSequence)
ModuleType)EnterCudaStreamContextLine)PythonWrapperCodegen)countersdynamo_timed)use_pipelined_autotuning)LambdaFuturePyCodeCache)TritonTemplateCallerBase)get_metric_tableis_metric_table_enabled)get_stream_name)free_symbols)free_symbol_is_typesymbol_is_typeSymT)
has_triton)commsconfigconfig_commsdependenciesirmetrics)can_codegen_without_upcasts)BackendFeatureget_scheduling_for_deviceKernel) estimate_nccl_collective_runtime/estimate_nccl_collective_runtime_nccl_estimator)Dep	MemoryDepStarDepWeakDep)GPUTooOldForTritonTritonMissing)count_flops_fx)assign_origin_nodeget_device_typeGraphPartitionSignatureMultiOutputMultiOutputLayout
NoneLayout)LoopBody)MemoryPlanningInfoForBufferMemoryPlanningInfoForNode)DevicePropertiesReductionHint)
green_textred_text)SimplifyIndexing)&_unstable_customized_partition_wrappercache_on_selfcmpdevice_need_guardget_current_backendget_device_tflopsget_dtype_sizeget_gpu_dram_gbpsget_op_namesGraphPartitionMapIndentedBufferis_collectiveis_cudagraph_unsafe_opis_gpuis_multi_outputs_template#is_output_of_multi_outputs_templateis_waitsympy_product)Vfusionloop_orderingcompute_dependencies
cudagraphsBaseSchedulerNoder
   PartitionType_T_Pc                   @  sZ   e Zd ZU dZded< dZded< dZded< dd	 ZedddZ	e	ddddZ
dS )FusionResultNzbool | Noneshould_fusezCallable[[], bool] | Nonecallable_fnLambdaFuture | Nonefuturec                 C  s    | j d u| jd uA sJ dd S )NzLFusion result should contain either fusion decision or callable_fn, not both)rc   rd   self ri   `/home/nk/hobo-godmode/plappi-mvp/.venv/lib/python3.10/site-packages/torch/_inductor/scheduler.py__post_init__s   s   zFusionResult.__post_init__boolc                 C  s
   t |dS )N)rc   rb   )clsrc   ri   ri   rj   fusex   s   
zFusionResult.fuseCallable[[], bool]c                 C  s   t ||dS )Nrd   rf   rm   )rn   rd   rf   ri   ri   rj   from_callable|      zFusionResult.from_callable)rc   rl   N)rd   rp   rf   re   )__name__
__module____qualname__rc   __annotations__rd   rf   rk   classmethodro   rr   ri   ri   ri   rj   rb   m   s   
 rb   c                   @  s<   e Zd ZU ded< ded< ded< dZded< dddZdS )PendingFusionrp   rd   r^   node1node2Nre   rf   return+tuple[BaseSchedulerNode, BaseSchedulerNode]c                 C  s   | j | jfS rt   r{   r|   rg   ri   ri   rj   get_fusion_nodes      zPendingFusion.get_fusion_nodes)r}   r~   )ru   rv   rw   rx   rf   r   ri   ri   ri   rj   rz      s   
 rz   c                   @  s   e Zd ZdZed'ddZed(d	d
Zed)ddZed*ddZ	ed+ddZ
ed)ddZed,ddZed-ddZed)ddZed)dd Zed'd!d"Zed.d$d%Zd&S )/MixOrderReductionz
    This class contains utility functions to decide if we should fuse reductions
    reducing across different dimensions of the same input tensor.
    noder^   r}   rl   c                 C  s   |   otdd |  D S )Nc                 s  s:    | ]}t |tr| rt |jtr|jjd uV  qd S rt   )
isinstanceSchedulerNodeis_reductionr   r   _split_size.0subnoderi   ri   rj   	<genexpr>   s    


z7MixOrderReduction.is_split_reduction.<locals>.<genexpr>)r   all	get_nodesr   ri   ri   rj   is_split_reduction   s   z$MixOrderReduction.is_split_reductiontuple[sympy.Expr, sympy.Expr]c                 C  s   |  |r{d }d }| D ]c}t|tr| rt|jtsq|jjd us'J tj	j
t|jj}|jjd us:J tj	j
t|jj}|d u rN|}|}qtj	j
||s_J | d| tj	j
||spJ | d| q|d uswJ ||fS |jd S )N v.s. r   )r   r   r   r   r   r   r   _original_rangesrY   graphsizevarssimplifyrX   _original_reduction_rangesstatically_known_equalsgroup)rn   r   xnumelrnumelr   	curxnumel	currnumelri   ri   rj   get_numel_rnumel   sF   




z"MixOrderReduction.get_numel_rnumelr{   r|   c                 C  sL   |  |}|  |}t|dkst|dks||krdS t|tt|kS )N   F)r   lentuplereversed)rn   r{   r|   g1g2ri   ri   rj   has_mix_reduction_orders   s
   

 z*MixOrderReduction.has_mix_reduction_ordersbufstrc                 C  s   d}|j jD ]}t|tr|j|kr|} nq|sdS |j}|j j}|s7t|ts0J t| |j	d j j}|s;J t
|t
|j sFdS tjjt|jt| rWdS dS )z@
        The access to 'buf' is not a broadcast access.
        NFr   T)read_writesreadsr   r3   nameindex
var_rangesFusedSchedulerNodetypesnodesr   r!   rY   r   r   r   rX   sizevalues)rn   r   r   	found_depdepr   r   ri   ri   rj   _is_full_access   s*   z!MixOrderReduction._is_full_access	list[str]c                 C  sD   g }|  |  @ }|D ]}| ||r| ||r|| q|S rt   )used_buffer_namesr   append)rn   r{   r|   outcommon_readsr   ri   ri   rj   get_common_read   s   
z!MixOrderReduction.get_common_readc                 C  s   t | ||dkS Nr   )r   r   rn   r{   r|   ri   ri   rj   has_common_read   s   z!MixOrderReduction.has_common_readintc                 C  s(   |  |}tjjj|d |d  ddS )Nr   r   fallback)r   rY   r   r   optimization_hint)rn   r   r   ri   ri   rj   	get_numel  s   
zMixOrderReduction.get_numelc                 C  s
   |  |S rt   )r   r   ri   ri   rj   get_fusion_score	  s   
z"MixOrderReduction.get_fusion_scorec                 C  s  t jjsdS tjjrdS | r| sdS | j}|dvs%t	|dkr'dS |
 r/|
 s1dS |j| @ s?|j| @ rAdS | ||sIdS t||}t|dkrWdS | |rb||}}n| |rm||}}ndS | |}|\}}	t jjsd}
tjjt||	 |
sdS tjjt||	d sdS tjjt|dsdS tdd	 | D rdS tjj|	d
sdS t|rdS tdd	 | D }|S )zP
        Check whether we can fuse two reductions with mix loop orders.
        F)cudaxputritonr   i  P r   i   c                 s  s.    | ]}|  r|jjjtjtjfvV  qd S rt   )r   r   datareduction_hintrC   INNERDEFAULTr   ri   ri   rj   r   [  s    
z-MixOrderReduction.can_fuse.<locals>.<genexpr>i @  c                 s  s&    | ]}|  r|j d v V  qdS )>   sumprodN)r   r   get_reduction_typer   ri   ri   rj   r   q  s    
)r'   r   mix_order_reductionrY   r   cpp_wrapperrT   
get_devicer   rK   r   	ancestorsget_operation_namesr   r   r   r   is_contiguous_noder   #mix_order_reduction_non_strict_moder   guard_or_truesympyGeanyr   statically_known_leqr   r   )rn   r{   r|   device_typer   contiguous_node
other_noder   nrowncol
size_thresr   ri   ri   rj   can_fuse  s`   



	
	zMixOrderReduction.can_fusec                 C  s   |  ||S rt   )r   r   ri   ri   rj   are_mix_order_reductions|  rs   z*MixOrderReduction.are_mix_order_reductionsc                   s$   t  fddjjD sdS dS )Nc                 3  s    | ]
}  |jV  qd S rt   )is_contiguous_loadr   r   r   rn   r   ri   rj   r     s    
z7MixOrderReduction.is_contiguous_node.<locals>.<genexpr>FT)r   r   r   r   ri   r   rj   r     s
   z$MixOrderReduction.is_contiguous_nodeparent_nodec                   s   ddl m} | D ]N}t|tsJ |j}|j|j } fdd|D }t|dkr,q
|D ])}|j	| }	|j
}
t|
 }tjj|	||}|d dksW|d dksW  dS q.q
dS )	Nr   )MemoryUsageTypec                   s   g | ]
}|j  kr|jqS ri   )buffer_name
index_name)r   er   ri   rj   
<listcomp>      z8MixOrderReduction.is_contiguous_load.<locals>.<listcomp>r   FT)torch._inductor.loop_bodyr   r   r   r   _bodymemory_usageLOADr   indexing_exprsr   listkeysrY   r   r   stride_vars)rn   r   r   r   r   	loop_bodyentriesindex_namesr   
index_exprr   var_symbolsr   ri   r   rj   r     s,   
z$MixOrderReduction.is_contiguous_loadNr   r^   r}   rl   )r   r^   r}   r   r{   r^   r|   r^   r}   rl   )r   r   r   r^   r}   rl   )r{   r^   r|   r^   r}   r   )r   r^   r}   r   r{   r^   r|   r^   r}   r   )r   r   r   r^   r}   rl   )ru   rv   rw   __doc__staticmethodr   ry   r   r   r   r   r   r   r   r   r   r   r   ri   ri   ri   rj   r      s4    	%!jr   c                   @  s   e Zd ZU ded< ded< ded< ejedZded	< ejedZ	d
ed< d(ddZ
d)ddZd(ddZd(ddZd*ddZd+ddZd,ddZd-d d!Zd-d"d#Zd.d%d&Zd'S )/SchedulerBuffer	Scheduler	schedulerz	ir.Bufferr   BaseSchedulerNode | Nonedefining_op)default_factorylist[NodeUser]usersr@   
mpi_bufferr}   r   c                 C  s   | j }|d us	J | S rt   )r  get_name)rh   opri   ri   rj   defining_op_name  s   z SchedulerBuffer.defining_op_namer   c                 C  s   t | jjS rt   )hashr   r   rg   ri   ri   rj   __hash__  r   zSchedulerBuffer.__hash__c                 C  s  t  }|  }|| dt| jj  || d| jj  |  r3|| dt|    | 	 rE|| dt| 	   t
| jdkr[|| d| j  | S || d |d | jD ]
}|| d qlW d    n1 sw   Y  |d	 | S )
N: z
.layout = z.aliases = z.mutations = r   z	.users = z
.users = [,])rQ   r  	writeliner   r   ru   layoutget_aliasespformatget_mutationsr   r  indentgetrawvalue)rh   resultr   userri   ri   rj   	debug_str  s&   

zSchedulerBuffer.debug_strc                 C  
   | j  S rt   r   r  rg   ri   ri   rj   r       
zSchedulerBuffer.get_nameNonec                 C  s   | j d usJ | j  sd S | j  s!| j  s!t| j  tjr+tj	j
| j  d S ttjdra|  tjjv ratjj|   }|| jjv rO| jj| j }n| jj| j }tj	j
|| j  d S tj	j
| j  d S )Nargs)r   should_allocateget_inputs_that_alias_outputget_mutation_namesr   get_output_specr*   CommBufferLayoutrY   r   wrapper_codecodegen_allocationhasattrkernelr  inplace_update_buffersr  name_to_donated_buffername_to_bufcodegen_inplace_reuse)rh   input_buffer_nameinput_bufferri   ri   rj   allocate  s6   

zSchedulerBuffer.allocaterl   c                 C  sN   | j d usJ t| j jtjst| j rdS | jD ]}t|j tr$ dS qdS NFT)r   r   r  r*   r>   rU   r  
OutputNode)rh   useri   ri   rj   can_free  s   
zSchedulerBuffer.can_freec                 C  s\   i }|D ] }t |j|v r||t |j |t |j< q||t |j< qt| | _d S rt   )idr   merger   r   r  )rh   r  r  r1  ri   ri   rj   	set_users   s    zSchedulerBuffer.set_usersSequence[str]c                 C     | j d usJ | j  S rt   )r   r   rg   ri   ri   rj   r  
     
zSchedulerBuffer.get_aliasesc                 C  r7  rt   )r   r!  rg   ri   ri   rj   r    r8  zSchedulerBuffer.get_mutationstorch.device | Nonec                 C  s   | j   S rt   )r   r"  r   rg   ri   ri   rj   r        zSchedulerBuffer.get_deviceNr}   r   r}   r   r}   r  r}   rl   )r  r  r}   r  r}   r6  r}   r9  )ru   rv   rw   rx   dataclassesfieldr   r  r@   r  r
  r  r  r  r.  r2  r5  r  r  r   ri   ri   ri   rj   r     s$   
 





!



r   c                   @  s   e Zd ZU dZded< dS )SchedulerDonatedBufferNr  r  )ru   rv   rw   r  rx   ri   ri   ri   rj   rC    s   
 rC  c                   @  s  e Zd ZU ded< ded< ded< ded< ded< ded	< ded
< ded< ded< dZded< ded< ded< dZded< ded< ded< dZded< dd#d$Zdd&d'Zdd)d*Z	dd+d,Z
dd-d.Zdd0d1Zdd2d3Zdd4d5Zdd9d:Zdd<d=Zdd@dAZddBdCZddEdFZddIdJZddKdLZddMdNZddOdPZddQdRZddSdTZddWdXZddYdZZdd[d\Zedd]d^Zedd_d`ZeddadbZ eddcddZ!ddfdgZ"ddidjZ#ddmdnZ$ddpdqZ%ddrdsZ&ddtduZ'ddvdwZ(ddxdyZ)ddzd{Z*dd|d}Z+dd~dZ,dddZ-dddZ.dddZ/dddZ0	ddddZ1edddZ2edddZ3edddZ4dddZ5dddZ6edddZ7dddZ8edddZ9dddZ:dddZ;e<dddZ=dS )r^   OrderedSet[str]r   z7tuple[torch.device, tuple[tuple[sympy.Expr, ...], ...]]r   
last_usager   min_input_distancemax_input_distance	min_order	max_orderrA   mpi_nodedict[str, str]mutation_renamesNir.Operation | Noner   list[SchedulerBuffer]outputsdict[str, SchedulerBuffer]outputs_by_namefloat | Noneoverride_estimated_runtimedependencies.ReadWritesr   OrderedSet[Dep]unmet_dependenciesFrl   writtenr  r   r}   r  c                 C  s   || _ dd | _d S )Nc                  _  s   g S rt   ri   )r  kwargsri   ri   rj   <lambda>5  s    z,BaseSchedulerNode.__init__.<locals>.<lambda>)r  debug_device_strrh   r  ri   ri   rj   __init__2  s   zBaseSchedulerNode.__init__ir.Operationc                   s`   | _ t  _d _d _tt   _d _ fdd| D  _	dd  j	D  _
i  _d S )Nr   Fc                   s   g | ]
}t  j| d qS ))r  r   r  )r   r  )r   outputrg   ri   rj   r   A  s    z5BaseSchedulerNode._init_from_node.<locals>.<listcomp>c                 S     i | ]}|  |qS ri   r  r   r   ri   ri   rj   
<dictcomp>I      z5BaseSchedulerNode._init_from_node.<locals>.<dictcomp>)r   r   r   rF  rG  r   rE  rW  get_outputsrO  rQ  rL  rh   r   ri   rg   rj   _init_from_node8  s   

z!BaseSchedulerNode._init_from_noder   c                 C  s   t | j d|  dS )Nz(name=)r   ru   r  rg   ri   ri   rj   __repr__R     zBaseSchedulerNode.__repr__c                 C  s.  |   }t }|| dt| j dtt| ddj d| dt| jj d| dt| j	 d| d	t| jj
| j	  d| d
| j d| d| j d| d |  |  D ]	}||  qYW d   n1 smw   Y  |d z	||   W n ty   tjddd Y nw |  S )#Longer form printout for trace logsr  (r   N)

.writes = 
.unmet_dependencies = .met_dependencies = .min_input_distance = .max_input_distance = z.outputs = [
        r  Ignoring error in debug_str()Texc_info)r  rQ   splicer   ru   getattrr  r   writesrV  r   rF  rG  r  rd  r  r  debug_str_extra	Exceptionlogwarningr  rstrip)rh   r   r   r   ri   ri   rj   r  U  sX   


zBaseSchedulerNode.debug_strc                 C     dS )N ri   rg   ri   ri   rj   rz  p     z!BaseSchedulerNode.debug_str_extrar   c                 C  s
   |  | S rt   )rZ  rg   ri   ri   rj   _debug_str_for_devices  r  z'BaseSchedulerNode._debug_str_for_devicec                 C  sz   t | jdd }d}t|tjjjrd|j| gddd }nt|tjjj	r7d|j|
 | gddd }|  | S )Nr   r  , F)shorten	multiline)rx  r   r   torch	_inductorr*   r   
str_helperget_size	Reductionget_reduction_sizer   )rh   
maybe_datadata_strri   ri   rj   debug_str_shortv  s   
z!BaseSchedulerNode.debug_str_shortc                 C  s   t d| | j| jj d S )Nz(%s: unmet_dependencies = %s, writes = %s)r|  inforV  r   ry  rg   ri   ri   rj   log_details  s   zBaseSchedulerNode.log_detailsself_depr3   	other_depc                 C  r  NFri   )rh   r  r  ri   ri   rj   reorder_loops_by_dep_pair     z+BaseSchedulerNode.reorder_loops_by_dep_pairrenamesc                   s<    fdddd | j  D D | _| | j | j d S )Nc                      i | ]}| v r| | qS ri   ri   r   r   r  ri   rj   rb    
    z:BaseSchedulerNode.update_mutated_names.<locals>.<dictcomp>c                 s      | ]}|j V  qd S rt   r   r   ri   ri   rj   r         z9BaseSchedulerNode.update_mutated_names.<locals>.<genexpr>)r   reads_and_writesrL  set_read_writesrenamerh   r  ri   r  rj   update_mutated_names  s   
z&BaseSchedulerNode.update_mutated_namesr   r2   c                 C  s   |  | j| d S rt   )r  r   	with_readrh   r   ri   ri   rj   add_fake_dep     zBaseSchedulerNode.add_fake_depc                 C     t dd |  D S )Nc                 s  s     | ]}|  p| V  qd S rt   )r  r  ra  ri   ri   rj   r     s    
z=BaseSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>)r   rd  rg   ri   ri   rj   has_aliasing_or_mutation  s   z*BaseSchedulerNode.has_aliasing_or_mutationrwc                 C  s   || _ | j j| _|   d S rt   )r   r   rV  
prune_deps)rh   r  ri   ri   rj   r    s   
z!BaseSchedulerNode.set_read_writesfuture_used_buffersmutation_real_namec                   s,   |   }t fdd|D }|| | _d S )Nc                 3  s    | ]	}  ||V  qd S rt   )get)r   kr  ri   rj   r         z3BaseSchedulerNode.set_last_usage.<locals>.<genexpr>)used_or_aliased_buffer_namesr   rE  )rh   r  r  used_buffersri   r  rj   set_last_usage  s   z BaseSchedulerNode.set_last_usagec                 C  s   | j D ]}|  qd S rt   )rO  r.  )rh   r   ri   ri   rj   mark_run  s   

zBaseSchedulerNode.mark_runc                 C  s"   t dd t| jj| jjD S )Nc                 s  r  rt   r  r   ri   ri   rj   r     
    
z6BaseSchedulerNode.used_buffer_names.<locals>.<genexpr>)r   	itertoolschainr   r   ry  rg   ri   ri   rj   r     s   z#BaseSchedulerNode.used_buffer_namesc                   s   t   dd t| jj| jjD }t|dkr@| } | t	j
j|r:| fddt	j
j|  D  t|dks S )z
        Returns buffer names used by this node, including aliases.

        Note: is_fake WeakDeps are excluded since they are purely for ordering
        and should not affect buffer lifetime.
        c                 S  s"   g | ]}t |tr|js|jqS ri   )r   r5   is_faker   r   ri   ri   rj   r     s    zBBaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<listcomp>r   c                 3  s    | ]	}| vr|V  qd S rt   ri   )r   alias
used_namesri   rj   r     s    zABaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<genexpr>)r   r  r  r   r   ry  r   popaddrY   r   name_to_bufferr  extendr   )rh   depsr   ri   r  rj   r    s    
z.BaseSchedulerNode.used_or_aliased_buffer_namesc                   s   t  fdd jD  _d S )Nc                 3  s"    | ]}|j  jjvr|V  qd S rt   )r   r  available_buffer_namesr   rg   ri   rj   r         z/BaseSchedulerNode.prune_deps.<locals>.<genexpr>r   rV  rg   ri   rg   rj   r    s   zBaseSchedulerNode.prune_depsc                   s>   d	 fddt fdd jjD }  j| d S )
Nr   r2   r}   rl   c                   s>   t | tsdS | j jjvrdS  jj| j  }|tjjv S r  )	r   r5   r   r  r*  r
  rY   r   removed_operations)r   op_namerg   ri   rj   should_prune  s   
z7BaseSchedulerNode.prune_weak_deps.<locals>.should_prunec                 3      | ]	} |r|V  qd S rt   ri   r   r  ri   rj   r         
z4BaseSchedulerNode.prune_weak_deps.<locals>.<genexpr>r   r2   r}   rl   )r   r   r   r  remove_reads)rh   	to_removeri   )rh   r  rj   prune_weak_deps  s
   	z!BaseSchedulerNode.prune_weak_depsname_to_fused_nodedict[str, BaseSchedulerNode]c                 C  s   t | || jj d S rt   )_prune_redundant_depsr  r*  )rh   r  ri   ri   rj   prune_redundant_deps  s   z&BaseSchedulerNode.prune_redundant_depsc                 C  r7  rt   )r   get_operation_namerg   ri   ri   rj   r    r8  zBaseSchedulerNode.get_namec                 C  s   |   S rt   r`  rg   ri   ri   rj   get_first_name  s   z BaseSchedulerNode.get_first_namec                 C  r  )Nc                 s      | ]}|  V  qd S rt   r`  r   r   ri   ri   rj   r         z8BaseSchedulerNode.get_operation_names.<locals>.<genexpr>)r   r   rg   ri   ri   rj   r        z%BaseSchedulerNode.get_operation_namesc                 C     t dd | jD S )Nc                 s  r  rt   r`  r   r   ri   ri   rj   r     r  z5BaseSchedulerNode.get_buffer_names.<locals>.<genexpr>)r   rO  rg   ri   ri   rj   get_buffer_names     z"BaseSchedulerNode.get_buffer_namesc                 C  r  )Nc                 s  s&    | ]}t |tot|d dV  qdS )T)disallow_fp32_opsNr   r   r,   r   nri   ri   rj   r      s    


zABaseSchedulerNode.can_codegen_in_low_precision.<locals>.<genexpr>r   r   rg   ri   ri   rj   can_codegen_in_low_precision  s   z.BaseSchedulerNode.can_codegen_in_low_precisionc                 C  r  )Nc                 s  s"    | ]}t |tot|V  qd S rt   r  r  ri   ri   rj   r     s
    
z@BaseSchedulerNode.can_codegen_without_upcasts.<locals>.<genexpr>r  rg   ri   ri   rj   r,     s   z-BaseSchedulerNode.can_codegen_without_upcastsSequence[BaseSchedulerNode]c                 C  s   | gS rt   ri   rg   ri   ri   rj   r        zBaseSchedulerNode.get_nodesSequence[SchedulerBuffer]c                 C     | j S rt   )rO  rg   ri   ri   rj   rd    r  zBaseSchedulerNode.get_outputsbuf_namer   c                 C  s
   | j | S rt   )rQ  )rh   r  ri   ri   rj   
get_output  r  zBaseSchedulerNode.get_outputr9  c                 C  r7  rt   )r   r   rg   ri   ri   rj   r     r8  zBaseSchedulerNode.get_devicec                 C  s   |   }|d uo|jdkS Ncpu)r   r   rh   deviceri   ri   rj   is_cpu     zBaseSchedulerNode.is_cpuc                 C  s   |   }|d uot|jS rt   )r   rT   r   r  ri   ri   rj   rT     r  zBaseSchedulerNode.is_gpuc                 C  r  r  ri   rg   ri   ri   rj   r   "  r  zBaseSchedulerNode.is_reductionc                 C  r  r  ri   rg   ri   ri   rj   is_native_matmul%  r  z"BaseSchedulerNode.is_native_matmulc                 C  r  r  ri   rg   ri   ri   rj   is_split_scan(  r  zBaseSchedulerNode.is_split_scanc                 C  r  r  ri   rg   ri   ri   rj   is_template+  r  zBaseSchedulerNode.is_templatec                 C  r  r  ri   rg   ri   ri   rj   	is_extern.  r  zBaseSchedulerNode.is_externc                 C  r  r  ri   rg   ri   ri   rj   
is_foreach1  r  zBaseSchedulerNode.is_foreachread_depdependencies.Depc                 C  r  r  ri   rh   r  ri   ri   rj   can_inplace4  r  zBaseSchedulerNode.can_inplacec                 C  r  r  ri   rg   ri   ri   rj   has_side_effects7  r  z"BaseSchedulerNode.has_side_effectsc           	        s  ddl m} ttr1tjr1tj	 t
jr1ttjtjjjjr+ttjdddur1ttjds3dS jtjjB jjB  dfd
d} D ]}|j}|dusTJ | rp| sp| sp| tjjv spt| t j!rqqIj"j#D ]}|j$jj%v rjj%|j$ }njj&'|j$}|rEtjj()|rEt|j*t+sE|j,dusJ  fdd|j,D }j-|j$}|sEt.|dkrE|d j/rE|d ju rE|jdurEt|j t j0t j1t j2t j!fsE|j*rt|j*jt j3t j4frt.|j dksE||j|jrE||rEtjj56| |  ttjtjjjjr9tjj78|  tjj78|  | tjj9| <  nquqIdS )z~
        Decide if there should be inplace updates for the node
        and record the decision in the active kernel.
        r   )can_match_buffer_size	mutationsNr  buf_to_be_inplacedr   r}   rl   c                   s   | j }|   t }| jD ]3}|j}t|tsq| | j j	vs+| j ||ur,q| fdd|j
 D O }t|dkrC dS qdS )Nc                 3  s    | ]
}|j  kr|V  qd S rt   r  )r   or  ri   rj   r   m  s    
z^BaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node.<locals>.<genexpr>r   FT)r  get_fused_noder  r   r  r   r   r^   r  r  r   r  r   )r  
fused_noder  r  	user_noderg   r  rj   single_index_in_fused_nodeU  s*   


zKBaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_nodec                   s   g | ]}|j   vr|qS ri   r  r   x)inconsequential_nodesri   rj   r     s
    z;BaseSchedulerNode.decide_inplace_update.<locals>.<listcomp>r   )r  r   r}   rl   ):codegen.wrapperr  r   r   r'   inplace_buffersrY   r   has_featurer   r-   INPLACE_BUFFERSr'  r  r  codegensimd
SIMDKernelrx  r&  r   r  r  completed_operationsrd  r   r  r   r!  r  removed_buffersr"  r*   r#  r   r   r   r)  r*  r  r$  	can_reuser  NopKernelSchedulerNoder  has_cross_stream_hazardr   r  r>   r=   MutationLayoutSHOULDREMOVEFallbackKernelr<   r  make_inplacer  r  r(  )	rh   r  r   r   buf_noderead	input_bufremaining_usesr  ri   )r  rh   rj   decide_inplace_update:  s   
"


z'BaseSchedulerNode.decide_inplace_updateTbufferrQ   	only_oncec           	      C  s(  t jsd S |r| jrd S | jd usJ | j }g }|D ]e}|jdkr$q|d |d d|j d|j }d|jv rG|d|jd   }|| d|jv r|jd  }|j	d	d
dd }|d|
dd
dd
dd
dd  |d |d qt|dkrd S || d| _d S )Nr^  r  z#pragma CMT ORIGIN:z#pragma CMT  seq_nrz seq_nr:stack_trace|r   )maxsplitr   {z{{}z}}ro  \z\\z#pragma CMT END ORIGINr   T)r'   comment_originrW  r   get_originsr	  r   targetmetarsplitreplacer   
writelines)	rh   r  r  origins	out_linesr  op_info_strr  stack_trace_last_lineri   ri   rj   codegen_originating_info  sH   









	


z*BaseSchedulerNode.codegen_originating_infoc                 C  s   | j dddS )NTinclude_readsinclude_writes!get_read_write_buffers_sizes_implrg   ri   ri   rj   get_read_write_buffers_sizes     z.BaseSchedulerNode.get_read_write_buffers_sizesc                 C     | j dddS )NTFr.  r1  rg   ri   ri   rj   get_read_buffer_sizes  r4  z'BaseSchedulerNode.get_read_buffer_sizesc                 C  r5  )NFTr.  r1  rg   ri   ri   rj   get_write_buffer_sizes   r4  z(BaseSchedulerNode.get_write_buffer_sizesr/  r0  c                 C  s   t | j||d ddS )Nr.  r   )start)r   get_read_write_buffer_accessesr   )rh   r/  r0  ri   ri   rj   r2    s   z3BaseSchedulerNode.get_read_write_buffers_sizes_impldict[str, int]c                   s
  t tri S t trt jtri S t tr+t jtjr+jjtj	j
ju r+i S dddt trHt d t d  ntd	tt}|rbjjD ]
}||j | qW|rsjjD ]
}||j | qh|rtd
d jjD nt }|rtdd jjD nt }dfddt trtfdd|D }|| }|| }i }||B D ]I}	tfdd||	 D  |	tjjv rtjj|	 }
n|	tjjv rtjj|	 }
nqd fdd|
}|	|vr|||	< q||	  |7  < q|S )az  
        Counting the number of bytes accessed for a kernel is
        surprisingly tricky. In particular, there is a differentiation
        between 'theoretical' memory accesses and practical memory
        accesses. For example, a layernorm kernel may actually access an
        input 3 times, but in theory, it only needs to access its input
        once (and may be optimized to do so through say, persistent
        reductions)

        Another example is that even though a buffer is passed in, we may
        not access the entire buffer. This may occur if we are accessing
        a slice of the buffer. Another tricky case is for indirect
        indexing, where the amount of bytes accessed depends on the
        values of the input.

        What this function aims to compute is the memory accesses for
        worst-case inputs, best-case optimization. What this means is
        that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

        1. Numel in ranges multiplied by number of deps the buffer has
        2. The buffer size

        Returns memory accesses per buffer.
        s
sympy.Exprr}   r   c                 S  s   t jjj| ddS )Nr   r   )rY   r   r   r   )r;  ri   ri   rj   try_size_hint:     zGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.try_size_hintr   r       eAc                 s  r  rt   r  r   ri   ri   rj   r   O  r  zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>c                 s  r  rt   r  r   ri   ri   rj   r   T  r  r   r   r   r  rl   c                   s4    j j|  j}tdd |D }t|t| dkS )Nc                 s  r  rt   r   r   r  ri   ri   rj   r   [  r  z\BaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized.<locals>.<genexpr>r   )r  r*  r  r   r   )r   r   r  buf_usesrg   ri   rj   is_materializedY  s   zIBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materializedc                 3  s     | ]} |j s|V  qd S rt   r   r   )rB  rh   ri   rj   r   _  s    
c                 3  s    | ]} V  qd S rt   ri   r   )
node_numelri   rj   r   h      4ir.Buffer | ir.TensorBox | ir.TorchBindObject | Nonec                   s   | sdS t | tjr|  S t | jtrNjj|   j	}d}|D ]*}t |j
tr*q!t |j
ts2J t |j
j
trI|j
 D ]	}||j
7 }q>q! dS |S t | jtjrbtfdd|  D S t|  }t|  t | S )Nr   c                 3  s     | ]} t j|V  qd S rt   )rY   r   
get_buffer)r   mut_name)get_buf_bytesri   rj   r     s
    
zZBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes.<locals>.<genexpr>)r   r*   TorchBindObjectrI  r  r=   r  r*  r  r  r   r0  r^   r<   rd  r>   r   r!  rX   r  rM   	get_dtypemin)r   r  totr  	sched_buf	buf_elems)buf_accessed_elemsrI  rh   r=  ri   rj   rI  q  s2   zGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytesN)r;  r<  r}   r   )r   r   r   r  r}   rl   )r   rF  r}   r   )r   r  ExternKernelSchedulerNoder   r<   r*   r  op_overloadr  _prims	rng_primsgraphsafe_run_with_rng_stater   rX   
get_rangesr   collectionsr   r   r   r   r   r   ry  r   r   r   rY   r   r  graph_inputs)rh   r/  r0  buf_accessesr   r   ry  r  buf_byte_accessesr  r   	buf_bytesri   )rP  rI  rB  rD  rh   r=  rj   r9    st   




%
z0BaseSchedulerNode.get_read_write_buffer_accesses
int | Nonec                 C  sv   | j d u rd S | j  }|d u rd S t|}|d u rd S t|tjr&|j j}tjj	j
|dd}td d  |7  < |S )Nr   r   inductor
flop_count)r   get_origin_noder8   r   r  SymIntexprrY   r   r   r   r   )rh   fx_nodeflopsresolved_flopsri   ri   rj   estimate_flops  s   

z BaseSchedulerNode.estimate_flopsfloatc                 C  s   | j d ur| j S |  S rt   )rS  _get_estimated_runtimerg   ri   ri   rj   get_estimated_runtime  s   
z'BaseSchedulerNode.get_estimated_runtimec              
   C  s  |   d  d }|j }tt|sdS t| jrt| jtj	s%J z:t
jrZt| }t }||}|durCt|ts@J |W S t| }|du rPt| j}|j||d |W S t| jW S  tyw } zt| W Y d}~dS d}~w ty } zt| W Y d}~dS d}~ww t| jrdS t| }|dur|S |j }	z!t }
t|	d }|
dkrtd|
 |dkrtd| W n
 ty   Y dS w |  }|dks|du r|  |
 }|d }|S d}|  }|du rdn|}|| | d	 }||
 }t ||}|d }|S )
zC
        Returns estimated op runtime in milliseconds (ms)
        r   Nvaluel    J)z-gpu_memory_bandwidth cannot be <= 0, but got z"gpu_flops cannot be <= 0, but got g    .Ag      ?r?  )!r   rd  r   r"  rT   r:   rR   r   r*   IRNoder(   ,runtime_estimations_use_nccl_lib_estimations)get_estimate_runtime_cache_key_from_snodeget_estimate_runtime_cachelookuprf  r1   r0   	set_value
ValueErrorr|  r  	TypeErrorrW    maybe_estimate_runtime_benchmarkmaybe_get_dtyperN   rL   AssertionErrorr{  re  r3  max)rh   r   r  	cache_keycache	cache_valmsr   retdtypegpu_memory_bandwidth	gpu_flops	flops_estnsfactorcounted_bytescompute_timetransfer_timeri   ri   rj   rg    sz   








z(BaseSchedulerNode._get_estimated_runtimeir.TemplateBuffer | Nonec                 C  s   d S rt   ri   rg   ri   ri   rj   get_template_node  r  z#BaseSchedulerNode.get_template_nodeir.TemplateBufferc                 C  s   |   }|d us
J |S rt   r  )rh   templateri   ri   rj   get_template_node_or_throw  s   z,BaseSchedulerNode.get_template_node_or_thrownodeslist[BaseSchedulerNode]Jtuple[list[BaseSchedulerNode], BaseSchedulerNode, list[BaseSchedulerNode]]c                 C  sD   t dd t| D }| d| }| | }| |d d }|||fS )zQ
        For the list of nodes, get the prologue, template, and epilogue
        c                 s  s     | ]\}}|  r|V  qd S rt   r  r   ir  ri   ri   rj   r     s    zCBaseSchedulerNode.get_prologue_template_epilogue.<locals>.<genexpr>Nr   )next	enumerate)r  template_indexprologuetemplate_nodeepilogueri   ri   rj   get_prologue_template_epilogue  s
   
z0BaseSchedulerNode.get_prologue_template_epilogue)r  r   r}   r  )r   r]  r}   r  r;  )r}   r   r=  r  r3   r  r3   r}   rl   r  rK  r}   r  )r   r2   r}   r  r>  )r  rT  r}   r  r  rD  r  rK  r}   r  r}   rD  r  r  r}   r  r}   r  )r}   r  )r  r   r}   r   r@  r  r  r}   rl   T)r  rQ   r  rl   r}   r  r<  )r/  rl   r0  rl   r}   r   )r/  rl   r0  rl   r}   r:  r}   r\  r}   rf  r}   r  )r}   r  )r  r  r}   r  )>ru   rv   rw   rx   r   rS  rW  r\  rf  ri  r  rz  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r  rH   r   r  r  r,   r   rd  r  r   r  rT   r   r  r  r  r  r  r  r  r  r-  r3  r6  r7  r2  r9  re  rh  rg  r  r  r   r  ri   ri   ri   rj   r^     s   
 




































 /


 

W
r}   $torch._inductor.codecache.LocalCachec                   C  s   t jj S rt   )r  r  	codecache
LocalCacheri   ri   ri   rj   rn  &     rn  snoder   c                   s|   t | jdd}| jj}| jg || jj| jj}| jj}t||f\}}d	dd t|ft	 fdd|D  }|S )
Npython_kernel_namer  r}   rl   c                 S  s    t | tjot | tjtjf S rt   )r   r*   rk  GeneratorStateOpaqueObjectStater  ri   ri   rj   _is_tensor_ir5  s   z@get_estimate_runtime_cache_key_from_snode.<locals>._is_tensor_irc                 3  s(    | ]} |rt | nd V  qd S rt   )r   r  r   ar  ri   rj   r   <  s   & z<get_estimate_runtime_cache_key_from_snode.<locals>.<genexpr>r>  )
rx  r   inputsfill_non_provided_argsconstant_argsrX  pytreetree_flattenr   r   )r  r  r  rX  	flat_argsflat_args_pytree_specrw  ri   r  rj   rm  +  s   
rm  Callable[[Any], Any] | Nonec                 C  s`   t | tsd S tjjjtjjjtjjjd}t| j	dd}||vr#d S t | j	t
js,d S || S )N)zextern_kernels.mmzextern_kernels.bmmzextern_kernels.addmmr  r  )r   rQ  r  opsatenmmbmmaddmmrx  r   r*   ExternKernel)r  mms_fnsr  ri   ri   rj   _get_mm_like_fnA  s   
r  rR  c                   s   d }d }t jrt }|d u rd S |} fdd}nd S t }t }||}|d ur6t|ts4J |S ddlm	 | \}}ddl
m}	 |	j|||dddd	}
|j||
d
 |
S )Nc                     s    S rt   ri   ri   r  snode_args_kwargsri   rj   rY  Z      z2maybe_estimate_runtime_benchmark.<locals>.<lambda>r   )r  r   )benchmarker   
   )memory_warmup_itersbenchmark_itersmax_benchmark_durationri  )r'   !runtime_estimations_mms_benchmarkr  rm  rn  ro  r   rf  utilsr  $torch._inductor.runtime.benchmarkingr  	benchmarkrp  )r  bench_fnargs_kwargs_fnmm_fnrw  rx  ry  r  rX  r  rz  ri   r  rj   rs  Q  s8   

	rs  T)slotsc                   @  sL   e Zd ZU ded< ded< ded< ded< dddZdddZdddZdS )	WhyNoFuser   name1name2reasonztuple[Any, ...]r  r{   r^   r|   r}   r  c                 C  s   |  | _|  | _d S rt   )r  r  r  rh   r{   r|   ri   ri   rj   r\  ~  s   
zWhyNoFuse.__init__r   c                 G  s   || _ || _t|  d S rt   )r  r  
fusion_logdebug)rh   r  r  ri   ri   rj   __call__  s   zWhyNoFuse.__call__c                 C  s"   d| j  d| j d| j| j  S )Nzcannot fuse z with r  )r  r  r  r  rg   ri   ri   rj   __str__  s   
zWhyNoFuse.__str__Nr{   r^   r|   r^   r}   r  )r  r   r  r   r}   r  r;  )ru   rv   rw   rx   r\  r  r  ri   ri   ri   rj   r  w  s   
 

r  objr   c                 C  sF   t | ttfrt| td} tj| dd}d|v r!dt|d S |S )Nkey   )r  ro      )	r   r   setsortedr   pprintr  textwrapr  )r  r  ri   ri   rj   r    s   r  c                   @  s8   e Zd ZdddZddd	ZdddZdddZeZdS )r0  r   r4   r}   r  c                 C  s   t |g| _d S rt   r  r  ri   ri   rj   r\       zOutputNode.__init__rl   c                 C  r  r  ri   rg   ri   ri   rj   r     r  zOutputNode.is_reductionr6  c                 C  r  )Nri   ri   rg   ri   ri   rj   r     r  z'OutputNode.get_inputs_that_alias_outputr   c                 C  r  )NOUTPUTri   rg   ri   ri   rj   r    r  zOutputNode.get_nameN)r   r4   r}   r  r>  r?  r;  )ru   rv   rw   r\  r   r   r  ri  ri   ri   ri   rj   r0    s    



r0  r   r  r  r*  rP  r  c                   s   t  jD ]}t|ts! |j  }|    d7  < qd fddtfdd	jD }|rKj| _	j
| d
S d
S )am  
    Prunes weakdeps intended for mutation ordering
    on an upstream fused node if after fusion there is another dependency
    on the fused upstream node, making the weakdep redundant

    In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
    be incrementally removed, enabling other fusions, ensuring they are fused in order.
    r   r   r2   r}   rl   c                   sX   t | tr* | j  }|   dkoj| | }| k}|p)|S dS )Nr   F)r   r5   r   r
  r  r  fusable_weak_dep)r   r  is_redundantis_self_dep)r*  name_to_dep_countr  r   ri   rj   r    s   


z+_prune_redundant_deps.<locals>.should_prunec                 3  r  rt   ri   r   r  ri   rj   r     r  z(_prune_redundant_deps.<locals>.<genexpr>Nr  )rW  r   rV  r   r5   r   r
  r  r   r  r   r  )r   r  r*  r   r  deps_to_pruneri   )r*  r  r  r   r  rj   r    s   

r  c                      sP   e Zd Zd fddZdd
dZdddZdddZdddZdddZ  Z	S )rQ  r  r   r   r]  r}   r  c                   sp   t  | | | | |  t|tjr4| r6t	
|jd j}d}| }|||ff| _d S d S d S Nr   r   )superr\  rf  r  get_read_writesr   r*   UserDefinedTritonKernelcan_fuse_epiloguemathr   mutable_argsshapeget_device_or_errorr   )rh   r  r   numelr   r  	__class__ri   rj   r\    s   
z"ExternKernelSchedulerNode.__init__r   c                 C  s   |    dt| jdd  S )Nz.node.kernel = r  )r  rx  r   rg   ri   ri   rj   rz    s   z)ExternKernelSchedulerNode.debug_str_extrarl   c                 C  r  NTri   rg   ri   ri   rj   r    r  z#ExternKernelSchedulerNode.is_externc                 C  s$   | j d usJ t| j do| j  S )Nr  )r   r&  r  rg   ri   ri   rj   r    s   z*ExternKernelSchedulerNode.has_side_effectsSequence[Sequence[sympy.Expr]]c                 C  s>   t | jtjr| j rt| jjd j}|gg fS g g fS r   )	r   r   r*   r  r  r  r   r  r  )rh   r  ri   ri   rj   rV    s   
z$ExternKernelSchedulerNode.get_rangeswrapperr   c                 C  s   t | jtjs	J | j|S rt   )r   r   r*   r  r  )rh   r  ri   ri   rj   r    s   z!ExternKernelSchedulerNode.codegenr  r   r   r]  r}   r  r;  r>  r}   r  r  r   r}   r  )
ru   rv   rw   r\  rz  r  r  rV  r  __classcell__ri   ri   r  rj   rQ    s    



	rQ  c                      s   e Zd Zd	 fddZ  ZS )
r  r  r   r   r]  r}   r  c                   s(   t  | | | | |  d S rt   )r  r\  rf  r  r  rh   r  r   r  ri   rj   r\    s   
zNopKernelSchedulerNode.__init__r  )ru   rv   rw   r\  r  ri   ri   r  rj   r    s    r  c                      s\  e Zd ZU dZded< ded< d` fddZ		dadbddZ		dadcddZddddZdedd Z	dfd!d"Z
dgd$d%Zdfd&d'Zdhd+d,Zdfd-d.Zdid2d3Zdjd5d6Zdkd8d9Zdld:d;Zdld<d=Zdld>d?Zdld@dAZdmdCdDZdndGdHZdodJdKZdpdLdMZ	NdqdrdQdRZedsdSdTZedsdUdVZdtdYdZZedud\d]Zedl fd^d_Z   Z!S )vr   zu
    A SchedulerNode is a node for scheduling that encapsulates either
    a ComputedBuffer or a TemplateBuffer.
    z tuple[Sequence[sympy.Expr], ...]_sizesr?   r   r  r   r   %ir.ComputedBuffer | ir.TemplateBufferr}   r  c                   s"   t  | | | |   d S rt   )r  r\  rf  _compute_attrsr  r  ri   rj   r\  	  s   
zSchedulerNode.__init__Nextra_indexing_constraints'tuple[dict[Any, Any], list[Any]] | Nonerecompute_sizes_body_funcCallable[_P, _T] | Nonec                 C  s   t | jtjtjfsJ | jj||d\| _}|| _| j }| j	
|j}||| jf| _tj p7t|j }t | jtjrK| | jj|d d S | tj| jg| jR d|i d S )Nr  r  )	normalizer  )r   r   r*   r   TemplateBuffersimplify_and_reorderr  r   r  r  get_backendgroup_fnr   r'   loop_ordering_after_fusionrT   r   r  extract_read_writesr)   )rh   r  r  bodyr  r  should_normalizeri   ri   rj   r    s2   

zSchedulerNode._compute_attrsCallable[..., Any] | Nonec                 C  sJ   t dd | jjD }| j||d |r#| | j|| j d S d S )Nc                 s  "    | ]}t |ttfr|V  qd S rt   r   r5   r4   r   ri   ri   rj   r   8      
z8SchedulerNode.recompute_size_and_body.<locals>.<genexpr>r  )r   r   r   r  r  r  r  rL  )rh   r  r  	fake_depsri   ri   rj   recompute_size_and_body3  s   z%SchedulerNode.recompute_size_and_bodyr  rl   need_clear_tiling_cachec                 C  st   t dd | jjD }| tj| jg| jR d|i|	| j
 | j|  |r8ddlm} |j  d S d S )Nc                 s  r  rt   r  r   ri   ri   rj   r   I  r  z5SchedulerNode.refresh_dependencies.<locals>.<genexpr>r  r   SIMDScheduling)r   r   r   r  r)   r  r   r  r  r  rL  pointwise_read_writesclear_cachecodegen.simdr  candidate_tilingscache_clear)rh   r  r  r  r  ri   ri   rj   refresh_dependenciesD  s&   z"SchedulerNode.refresh_dependencies	new_orderSequence[int]c                 C  s*   | j || _ | j j| _| jddd d S )NFTr  r  )r   reorder_iter_loopssizesr  r  )rh   r  ri   ri   rj   apply_new_loop_orderb  s
   
z"SchedulerNode.apply_new_loop_orderc                 C  s   | j  }t| j j| }tt|}tt||| }| ||  t| jd dks.J | jd | jd d | jd d ff| _d S )Nr   r   r   )r   get_original_num_rdimsr   	iter_varsr   ranger$  r   )rh   	num_rdims
num_pwdimspwdimsrdimsri   ri   rj   swap_pw_red_dimensionj  s   
,z#SchedulerNode.swap_pw_red_dimensionr^   c                 C  s   | j  | _ | S rt   )r   extract_pw_from_reductionrg   ri   ri   rj   r-  t  s   z'SchedulerNode.extract_pw_from_reductionc                 C  sX   t | sd S t| jtjsJ | j  |   W d    d S 1 s%w   Y  d S rt   )r   r   r   r   r*   r   with_original_inner_fnr  rg   ri   ri   rj   cancel_reduction_splitx  s   

"z$SchedulerNode.cancel_reduction_split	dimensionr   	new_rangec                 C  sl   t | jtjtjfsJ | j||| _| jj| _| j	 }| j
|j}||| jf| _| jddd d S )NTr!  )r   r   r*   r   r  r   #expand_dimension_for_pointwise_noder#  r  r  r  r
  r  r   r  )rh   r0  r1  r  r  ri   ri   rj   r2    s   

z1SchedulerNode.expand_dimension_for_pointwise_nodec                 C  s(   | j  | _ | j j| _| jddd d S )NTFr!  )r   merge_loopsr#  r  r  rg   ri   ri   rj   r3    s   
zSchedulerNode.merge_loopsr  r3   r  c                 C  s~   d }| j d }t||j  kr|jkrn n||}|r5t jd7  _td|  | | 	| dS td|   dS )Nr   r   z"Reorder loops for %s with order %sTzEDon't reordering %s because we can not decide the suitable loop orderF)
r  r   num_varsdecide_loop_order_to_matchr+   num_loop_reorderingloop_ordering_logr  r  r$  )rh   r  r  r  
self_sizesri   ri   rj   r    s    
 


z'SchedulerNode.reorder_loops_by_dep_pairr   c                 C  s   |   }| d| jd  | d| jd  | d| j g}| j D ]#}t|tsG|j}tj	
|}t|tjsG|| dt|j  q$t| jtrc|d| d |t| j d	 | jd usjJ ||   d
|S )Nz.group.device = r   z.group.iteration = r   z	.sizes = z
_layout = zclass z_loop_body:r  ro  )r  r   r  r   r  r   r5   r   rY   r   rG  r*   rJ  r   r  r  r   r?   r  r  r  r   r  r  join)rh   r   linesr   r  r   ri   ri   rj   rz    s$   

zSchedulerNode.debug_str_extrar  c                 C  r  rt   )r  rg   ri   ri   rj   rV    r  zSchedulerNode.get_rangesc                 C  sJ   t | jtjtjfsJ dt| jt| j o$| jd u p$| jj	 S Ntype(self.node)=)
r   r   r*   r   r  r   rl   r   r   has_partial_accumulaterg   ri   ri   rj   r     s   zSchedulerNode.is_reductionc                 C  s0   t | jtjsJ dt| j| j dkS )Nr<  dot)r   r   r*   r   r   r   rg   ri   ri   rj   r    s   "zSchedulerNode.is_native_matmulc                 C  sF   t | jtjtjfsJ dt| jt | jtjo"t | jjtjS r;  )r   r   r*   r   r  r   r   	SplitScanrg   ri   ri   rj   r    s   
zSchedulerNode.is_split_scanc                 C  s   t | jtjS rt   r   r   r*   r  rg   ri   ri   rj   r    r:  zSchedulerNode.is_templater  c                 C  s   t | jtjr
| jS d S rt   r@  rg   ri   ri   rj   r       zSchedulerNode.get_template_node
index_varsSequence[sympy.Expr]c                 G  s   |    |   | | d S rt   )r  r  r  )rh   rB  ri   ri   rj   run  s   zSchedulerNode.rundict[sympy.Expr, sympy.Expr]c                 C  sH   | j }ttt|ttt|ksJ tttj|tj|}|S rt   )	r  r   mapr   dictzipr  r  from_iterable)rh   rB  r#  r   ri   ri   rj   ranges_from_index_vars  s    

z$SchedulerNode.ranges_from_index_varsc              	   C  s   |  |}zCttt |. tj|  | j|  W d   n1 s'w   Y  W d   W dS W d   W dS 1 sAw   Y  W dS  tyW   t	
d| j  w )a  
        Generate code for this node using the provided index variables.

        This method sets up the appropriate context for code generation, including
        simplifying indexing expressions based on the variable ranges, and then
        calls the node's body function with the index variables.

        Args:
            index_vars: A sequence of sequences of sympy expressions representing
                        the index variables for each dimension of the computation.
        NzError in codegen for %s)rJ  rY   set_ops_handlerrF   get_ops_handlerr'  set_current_noder   r{  r|  fatalr   )rh   rB  r   ri   ri   rj   r    s   

VzSchedulerNode.codegenT	pointwiserT  c                 C  s:   |r| j nt| j \}}tj| j|tjjgt| gdS )z\
        Get the memory dependencies in either the pointwise or the reduction axes.
        )hidden_args)	r  r   r)   r  r   r   SZeror   )rh   rO  
keep_sizesignore_sizesri   ri   rj   "pointwise_or_reduction_read_writes  s   z0SchedulerNode.pointwise_or_reduction_read_writesc                 C     | j ddS )zH
        Get the memory dependencies in the non-reduction axes.
        TrO  rU  rg   ri   ri   rj   r       z#SchedulerNode.pointwise_read_writesc                 C  rV  )zD
        Get the memory dependencies in the reduction axes.
        FrW  rX  rg   ri   ri   rj   reduction_read_writes&  rY  z#SchedulerNode.reduction_read_writesr  r  c                 C  s   |   rdS tdd |  D rdS t| jjdkrDt|tjrDt	t
| jj}t|tjs8J dt||j|jkoC|j|jkS dS )NFc                 s  r  rt   )r  r  ri   ri   rj   r   0  r  z,SchedulerNode.can_inplace.<locals>.<genexpr>r   ztype(write_dep)=)r  r   rd  r   r   ry  r   r)   r3   r  iterr   r   r   )rh   r  	write_depri   ri   rj   r  -  s   zSchedulerNode.can_inplacerD  c                 C  s   t  }t| jtrP| j D ]A}|jdkrO|jdkrOd|jv r&|jd dks4t|j	dkrO|j	d dkrO|
d|jv r@|jd nt|j	dkrL|j	d	 nd
 q|S )Ncall_methodstoremode
atomic_addr  r  r   r   r   r  )r   r   r   r?   r   r	  r$  rX  r   r  r  )rh   buffers_store_as_atomic_addr   ri   ri   rj   _get_atomic_add_buffers:  s   



z%SchedulerNode._get_atomic_add_buffersc                   s$   | j d ur| j drdS t  S )Ndevice_assert_asyncT)r   has_opr  r  rg   r  ri   rj   r  N  s   
zSchedulerNode.has_side_effects)r  r   r   r   r}   r  NN)r  r  r  r  r}   r  )r  r  r  r  r}   r  )r  rl   r  rl   r}   r  )r  r   r}   r  r=  r}   r^   )r0  r   r1  r   r}   r  r  r;  r  r>  r  )rB  rC  r}   r  )rB  r  r}   rE  )rB  r  r}   r  r  )rO  rl   r}   rT  )r}   rT  r  r  )"ru   rv   rw   r   rx   r\  r  r  r  r$  r,  r-  r/  r2  r3  r  rz  rV  r   r  r  r  r  rD  rJ  r  rU  rH   r  rZ  r  rb  r  r  ri   ri   r  rj   r      sP   
 #



















r   group_snode)FusedSchedulerNode | GroupedSchedulerNodec                   sV    j } tjdd |D  t fddtjdd |D  D  jj  _	d S )Nc                 S     g | ]}|j qS ri   r   r  ri   ri   rj   r   [      z3refresh_group_node_dependencies.<locals>.<listcomp>c                 3  "    | ]}|j   vr|V  qd S rt   r   r  r   rg  ri   rj   r   _  r  z2refresh_group_node_dependencies.<locals>.<genexpr>c                 S  ri  ri   )rV  r  ri   ri   rj   r   a  rk  )
r   r  r)   
ReadWrites
merge_listr   unionr   ry  rV  )rg  r   ri   rn  rj   refresh_group_node_dependenciesV  s   rr  r  r   r   r  c                 C  s   t | ttfs	J || _|| _d | _tjdd |D  | _t	|  t
dd | jD | _tdd | jD | _t
dd | jD | _tdd | jD | _dd	 |  D | _d S )
Nc                 S  s   g | ]
}|j d ur|j qS rt   r   r  ri   ri   rj   r   r  r   z#init_group_node.<locals>.<listcomp>c                 s  r  rt   rH  r  ri   ri   rj   r   w  r  z"init_group_node.<locals>.<genexpr>c                 s  r  rt   )rI  r  ri   ri   rj   r   x  r  c                 s  r  rt   )rF  r  ri   ri   rj   r   y      
c                 s  r  rt   )rG  r  ri   ri   rj   r   |  ru  c                 S  r_  ri   r`  ra  ri   ri   rj   rb        
z#init_group_node.<locals>.<dictcomp>)r   r   GroupedSchedulerNoder   r  r   r   rq  r   rr  rL  rH  rv  rI  rF  rG  rd  rQ  )rg  r  r   ri   ri   rj   init_group_nodeh  s&   

rx  c                      s  e Zd ZU dZded< edXdd	ZdYd
dZdZddZe	d[ddZ
d\ddZd] fddZe	d^ddZd^dd Ze	d_d"d#Zd`d%d&Zd^d'd(Zd^d)d*Zda fd.d/Ze	d_d0d1Ze	d_d2d3Zdbd5d6Zd^d7d8Ze	dcd9d:Ze	dcd;d<Ze	dcd=d>Ze	dcd?d@Ze	dddBdCZdedEdFZe	dcdGdHZdfdJdKZdgdNdOZ dhdRdSZ!d^dTdUZ"e	dc fdVdWZ#  Z$S )ir   z
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be fused together. The way it does this is by maintaining
    its unmet dependencies as the union of its constituent nodes.
    r  r   r{   r^   r|   r}   c                 C  s~   |j |j u sJ t|ttfsJ | r$t|tr$t|jtjs#J n	t|ttfs-J t	t
| | }| |j |S rt   )r  r   r   r   r  rQ  r   r*   r<   r   r  r  r   )rn   r{   r|   r  ri   ri   rj   ro     s   zFusedSchedulerNode.fusec                 C  s2   | j D ]}t|tsJ | sJ |  q| S rt   )r   r   r   r   r-  rh   r   ri   ri   rj   r-    s
   

z,FusedSchedulerNode.extract_pw_from_reductionr  c                 C  s&   | j D ]}t|tsJ |  qd S rt   )r   r   r   r,  ry  ri   ri   rj   r,    s   

z(FusedSchedulerNode.swap_pw_red_dimensionr\  c                 C  8   t td dd |  D }t|dkrd S t|}|S )Nc                 s  (    | ]}|  s| r| V  qd S rt   r  r  re  r  ri   ri   rj   r         
z4FusedSchedulerNode.estimate_flops.<locals>.<genexpr>r   r   filterr   r   r   rh   fpsr{  ri   ri   rj   re       
z!FusedSchedulerNode.estimate_flopsr  r3   r  rl   c                 C  s  |   rdS d}| jD ]&}t|ts dS |dur,t|t|jd kr,td  dS |jd }qd}|dus:J t||j	  krH|j	krOn n|
|}|s[td|   dS t jd7  _td|  | | jD ]}t|tswJ || qnt|  dS )	z@
        Return true if a loop reordering is performed.
        FNr   z1Can not reorder fused node due to different sizeszODont reordering fused node %s because we can not decide the suitable loop orderr   z-Reorder loops for fused node %s with order %sT)r  r   r   r   r   r  r7  r  r   r4  r5  r  r+   r6  r$  rr  )rh   r  r  r8  r  r  ri   ri   rj   r    s>   

 


z,FusedSchedulerNode.reorder_loops_by_dep_pairr  r   c                   s6   t  | t| || g | _t|dd dj| _d S )Nc                 S  s   t |  S rt   )r   r   r  ri   ri   rj   rY        z-FusedSchedulerNode.__init__.<locals>.<lambda>r  )r  r\  rx  r  rv  r   )rh   r  r   r  ri   rj   r\    s   zFusedSchedulerNode.__init__r   c                 C     d dd | jD S )N_c                 S     g | ]}|  qS ri   r`  r  ri   ri   rj   r         z/FusedSchedulerNode.get_name.<locals>.<listcomp>r9  r   rg   ri   ri   rj   r    r  zFusedSchedulerNode.get_namec                 C     | j d  S r   r   r  rg   ri   ri   rj   r    r:  z!FusedSchedulerNode.get_first_namerD  c                 C     t jdd | jD  S )Nc                 S  r  ri   r  r  ri   ri   rj   r     r  z7FusedSchedulerNode.get_buffer_names.<locals>.<listcomp>r   rq  r   rg   ri   ri   rj   r    r  z#FusedSchedulerNode.get_buffer_namesrN  c                 C  "   g }| j D ]	}||  q|S rt   r   r  rd  rh   r  r   ri   ri   rj   rd       
zFusedSchedulerNode.get_outputsc                   sP    fddt  jD } jd j}|d ur|   td| dS )Nc                   s,   g | ]\}}    d | d|  qS )z.snodes[z] =
)r  r  )r   r  r   rg   ri   rj   r     s    z6FusedSchedulerNode.debug_str_extra.<locals>.<listcomp>r   ro  r  )	r  r   r   r  r  r  r  r9  r~  )rh   r:  r   ri   rg   rj   rz    s   
z"FusedSchedulerNode.debug_str_extrac                 C  s   dd | j D }|  d| S )Nc                 S  r  ri   )r  r  ri   ri   rj   r     r  z6FusedSchedulerNode.debug_str_short.<locals>.<listcomp>z
, snodes: rC  )rh   
snodes_strri   ri   rj   r    s   z"FusedSchedulerNode.debug_str_shortr  r  rK  c                   s@   t  || t }t| jD ]}||| ||j qd S rt   )r  r  r   r   r   updaterE  )rh   r  r  r   r  ri   rj   r  	  s   z!FusedSchedulerNode.set_last_usagec                 C  r  )Nc                 S  r  ri   r   r  ri   ri   rj   r     r  z8FusedSchedulerNode.used_buffer_names.<locals>.<listcomp>r  rg   ri   ri   rj   r     r  z$FusedSchedulerNode.used_buffer_namesc                 C  r  )Nc                 S  r  ri   )r  r  ri   ri   rj   r     r  zCFusedSchedulerNode.used_or_aliased_buffer_names.<locals>.<listcomp>r  rg   ri   ri   rj   r    s   z/FusedSchedulerNode.used_or_aliased_buffer_namesr  c                 C  r  rt   rC  rg   ri   ri   rj   r      r  zFusedSchedulerNode.get_nodesc                 C  s   t | j d|   dS )Nz(nodes=rg  rh  rg   ri   ri   rj   ri  #  rj  zFusedSchedulerNode.__repr__c                 C  r  )Nc                 s  r  rt   r   r  ri   ri   rj   r   (  r  z2FusedSchedulerNode.is_reduction.<locals>.<genexpr>r   r   rg   ri   ri   rj   r   &  r  zFusedSchedulerNode.is_reductionc                 C  r  )Nc                 s  r  rt   )r  r  ri   ri   rj   r   ,  r  z6FusedSchedulerNode.is_native_matmul.<locals>.<genexpr>r  rg   ri   ri   rj   r  *  r  z#FusedSchedulerNode.is_native_matmulc                 C  r  )Nc                 s  r  rt   )r  r  ri   ri   rj   r   0  r  z3FusedSchedulerNode.is_split_scan.<locals>.<genexpr>r  rg   ri   ri   rj   r  .  r  z FusedSchedulerNode.is_split_scanc                 C  r  )Nc                 s  r  rt   r  r  ri   ri   rj   r   4  r  z1FusedSchedulerNode.is_template.<locals>.<genexpr>r  rg   ri   ri   rj   r  2  r  zFusedSchedulerNode.is_templater  c                 C  s$   | j D ]}| r|   S qd S rt   )r   r  r  re  ri   ri   rj   r  6  s
   
z$FusedSchedulerNode.get_template_nodetorch.devicec                 C  s
   | j d S r   )r   rg   ri   ri   rj   r   =  r  zFusedSchedulerNode.get_devicec                 C  r  )Nc                 s  r  rt   )r  r  ri   ri   rj   r   B  r  z>FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>r  rg   ri   ri   rj   r  @  r  z+FusedSchedulerNode.has_aliasing_or_mutationr  c                 C     t rt   NotImplementedErrorr  ri   ri   rj   r  F  r  z'FusedSchedulerNode.update_mutated_namesr   r2   c                 C  r  rt   r  )rh   r   ri   ri   rj   r  I  r  zFusedSchedulerNode.add_fake_depr  r  c                 C  r  rt   r  r  ri   ri   rj   r  L  r  zFusedSchedulerNode.can_inplacec                 C  s6  |   }ddd | jD }t }|| dt| j d| d| dt| jj	 d| d	t| j
 d| d
t| jj| j
  d| d| j d| d| j d| d |  |  D ]	}||  q]W d   n1 sqw   Y  |d z	||   W n ty   tjddd Y nw |  S )rk  r  c                 s  s    | ]}t |jV  qd S rt   )r   ru   r  ri   ri   rj   r   R      z/FusedSchedulerNode.debug_str.<locals>.<genexpr>r  rl  rm  rn  ro  rp  rq  rr  rs  z.outputs = [
            Nr  rt  Tru  )r  r9  r   rQ   rw  r   ru   r  r   ry  rV  r   rF  rG  r  rd  r  r  rz  r{  r|  r}  r  r~  )rh   r   node_typestrr   r   ri   ri   rj   r  O  sZ   


zFusedSchedulerNode.debug_strc                   s(   | j d urtdd | j D S t  S )Nc                 s  r  rt   )r  r  ri   ri   rj   r   n  r  z6FusedSchedulerNode.has_side_effects.<locals>.<genexpr>)r   r   r  r  rg   r  ri   rj   r  k  s   

z#FusedSchedulerNode.has_side_effectsr{   r^   r|   r^   r}   r   rf  r=  r  r  )r  r   r   r  r}   r  r;  r  r}   rN  r  r  r>  r  )r}   r  r  )r   r2   r}   r  r  )%ru   rv   rw   r   rx   ry   ro   r-  r,  rH   re  r  r\  r  r  r  rd  rz  r  r  r   r  r   ri  r   r  r  r  r  r   r  r  r  r  r  r  r  ri   ri   r  rj   r     sZ   
 


+










r   c                      s<   e Zd Zd fddZdd
dZdddZdddZ  ZS )FusedMixOrderReductionsr{   r^   r|   r}   r  c                   sd   t |st |sJ ||}}|| _|| _t |jt| t|   t 	| j| _
d S rt   )r   r   r{   r|   r  r\  r  r   r   r   r  r  r  ri   rj   r\  s  s   

z FusedMixOrderReductions.__init__other_nodestuple[BaseSchedulerNode, ...]c                 C  s   t |trJ t |trJ | jj||ddsdS t|r%t|s%dS ddd}dd	d
}|rG|||f||@ sE|||||f@ rGdS |  p[tt	| jj
||dd| jkS )a  
        node1 is from the current mix order reduction; node2 is another node we want to fuse in.

        other_nodes are passed in to check if fusion will introduce producer/consumer relationship
        between the inner and outer reduction. If yes, we don't fuse.
        Fallow_mix_order_reductionr  r  r}   rD  c                 S     t  }|jdd | D  S )Nc                 s  r  rt   rs  r  ri   ri   rj   r     r  zTFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_ancestors.<locals>.<genexpr>r   rq  r  r   ri   ri   rj   _get_ancestors  s   zAFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_ancestorsc                 S  r  )Nc                 s  r  rt   )r   r  ri   ri   rj   r     r  zZFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_operation_names.<locals>.<genexpr>r  r  ri   ri   rj   _get_operation_names  s   zGFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_operation_names)count_bytesN)r  r  r}   rD  )r   r  r  r   r   r   r   typingcastr   score_fusion_memoryr  )rh   r{   r|   r  r  r  ri   ri   rj   sub_node_can_fuse  s0   


z)FusedMixOrderReductions.sub_node_can_fuseotherc                 C  s`   t |ts| | j|| jfp| | j|| jfS | | j|j| j|jfo/| | j|jt S rt   )r   r  r  r{   r|   r   rh   r  ri   ri   rj   can_fuse_with  s   
z%FusedMixOrderReductions.can_fuse_withc                 C  s   | j  }| j|}t|tr%|| j |j }|| j|j}t||S | | j || jfr<|| j |}t|| jS || j|}t| j |S rt   )	r{   r   r  r
  r   r  ro   r|   r  )rh   r  r  backendfused_node1fused_node2r  ri   ri   rj   	fuse_with  s   


z!FusedMixOrderReductions.fuse_withr  )r{   r^   r|   r^   r  r  )r  r^   )ru   rv   rw   r\  r  r  r  r  ri   ri   r  rj   r  r  s
    

4r  c                      sJ   e Zd Zd fd	d
ZedddZdddZdddZdddZ  Z	S )$FusedExternTritonKernelSchedulerNoder  r   kernel_noderQ  fused_epiloguer   r}   r  c                   sV   t |jtjs	J ttt ||g}t 	|| || _
|| _| j
j| _|j| _d S rt   )r   r   r*   r  r  r  r   r^   r  r\  r  r  rH  rO  )rh   r  r  r  r   r  ri   rj   r\    s   
z-FusedExternTritonKernelSchedulerNode.__init__r{   r|   r   c                 C  sJ   |j }t|jdksJ |jtt|jj }|jt	| t
|||S Nr   )r  r   rV  r*  r  r[  r   r  removeNodeUserr  )rn   r{   r|   r  original_mutated_bufferri   ri   rj   epilogue_fuse  s   z2FusedExternTritonKernelSchedulerNode.epilogue_fuser  r   c                 C  s   t | jjtjs
J t | jjtjsJ | jj sJ t	| jjj
d j}ddlm} || jg|\}}ddlm} || jg|}ddlm} |||| }	|	 }
| jj|| jj|
fS )Nr   r  )SIMDKernelFeatures)FusedUserDefinedTritonKernel)r   r  r   r*   r   r  r  r  r  r   r  r  torch._inductor.codegen.simdr  get_tiling_and_scores,torch._inductor.codegen.simd_kernel_featuresr  torch._inductor.codegen.tritonr  r  codegen_with_epilogue_fusion)rh   r  r  r  tilingr  r  kernel_featuresr  fused_user_kernelnew_kernel_srcri   ri   rj   r    s   z,FusedExternTritonKernelSchedulerNode.codegenrl   c                 C  r  r  ri   rg   ri   ri   rj   r  	  r  z.FusedExternTritonKernelSchedulerNode.is_externr  c                 C  r  rt   )r  rV  rg   ri   ri   rj   rV  		  r  z/FusedExternTritonKernelSchedulerNode.get_ranges)r  r   r  rQ  r  r   r}   r  )r{   rQ  r|   r   r}   r   r  r>  r  )
ru   rv   rw   r\  ry   r  r  r  rV  r  ri   ri   r  rj   r    s    

r  c                      s   e Zd ZU dZd<ddZd=d	d
Zed>ddZed?ddZ			d@dA fddZ	edBddZ
edCd!d"ZeZd#ed$< edDd&d'ZedCd(d)ZdEd*d+ZdEd,d-ZdFd.d/ZdGd0d1ZdHd3d4ZdId6d7ZdJd:d;Z  ZS )KForeachKernelSchedulerNodez
    This is a schedular node that consists of a set of scheduler nodes that
    has no data dependencies among them and can be executed in parallel.
    producerr^   r}   r  c                 C  s2   |  D ]}| | jv r| j|    S qd S rt   )rd  r  read_to_node)rh   r  r   ri   ri   rj   get_consumer_subnode_for	  s
   z3ForeachKernelSchedulerNode.get_consumer_subnode_forconsumerc                 C  sp   t t  }|jjD ] }|j| jjvrq	| jj|j  }|| jv r)|	| j|  q	t
|dkr6tt|S d S r  )r   r^   r   r   r   r  r*  r
  name_to_noder  r   r  r[  )rh   r  	producersrd	node_nameri   ri   rj   get_producer_subnode_for	  s   

z3ForeachKernelSchedulerNode.get_producer_subnode_forrl   c                   s&  t  |}  r;| r;tt  tt|}t jt|jk}|s)|d |o:t fddt j|jD S | re 	 rI|d dS tt|}|
 }|d ur_|j |S |d dS   r|	 rs|d dS tt   |}|d ur j||S |d dS td	)
Nzforeach do not have same lengthc                 3  s"    | ]\}} j ||V  qd S rt   )r  r   r   lrr  ri   rj   r   8	  s
    
z6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>zXcandidate producer is a reduction, foreach ops cannot be fused with reductions currentlyFz5candidate producer is not dep of any foreach consumerzXcandidate consumer is a reduction, foreach ops cannot be fused with reductions currentlyz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)r  r  r  r  r  r   r   r   rH  r   r  r  r   r  ru  )rn   r  r  whyforeach_matchconsumer_subnodeproducer_subnoderi   r  rj   r   /	  sJ   


z#ForeachKernelSchedulerNode.can_fusec                 C  s  |  s
|  s
J |  rtt|}|j}|j}ntt|}|j}|j}d }d }|  rL|  rLtt|}tt|}dd t|j|jD }nj|  rtt|}||}g }|}d }|jD ]}	|	|u rxt	
|	|}
|
}||
 qd||	 qdn7|  rtt|}||}g }|}d }|jD ]}	|	|u rt	
||	}
|
}||
 q||	 qntd| |j|||||dS )Nc                 S  s   g | ]
\}}t ||qS ri   )r   ro   r  ri   ri   rj   r   q	  s    
z3ForeachKernelSchedulerNode.fuse.<locals>.<listcomp>zTAt least one node passed to ForeachKernelSchedulerNode.fuse should be a foreach node)use_custom_partition_algoprev_node_1prev_node_2enable_autotune)r  r  r  r  r  r  rH  r   r  r   ro   r   r  ru  r  )rn   r  r  r  r  r  r  fused_nodesr  r   new_noder  ri   ri   rj   ro   ^	  sj   



zForeachKernelSchedulerNode.fuseNFr  r   r   r  r  r  r  r  r  c                   s  i  _ i  _|d u s|d u r4t || |D ]}|jjD ]}| j |j< q| D ]}	| j|	< q*qn| _| _	d  _
g  _ tj|j|jg t fddt|j|jD  jj  _t|j|jg _t|j|jg _t|j|j _t|j|j _| rt|tsJ ||}
}nt|tsJ ||}
}|
j _ j|j |
j _| D ]}	| j|	< qdd  j	D  _| _ |d ! }|sJ |t"#dfff _$tt%j&j'   _(| _)d S )Nc                 3  rl  rt   rm  r   rg   ri   rj   r   	  s    z6ForeachKernelSchedulerNode.__init__.<locals>.<genexpr>c                 S  s&   i | ]}|j  D ]\}}||q	qS ri   )rQ  items)r   r  r  vri   ri   rj   rb  	  s
    

z7ForeachKernelSchedulerNode.__init__.<locals>.<dictcomp>r   combo_kernel)*r  r  r  r\  r   r   r   r   r  r   r   r  r  r)   ro  rp  r   rq  rV  ry  rL  rH  rv  rI  rF  rG  r  r   r  r   r  rQ  r  r   r   Exprr   r  fxNoder)  r  )rh   r  r   r  r  r  r  r   r  r   foreach_noder   r  r  rg   rj   r\  	  sn   	


z#ForeachKernelSchedulerNode.__init__r  c                   s   dd |D }|rt dt|dd |D  dd |D }|r(t dt| dd |D }|r9t dt| d	d |D }d
d |D }|rQt dt| dd |D }dd |D   rjt dt    fdd|D }tjrdd |D }|rt dt| dd |D }|S )Nc                 S     g | ]	}t |tr|qS ri   )r   rQ  r  ri   ri   rj   r   	      z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>z/ComboKernels: %d external nodes are filtered %sc                 S  s    g | ]}|j d ur|j  qS rt   r   r#  r  ri   ri   rj   r   	  s     c                 S  r  ri   )r   rw  r  ri   ri   rj   r   	  r  z+ComboKernels: %d grouped nodes are filteredc                 S  r  ri   )r   r  r  ri   ri   rj   r   	  r  z;ComboKernels: %d FusedMixOrderReductions nodes are filteredc                 S  s"   g | ]}t |ttttfs|qS ri   )r   r  rQ  rw  r  r  ri   ri   rj   r   
  s    c                 S  r  ri   r   r  r  ri   ri   rj   r   
  
    
z+ComboKernels: %d foreach nodes are filteredc                 S  s   g | ]	}t |ts|qS ri   r  r  ri   ri   rj   r   
  r  c                 S     g | ]}|  r|qS ri   r  r  ri   ri   rj   r   
      z0ComboKernels: %d template nodes are filtered: %sc                   s   g | ]}| vr|qS ri   ri   r  template_nodesri   rj   r   !
  r  c                 S  r  ri   r  r  ri   ri   rj   r   %
  r  zCComboKernels: %d reduction nodes are filtered (pointwise_only mode)c                 S  s   g | ]}|  s|qS ri   r  r  ri   ri   rj   r   +
  r  )r|  r  r   r'   combo_kernels_pointwise_only)rn   r  externgrouped	mix_orderfiltered_nodesforeach_nodesreduction_nodesri   r  rj   combinable_nodes	  s^   z+ForeachKernelSchedulerNode.combinable_nodeslist[list[BaseSchedulerNode]]c           
   
     s   |   }g }d tdd |D }|D ]`}tt}|D ]!}| }|r.|jdks-|jdkr.q| |@ r5q|| | q| D ]1}tt}	|D ]}|	| j	
|d | qI|	 D ]| fddtdt D  q\qAq|S )zS
        Returns a list of lists of nodes that are to be grouped together.
           c                 S  s2   g | ]}|D ]}t |tr| D ]}|qqqS ri   )r   r  r  )r   r   r   r  ri   ri   rj   r   ;
  s    
zUForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels.<locals>.<listcomp>mpsr  r   c                   s   g | ]
}||   qS ri   ri   )r   r  max_num_nodesstream_nodesri   rj   r   [
  s    )_topological_sort_nodesr   r   r   r   r   r   r   r   node_to_streamr  r  r'  r   )
r  sorted_nodesgrouped_nodesexcluded_buffer_namesr  device_groupsr   r  device_nodesstream_groupsri   r  rj   &_default_group_nodes_for_combo_kernels/
  s>   	zAForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels4Callable[[Scheduler], list[list[BaseSchedulerNode]]]!group_algorithm_for_combo_kernelscustom_group_algorithmc                 C  s
   | t _d S rt   r  r  )r	  ri   ri   rj   %set_group_algorithm_for_combo_kernelsf
  s   z@ForeachKernelSchedulerNode.set_group_algorithm_for_combo_kernelsc                 C  s
   t | S rt   r
  r  ri   ri   rj   group_nodes_for_combo_kernelsn
  s   
z8ForeachKernelSchedulerNode.group_nodes_for_combo_kernelsc                 C  r  rt   r  rg   ri   ri   rj   r  t
  r  z#ForeachKernelSchedulerNode.mark_runc                 C  r  rt   r  rg   ri   ri   rj   r  w
  r  z"ForeachKernelSchedulerNode.codegenc                 C  r  r  ri   rg   ri   ri   rj   r  z
  r  z%ForeachKernelSchedulerNode.is_foreachc                 C  s
   t | jS )zeReturns a list of nodes which comprise the combo kernel.
        These nodes may be vertically fused.)r   r   rg   ri   ri   rj   get_subkernel_nodes}
  s   
z.ForeachKernelSchedulerNode.get_subkernel_nodesr  c                 C  s   t tjdd | jD S )zqReturns all nodes contained in this kernel, unpacking fused nodes
        into their constituent scheduler nodes.c                 s  r  rt   )r   r  ri   ri   rj   r   
  r  z7ForeachKernelSchedulerNode.get_nodes.<locals>.<genexpr>)r   r  r  rI  r   rg   ri   ri   rj   r   
  s   z$ForeachKernelSchedulerNode.get_nodesr   c                 C  r  r   )r   r  rg   ri   ri   rj   r  
  r:  z)ForeachKernelSchedulerNode.get_first_namer  r  c                 C  s*   t | || jj | jD ]}|| qd S rt   )r  r  r*  r   r  )rh   r  r   ri   ri   rj   r  
  s   
z/ForeachKernelSchedulerNode.prune_redundant_deps)r  r^   r}   r  )r  r^   r}   r  r  r^   r  r^   r}   rl   )r  r^   r  r^   r}   r  )NNF)r  r   r   r  r  rl   r  r  r  r  r  rl   r}   r  r  r  r}   r  )r  r   r}   r  )r	  r  r}   r  r=  r>  r}   r  r  r;  r  )ru   rv   rw   r   r  r  ry   r   ro   r\  r  r   r  r  rx   r  r  r  r  r  r  r   r  r  r  ri   ri   r  rj   r  	  s:   
 

	.ENA4






r  c                      s   e Zd ZU dZded< ed.ddZ	d/d0 fddZd1ddZd2ddZ	e
d3ddZd3ddZe
d4ddZd5ddZe
d6d!d"Zd7d$d%Zd8d'd(Zed9d,d-Z  ZS ):rw  aC  
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be *grouped* together (it does not allow another node to be scheduled
    in between its constituent nodes, nor does it allow another node to fuse into any of its constituent nodes).
    The way it does this is by maintaining its unmet dependencies as the union of its constituent nodes.
    Fusion will still happen among the nodes within each GroupedSchedulerNode.
    At codegen time, this scheduler node will be unpacked and codegen is called on each constituent node.
    r  r   r}   c                   sX   |d j  t fdd|D sJ |  |}|D ]	}| j| < q| j| < |S )Nr   c                 3  s    | ]}|j  u V  qd S rt   r  r  r  ri   rj   r   
  r  z.GroupedSchedulerNode.create.<locals>.<genexpr>)r  r   r  r  )rn   r   grouped_snoder  ri   r  rj   create
  s   

zGroupedSchedulerNode.createFr  r   temp_groupingrl   r  c                   s"   t  | t| || || _d S rt   )r  r\  rx  r  )rh   r  r   r  r  ri   rj   r\  
  s   
zGroupedSchedulerNode.__init__c                 C  sD   | j r| jS | jD ]
}|| jj| < q	| jj|  = | j| jS )z
        Do fusion among nodes within this GroupedSchedulerNode,
        and then unpack this GroupedSchedulerNode into regular nodes.
        )r  r   r  r  r  
fuse_nodes)rh   r  ri   ri   rj   unpack
  s   
zGroupedSchedulerNode.unpackfake_depr2   c                 C  s"   |  | j| | j| d S rt   )r  r   r  rV  r  )rh   r  ri   ri   rj   r  
  s   z!GroupedSchedulerNode.add_fake_depr   c                 C  r  )Nr  c                 S  r  ri   r`  r  ri   ri   rj   r   
  r  z1GroupedSchedulerNode.get_name.<locals>.<listcomp>r  rg   ri   ri   rj   r  
  r  zGroupedSchedulerNode.get_namec                 C  r  r   r  rg   ri   ri   rj   r  
  r:  z#GroupedSchedulerNode.get_first_namerD  c                 C  r  )Nc                 S  r  ri   r  r  ri   ri   rj   r   
  r  z9GroupedSchedulerNode.get_buffer_names.<locals>.<listcomp>r  rg   ri   ri   rj   r  
  r  z%GroupedSchedulerNode.get_buffer_namesrN  c                 C  r  rt   r  r  ri   ri   rj   rd  
  r  z GroupedSchedulerNode.get_outputsr\  c                 C  rz  )Nc                 s  r{  rt   r|  r  ri   ri   rj   r   
  r}  z6GroupedSchedulerNode.estimate_flops.<locals>.<genexpr>r   r~  r  ri   ri   rj   re  
  r  z#GroupedSchedulerNode.estimate_flopsr  c                 C  r  rt   rC  rg   ri   ri   rj   r   
  r  zGroupedSchedulerNode.get_nodesr9  c                 C  s   | j r
| j d  S d S r   )r   r   rg   ri   ri   rj   r   
  rA  zGroupedSchedulerNode.get_devicer  r^   r  c                 C  r  r  ri   )rn   r  r  ri   ri   rj   r   
  r  zGroupedSchedulerNode.can_fuse)r   r  r}   rw  )F)r  r   r   r  r  rl   r}   r  r  )r  r2   r}   r  r;  r  r  r  r  r@  r  )ru   rv   rw   r   rx   ry   r  r\  r  r  rH   r  r  r  rd  re  r   r   r   r  ri   ri   r  rj   rw  
  s*   
 	





rw  ri   stride_lengthslist[list[int]]r#  rC  priority_idxr   	list[int]c                   sb   t jd fdd}ttttd }t|dkr&fdd	|D tjr/|j|d
 |S )z
    A heuristic to decide loop iteration orders.  This has not been well
    tuned and may be something we should autotune.
    r  r   br}   c                   s     dks dkrt   dk dkS  fddD }fddD }tdd t||D }tdd t||D }||krIdS ||krOdS t  S )	Nr   c                      g | ]}t |  qS ri   absr   sl)r  ri   rj   r   
  r  z6pick_loop_order.<locals>.index_cmp.<locals>.<listcomp>c                   r  ri   r  r   )r  ri   rj   r     r  c                 s  s$    | ]\}}|d kp||k V  qdS r   Nri   r   sl_asl_bri   ri   rj   r         
z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>c                 s  s$    | ]\}}|d kp||k V  qdS r"  ri   r#  ri   ri   rj   r     r&  r   )rI   r   rH  )r  r  stride_len_astride_len_ba_firstb_firstr#  r  )r  r  rj   	index_cmp  s   
z"pick_loop_order.<locals>.index_cmpr   c                      g | ]} | qS ri   ri   )r   pi)r  ri   rj   r      r  z#pick_loop_order.<locals>.<listcomp>r  N)r  r   r  r   r}   r   )		functools
cmp_to_keyr   r   r'  r   r'   pick_loop_orderssort)r  r#  r  r,  orderri   r+  rj   pick_loop_order
  s   
r4  	orig_nodeir.MultiTemplateBufferr  ir.OperationBufferc                 C  s   |  }|   }t|trt|tsJ | }|  }t|tr&t|ts(J tjj|= ||_tjj|= ||_	tjj
| }tjj
| |tjj
|< |tjj|< tjj| }tjj| |tjj|< |tjj|< d S rt   )r  r   r   r  rY   r   r  r   
name_to_opoperation_namebuffersr   r  
operations)r5  r  replaced_buf_nameorig_buf_namereplaced_op_nameorig_op_nameorigri   ri   rj   _replace_operation_buffer&  s$   

rA  rf  c                 C  s4   |   }| }|| }|| }|d|  }|| S r  )r7  r6  )r{   r|   epilogue_runtimetemplate_write_bytesepilogue_read_bytesextra_bytesextra_bytes_ratioextra_memory_ratiori   ri   rj    _estimate_fused_epilogue_runtimeB  s   rH  unfused_n_regsr   fused_n_regsfused_n_spills	num_warpsdevice_propsrB   tuple[int, int]c                 C  s\   |dkrdS |j }|d u rdS |sJ ||jpd }| | }|| }|| }	|| }
|	|
fS )Nr  )r   r   )r   r       )regs_per_multiprocessor	warp_size)rI  rJ  rK  rL  rM  regs_per_smthreads_per_blockregs_per_block_unfusedregs_per_block_fusedblocks_unfusedblocks_fusedri   ri   rj   "_occupancy_before_and_after_fusionO  s   rX  ms1ms2rl   c                 C  sP   d}d}t |||||\}	}
|d|  ko|
dk}|
dko'|
|kp'|
|	 |kp'|S )zE
    Determine whether to fuse an epilogue into a GEMM template.
    r  g      ?r   r   r   )rX  )rY  rZ  rI  rJ  rK  rL  rM  MIN_ACCEPTED_OCCUPANCYREGRESSED_OCCUPANCY_RATIOrV  rW  ,epilogue_dominated_with_sufficient_occupancyri   ri   rj   _fuse_epiloguej  s   

r^  c                   @  sV   e Zd ZU ded< dZded< dZded< dd	d
ZdddZdddZdddZ	dS )r  BaseSchedulerNode | OutputNoder   Frl   r  is_weakr}   r   c                 C  s   t | j | j| jfS rt   )r  r   r  r  r`  rg   ri   ri   rj   r    rA  zNodeUser.__hash__r  objectc                 C  s2   t |to|  | ko| j|jko| j|jkS rt   )r   r  r  r  r`  r  ri   ri   rj   __eq__  s   


zNodeUser.__eq__r   c                 C  r  rt   r  rg   ri   ri   rj   r    r  zNodeUser.get_namec                 C  s.   | j |j u sJ t| j | jo|j| jo|jS rt   )r   r  r  r`  r  ri   ri   rj   r4    s   

zNodeUser.mergeNr<  )r  ra  r}   rl   r;  )r  r  r}   r  )
ru   rv   rw   rx   r  r`  r  rb  r  r4  ri   ri   ri   rj   r    s   
 


r  c                   C  s   t jS rt   )r'   r  ri   ri   ri   rj   *used_non_deterministic_runtime_estimations  r  rc  	ir.IRNodeOrderedSet[sympy.Symbol]c                 C  sx   t  }|  }t|tjr/|t|jt|jB t|j	B  t|tj
r-|t|j |S |du s:J d| |S )z=Get free symbols from a node's layout (size, stride, offset).Nz*Expect layout to be None but found layout=)r   maybe_get_layoutr   r*   Layoutr  r!   r   strideoffsetr  get_layout_symintsr$  )r   free_symbol_usesr  ri   ri   rj   rj    s   rj  c                 C  sX   t | trt jdd | jD  S | jdusJ | j }|jdd | j D   |S )z
    Gets symbols used in a scheduler node, including free symbols from
    the node's operations and layout symints from outputs.
    c                 s      | ]}t |V  qd S rt   get_scheduler_node_symbol_uses)r   r  ri   ri   rj   r     r  z1get_scheduler_node_symbol_uses.<locals>.<genexpr>Nc                 s  rl  rt   )rj  )r   ir_noderi   ri   rj   r     r  )	r   r   r   rq  r   r   get_free_symbol_usesr  rd  )r   rk  ri   ri   rj   rn    s   

rn  r  c                 C  &   |   }|dur|jdur|jS tjS z4Check per-template flag, fall back to global config.N)r  allow_epilogue_fusionr'   epilogue_fusionr  tbri   ri   rj   _is_epilogue_fusion_enabled     rw  c                 C  rq  rr  )r  allow_prologue_fusionr'   prologue_fusionru  ri   ri   rj   _is_prologue_fusion_enabled  rx  r{  r{   r|   c                 C  s   |   o|   ot| S rt   )r  rw  r   ri   ri   rj   is_epilogue_fusion  
   r|  c                 C  s   |  o|    ot|S rt   )r  r{  r   ri   ri   rj   is_prologue_fusion  r}  r~  c                 C  s   t | |p	t| |S rt   )r|  r~  r   ri   ri   rj   is_template_fusion  s   r  c                 C  s   t | |r|S | S rt   r|  r   ri   ri   rj   template_fusion_pw_node  r>  r  c                      s  e Zd ZdZd9ddZd9 fdd	Zd:ddZd;ddZd<ddZd=ddZ	d>ddZ
ed?ddZejd@ddZd;d d!ZdAd#d$ZdBd&d'Zd;d(d)Zd;d*d+Zd;d,d-Zd;d.d/ZdCd2d3ZdDd5d6Zd;d7d8ZdEd:d;ZdFd=d>Zd;d?d@Zd;dAdBZd;dCdDZdDdEdFZd;dGdHZdGdKdLZ	MdHdIdQdRZ dJdVdWZ!dKdZd[Z"d;d\d]Z#dLdbdcZ$dMdedfZ%	MdHdNdhdiZ&dOdmdnZ'dPdodpZ(dQdsdtZ)dRdwdxZ*dSd{d|Z+dTddZ,dUddZ-dVddZ.dWddZ/dHdXddZ0dYddZ1dZddZ2d[ddZ3d[ddZ4d\ddZ5d[ddZ6d]ddZ7d^ddZ8d^ddZ9d_ddZ:d`ddZ;daddZ<		dbdcddZ=d[ddZ>ddddZ?deddZ@dfddƄZAdgdhddʄZB			didjdd΄ZCd[ddЄZDd^dd҄ZEdkddԄZFdldd؄ZGd;ddڄZHd;dd܄ZId;ddބZJdmddZKdnddZLdoddZMdpddZNdqddZOdrddZPeQdsddZRdrddZSdtddZTduddZUdvdd ZVdwddZWdxd	d
ZXdDddZYdDddZZdDddZ[dyddZ\dzddZ]dpddZ^d;ddZ_d{ddZ`d|ddZadzdd Zbd;d!d"ZcdYd#d$Zdd}d&d'Zed~d(d)Zfdd+d,Zgd;d-d.Zhedd/d0Ziedd1d2Zjdpd3d4Zkd;d5d6Zldpd7d8Zm  ZnS (  r   z
    A Scheduler is a graph of BaseSchedulerNodes. It is responsible for
    optimizations such as fusion, reorder, and graph partition.
    r  list[ir.Operation]r}   r  c                 C  s8   t d | | W d    d S 1 sw   Y  d S )NScheduler.__init__)r   _initrh   r  ri   ri   rj   r\    s   
"r  c                   s  t     tj_i  _tt _t	
  _t  _tg tjj tjj tjj  _ fdd|D  _d  _d  _    jtjj   jD ]}|  qRd  _   _dd  jD  _dd  jD  _ j  _i  _ i  _!t  _"t#$ j j j _ %   & j _ '  dd  jD  _ (   )  t* j+t, j7  _+ddl-m.}m/} | j t, j _0 1   & j _tt2t3t3f    _4t5j6d urt56 j _t5j7rd	d
l8m9} |:   (  i  _;i  _<d _=i  _> ?   @ j _t5jAd ur"t5A j _tBdd  jD r1 '   C   D  t5jEsAt5jFrLtG rLtHjIjJjKL  t5jMrntNdddd  jOd d W d    n	1 siw   Y   P  t5jQrd	dlRmQ} | j j jttjj ttjS  _t5jTst5jUrt5jQsd	dlRmV} | j j tW rtXjYrt5jZstXj[rd} jD ]}t\|j]rd} nq|rd	dl#m^}	 |	 j tXj_rddl`ma}
 |
ddd  fddd t#b j _ c  t5jdrt5jejfrt5jejgr h j _ i j _ j  tHjIj5jkjlr* m  | j tjno j  p  t  _qi  _rd  _sttdu fdd t  _vd S )Nc                   s   g | ]}  |qS ri   )create_scheduler_noder  rg   ri   rj   r     rc  z#Scheduler._init.<locals>.<listcomp>c                 S  r_  ri   r`  r  ri   ri   rj   rb  &  rv  z#Scheduler._init.<locals>.<dictcomp>c                 S  s$   i | ]}|  D ]}| |qqS ri   )rd  r  )r   r   r   ri   ri   rj   rb  *  s
    
c                 S  r_  ri   r`  r  ri   ri   rj   rb  N  rc  r   )log_ir_post_fusionlog_ir_pre_fusionr   )distributed_autotuneFc                 s      | ]}t |tV  qd S rt   )r   r  r  ri   ri   rj   r   q  s
    
z"Scheduler._init.<locals>.<genexpr>#Scheduler.create_combo_kernel_nodesTlog_pt2_compile_eventlog_waitcounter)num_ck_nodes)reorder_for_peak_memory)1assign_memory_planning_info_for_scheduler_buffers)6align_runtime_estimations_across_all_distributed_ranks)trace_structuredartifactc                   S  s
   dddS )N#scheduler_nodes_before_comm_overlapstring)r   encodingri   ri   ri   ri   rj   rY    s   z!Scheduler._init.<locals>.<lambda>c                     s   d dd t jD S )Nz

c                 S  s2   g | ]\}}d | d|   d|   qS )zsnode[r  z buffer_names:)r  r  r  ri   ri   rj   r     s    
z5Scheduler._init.<locals>.<lambda>.<locals>.<listcomp>)r9  r  r  ri   rg   ri   rj   rY    s
    )metadata_fn
payload_fngraph_statsc                     s    j  jt jdS )N)graph_idnum_nodes_before_fusionnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesr   r  ri   rg   ri   rj   rY    s   )wr  r\  rY   r   r  backendsr  _post_grad_graph_counterr  r  count_graph_partition_counterr   r  rX  r   	constantstorchbind_constantsr  r  previous_nodecurrent_nodeupdate_zero_dim_cpu_tensorr  r  default_device_contextget_donated_buffersr)  r  r*  copyr  r  rL  seen_template_fusionsr&   decide_global_ordering_of_commsr\   topological_sort_scheduledead_node_eliminationcompute_ancestorscompute_input_distancesr+   ir_nodes_pre_fusionr   torch._inductor.debugr  r  r  create_foreach_nodesr   r   logged_slow_fusionr'   _pre_fusion_custom_passdistributed_max_autotune_gemmr  r  scheduler  buff_to_stream_multi_stream_nodesstream_idx_to_user_obj_idx_populate_stream_assignmentsr  _post_fusion_custom_passr   r3  finalize_multi_template_buffersmax_autotune_gemmmax_autotuner   r  r  select_algorithmPrecompileThreadPoolshutdown_instancecombo_kernelsr   create_combo_kernel_nodes_enforce_conditional_orderingr  memoryget_output_namesdeterministic reorder_for_compute_comm_overlapr  rc  r(   6runtime_estimations_align_across_all_distributed_ranksr  rl  rR   r   r  reorder_sink_verbose_loggingtorch._loggingr  $reorder_compute_and_comm_for_overlapprocess_grouped_nodesgraph_partitionr   r]   %reorder_for_reducing_graph_partitions&maybe_reorder_for_minimizing_partition,reorder_for_partition_with_simple_dependencycompute_last_usagetest_configstrack_memory_lifecycleinsert_memory_check_nodesr  graph_diagramdebug_draw_graphbuffer_names_to_freeorigin_to_index_current_stream_ctxr   add_rowremoved_ops)rh   r  r   r  r  r  r  r  has_collectivesr  r  r  rg   rj   r    s0  












	






zScheduler._init!dict[str, SchedulerDonatedBuffer]c                 C  sD   i }t jjD ]}tt jj| tjrt| t jj| d d||< q|S )N)r  )rY   r   graph_inputs_originalr   r*   DonatedBufferrC  )rh   name_to_donated_bufr   ri   ri   rj   r    s   

zScheduler.get_donated_buffersc           
        s4  ddl m  i }td}| jD ]8} }|jdur7|j }|dur7||vr3t|}|||< || j|< || }|| j	|< |
 D ]}|| j|< q@qt fdd| j	 D rtdd | jD d}|dur| jD ]}|j}	| du rt|	tjrt|	jtjrtj|d|	_qit fdd| j	 D | _dS )	a=  Populate node_to_stream and buff_to_stream from IR node stream_idx.

        Reads the stream_idx field set on IR nodes during lowering to determine
        which stream each scheduler node should run on. This field is propagated
        from 'custom.stream' FX node metadata via IRNode.current_stream_idx().
        r   DEFAULT_STREAM_IDXNc                 3      | ]}| kV  qd S rt   ri   r   r;  r  ri   rj   r   '  r  z9Scheduler._populate_stream_assignments.<locals>.<genexpr>c                 s  s$    | ]}|  d ur|  V  qd S rt   r   r  ri   ri   rj   r   )  s   " r  c                 3  r  rt   ri   )r   
stream_idxr  ri   rj   r   7  
    
)stream_constantsr  r  r  r  r   get_stream_idxr  r  r  r  r  r   r   r   r   r*   Bufferr  r>   r  )
rh   user_obj_to_stream_idxstream_idx_counterr   r  user_obj_idxnew_stream_idxr   r  ro  ri   r  rj   r    sD   







z&Scheduler._populate_stream_assignmentsrl   c                 C  r  )z7Check if any nodes are assigned to non-default streams.)r  rg   ri   ri   rj   _has_multi_stream_nodes<     z!Scheduler._has_multi_stream_nodesr  r   r   c                 C  s&   | j ||}| j|| j|dS )zAReturn the stream index for a buffer, resolving mutation renames.r   )rL  r  r  )rh   r  realri   ri   rj   get_buf_stream@  s   zScheduler.get_buf_streamr   r^   c                 C  s$   |   sdS | || j|dkS )zTrue if buf_name was produced on a different stream than node.

        Resolves mutation renames so that mutated buffers inherit the
        stream of their original definition.
        Fr   )r  r  r  r  )rh   r  r   ri   ri   rj   r  E  s   z!Scheduler.has_cross_stream_hazardr9  c                 C  s   t jjS rt   rY   r   current_devicerg   ri   ri   rj   r  O     zScheduler.current_devicer  c                 C  s   |t j_d S rt   r  r  ri   ri   rj   r  S  r  c                 C  s4   t jdddkrddlm} || jdd dS dS )z,Generate an image of the graph for debuggingINDUCTOR_WRITE_SCHEDULER_GRAPHN1r   )draw_buffersT)print_graph)osenvironr  r  r  r  )rh   r  ri   ri   rj   r  W  s   zScheduler.debug_draw_graphlabelc                 C  s4   t tjrt d| | jD ]}|  qd S d S )Nz%s:)r|  isEnabledForloggingINFOr  r  r  )rh   r   r   ri   ri   rj   debug_print_nodes^  s   

zScheduler.debug_print_nodesr]  c                 C  s`   |  d us
J d| rt| |S t|tjtjfr!t| |S t|tjr,t	| |S t
|)Nz2All nodes passed to scheduling must have an origin)r#  is_no_opr  r   r*   r   r  r   r  rQ  r  re  ri   ri   rj   r  d  s   


zScheduler.create_scheduler_nodec                   s   t  g }j  tjj D ]9} fdd|D }|sq| fdd|D }tj	dk}t
|d|d}|| |D ]}|j|< qAqfddjD t| _d S )Nc                   s(   g | ]}| v rt j| ts|qS ri   )r   r  r  r  )kept_node_namesrh   ri   rj   r   w  s    z2Scheduler.create_foreach_nodes.<locals>.<listcomp>c                   s   g | ]} j | qS ri   r  r  rg   ri   rj   r     rc  r   Fr  r  c                   s   g | ]
}|   vr|qS ri   r`  r  )removed_node_namesri   rj   r         )r   r  r   rY   r   listsr   r  r'   combo_kernels_autotuner  r   r  r   )rh   fe_nodesnamesr   r  fe_noder   ri   )r  r	  rh   rj   r  q  s6   





zScheduler.create_foreach_nodesc           $        s  G  fdddt t  t jD ]`}| D ]Y}| }t|jj	t
jr1t| dkr1q| D ]=}|v ra|v ra| }| }|| }D ]}| |u s[| |u r_||< qMq5|v rl| |< q5| |< q5qqd/fdd				d0d1fdd}	i }
tjj D ]1}t|tjr|jD ]}d|
|< qqt|t
jrdd | D }|D ]}|jD ]}d|
|< qqqd	}jD ]-}|jdusJ t|j dd d}|D ]}t|tjsJ d}||
vr| |
|< qqǈjD ]j}td|j |rK|jdusJ t|jjdddd d}|D ].}||
v s,J | d|
 |
|  }durIj|  D ]}|t|  q<qt|j j!dkrit"t#|j j! }rit|t$ri|j%}nd}| D ]p}t|& dks|J |& D ]]}|}|	|| |t||d | j'D ]B}| | krqt|jt(sJ |j D ]%}| }|}|| v }|t)|| | d |	||dd  qqqqotjj*|  D ]}|	||dd  |t)|| dd! qtjj+|  D ]}|	||d	d  |t| q|j j,D ]}t|t)s/|	|j-||.| q|/j0 | D ]'}|& D ]}| j0|< | j0|< j12||j1| < qAq;qtj3 D ]}td"| |	|t4t| qi|rtjj5D ]?}|jddD ]5}||
v sJ | d|
6  |
|  }rj| 7 D ]}td#|| |	|t4t| qqqj0D ],}|tjjv r|	|t4t| tjj89| q|tjj:v r|	|t4t| qd$d% t;tjj6 D fd&dtjj8D tj_<jD ]}| D ]}|=|  j' qqj>D ]}j>| =| j' q)t? } | @d' ' D ].\}}!| A  d(d |!j'D }"| @d)| d*|" d+ W d   n	1 slw   Y  qD| @d, | B C }#tDd- tDd.|# dS )2zi
        Create dependency edges between nodes, handling aliasing and
        mutation properly.
        c                      s8   e Zd ZdZ		ddd	d
ZdddZd fddZdS )z1Scheduler.compute_dependencies.<locals>.DedupListan  
            This data structure behaves like a list except it makes sure the
            elements remain unique.
            Normally one could use a OrderedSet/dict for this purpose however
            the list in question gets elements appended as it is being
            iterated over which means that we need to keep the list
            semantics.
            Nr  list[_T] | None
membershipOrderedSet[_T] | Noner}   r  c                 S  s   |pg | _ |p	t | _d S rt   )r  r   r  )rh   r  r  ri   ri   rj   r\    s   
z:Scheduler.compute_dependencies.<locals>.DedupList.__init__	node_userr`   c                 S  s*   || j v rd S | j| | j | d S rt   )r  r  r   r  )rh   r  ri   ri   rj   r     s   
z8Scheduler.compute_dependencies.<locals>.DedupList.appendr  DedupList[_T]c                   s4   t  j|j} j fdd|jD  }||S )Nc                   s   g | ]	}| j vr|qS ri   )r  r  rg   ri   rj   r     s    zMScheduler.compute_dependencies.<locals>.DedupList.__add__.<locals>.<listcomp>)r   rq  r  r  )rh   r  new_membership	new_items	DedupListrg   rj   __add__  s
   
z9Scheduler.compute_dependencies.<locals>.DedupList.__add__re  )r  r  r  r  r}   r  )r  r`   r}   r  )r  r  r}   r  )ru   rv   rw   r   r\  r   r  ri   r  ri   rj   r    s    
r  r   r  r   r}   c                   s   | j v r j |  S | S rt   rL  r  )r  rh   ri   rj   r    s   
z.Scheduler.compute_dependencies.<locals>.renameFused_by_namer  r_  r  rl   r`  r  c                   s    |   t||| d S rt   )r   r  )r  r  r  r`  )name_to_usersr  ri   rj   add_user  s   
z0Scheduler.compute_dependencies.<locals>.add_userNc                 S  s   g | ]
}t |tjr|qS ri   )r   r   r  r  ri   ri   rj   r     r   z2Scheduler.compute_dependencies.<locals>.<listcomp>c                 S  r  rt   r  r  ri   ri   rj   rY        z0Scheduler.compute_dependencies.<locals>.<lambda>r  Tzscheduling %s)unbacked_onlyc                 S  r  rt   r  r  ri   ri   rj   rY    r  z not in )r_  mutating_bufr  )r`  )r  zscheduling output %sz+scheduling output %s for unbacked symint %sc                 S     i | ]\}}||qS ri   ri   )r   r   r   ri   ri   rj   rb    rv  z2Scheduler.compute_dependencies.<locals>.<dictcomp>c                   r-  ri   ri   r  )	inp_namesri   rj   r         r  c                 S  r  ri   r`  )r   r  ri   ri   rj   r     r  'z': r  r   zBUFFER USER LIST
z===== AFTER SCHEDULING =====
%s)r  r   r}   r   )FF)
r  r   r  r_  r  rl   r`  rl   r}   r  )Er   r`   rW  r   r  rd  r  r   r   r  r*   r>   r   r  rY   r   rX  r   r   r  r!   	TensorBoxr  r  get_unbacked_symbol_defsSymbolr|  r  rp  r  r  r4   r   ry  r  r[  r3   r_  r  r  r^   r5   additional_buffer_depsadditional_star_depsr   r   r  r  rL  r  r  r  r0  graph_outputsr   r  mutated_inputsr  r  r  mutated_input_idxsr5  r)  rQ   rw  r  r  r~  compute_dependencies_log)$rh   r   buf1	buf1_name	buf2_namelist1list2combinedr  r  unbacked_symbol_to_origin_nodevalfssym_sizer;  has_non_input_unbacked_defsunbacked_symbol_defsunbacked_symbol_usesr  r   r   	node_modealt_namer  out_buf
other_nameis_aliasadd_depr  r  r   r   logbufrj  r  r   ri   )r  r$  r  r  rh   rj   r\     s\   
	




	
	








zScheduler.compute_dependenciesc                   sF  ddl m}m}m}m} ttjj	 }| j
|}tjjjs&| j
 j ttj }| j
||\}}	}	dd tt j
D |D ]&}
|
jdkrR|
jdkrRqE|
j }|
j d | |
j d | qEddlm} |  d fdd}g }t j
D ]\}}|| ||||t j
d kd q| _
d S )Nr   )r  compute_memory_timelineFreeableInputBufferget_freeable_input_bufc                 S  s   g | ]}g g fqS ri   ri   )r   r  ri   ri   rj   r     r%  z7Scheduler.insert_memory_check_nodes.<locals>.<listcomp>r   )register_check_mem_opstep_idxr   is_final_steprl   r}   rQ  c                   sn   |  d }|  d }|||g}t jttddtjjjjg |dd d}d j	|  
  |_t |S )	Nr   r   r  r  c                 S  s   | |d |d |d dfS )Nr   r   r   )alivedeadrI  ri   )tensor_argsr  ri   ri   rj   rY    s   zWScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node.<locals>.<lambda>)r  r'  rL  nontensor_argsunflatten_args
mem_check_)r*   MemoryCheckKernelr>   r  r  r  _inductor_debugcheck_memory_stepdefaultr  r  r9  rQ  )rH  rI  expected_newly_aliveexpected_newly_deadrM  r   rh   step_allocs_deallocsri   rj   construct_mem_check_node  s   


zEScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node)rI  )rH  r   rI  rl   r}   rQ  )r  r  rD  rE  rF  r   rY   r   rX  r   r  r  r  r'   r  r*  r  r'  r   
size_alloc	size_freer  r  
start_stepr   end_step#torch._inductor.runtime.debug_utilsrG  r  )rh   r  rD  rE  rF  rX  name_to_freeable_input_bufr,  buf_info_listr  buf_infor  rG  rX  	new_nodesr  r   ri   rV  rj   r    sB   





z#Scheduler.insert_memory_check_nodesc                   s*  t jsdS g }t| jD ]uddd d} D ]$}t fd	d
|jD }|r;td|	  t
jj|	  qd}q  oE| }|sN| qtd	  t
jj	  jjD ]}|j| jv r| j|j j}fdd|D | j|j _qcqtt|| _| jD ]  qdS )z0
        Remove any nodes without users
        Nr  r  r}   rl   c                 S  s   | j p
|  tjjv S rt   )r`  r  rY   r   r  )r  ri   ri   rj   can_eliminate_user  r  z;Scheduler.dead_node_elimination.<locals>.can_eliminate_userFc                 3      | ]} |V  qd S rt   ri   r   u)rb  ri   rj   r     r  z2Scheduler.dead_node_elimination.<locals>.<genexpr>zremoved dead buffer: %sTzremoved dead operation: %sc                   s"   g | ]}|j    kr|qS ri   r  rd  r   ri   rj   r     s    z3Scheduler.dead_node_elimination.<locals>.<listcomp>)r  r  r}   rl   )r'   use_dcer   r  rd  r   r  r|  r  r  rY   r   r  r  r  r   r  r   r   r   r*  r   r  )rh   updated_nodesactive_buffersr   can_eliminater  r  ri   )rb  r   rj   r    s:   



zScheduler.dead_node_eliminationr_  
str | Nonec                 C  s   |duS )z:Check if store mode requires cross-thread synchronization.Nri   )rh   r_  ri   ri   rj   mode_requires_synchronization  r  z'Scheduler.mode_requires_synchronizationr  c                   s^   t t  t  g d fdd|D ]}| D ]}| |< qq|D ]}| q&S )	z?
        Ensure nodes is in topologically sorted order
        r  r^   r}   r  c                   sV   | vr) |  t| jdd dD ]}|j vrq |j  q|  d S d S )Nc                 S  r  rt   r  )dri   ri   rj   rY  /  r  zDScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>r  )r  r  rV  r   r   )r  r   r  r  seenvisitri   rj   ro  ,  s   

z2Scheduler.topological_sort_schedule.<locals>.visitN)r  r^   r}   r  )r   r^   rG  r  )rh   r  r   r   ri   rm  rj   r  "  s   



z#Scheduler.topological_sort_schedulec                 C  sl   dd | j D }tdt|D ]$}tt||  }tt||d   }|| t||dd qd S )Nc                 S  s   g | ]}t |jtjr|qS ri   )r   r   r*   Conditionalr  ri   ri   rj   r   >  s
    z;Scheduler._enforce_conditional_ordering.<locals>.<listcomp>r   Tr!  )r  r'  r   r  r[  r  r  r5   )rh   conditional_nodesr  r"  prev_bufri   ri   rj   r  =  s   z'Scheduler._enforce_conditional_orderingr  c                   st   t  }t|tttttfr|jD ]}||j	 qn
t
dt| d fdd|D }tt  fdd|D S )Nz+get_unmet_dep_nodes is not implemented for .c                 3  s    | ]
} j |  V  qd S rt   )r*  r
  r   rg   ri   rj   r   Z      z1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>c                 3  s    | ]} j | V  qd S rt   r  r  rg   ri   rj   r   [  r  )r   r   r   rQ  r  r   rw  rV  r  r   RuntimeErrorr   r   )rh   r  
unmet_depsr   unmet_dep_opsri   rg   rj   _get_unmet_dep_nodesH  s$   

zScheduler._get_unmet_dep_nodesr  c                 C  s   g }t | jd}i }| jD ]!}| |}t|||< |D ]}||g }|| |||< qqdd | D }|rf|| |D ]}	||	g D ]
}
||
  d8  < qJ||	 qBdd | D }|s;|rlJ d|S )zU
        Sort nodes by their topological order, return a list of node lists.
        r   c                 S     g | ]
\}}|d kr|qS r   ri   r   r  r  ri   ri   rj   r   l  r   z5Scheduler._topological_sort_nodes.<locals>.<listcomp>r   c                 S  rz  r{  ri   r|  ri   ri   rj   r   s  r   zTopological sort failed!)	rG  fromkeysr  ry  r   r  r   r  r  )rh   r3  r  childrenr   r  r   czero_deg_nodesr  r  ri   ri   rj   r  ]  s,   




z!Scheduler._topological_sort_nodesc                 C  s~   i }| j D ]'}t }|jD ]}| j|j  }|| ||| O }q||| < ||_qt	| j D ]
\}}||_
||_q2dS )z.
        Populate each node.ancestors
        N)r  r   rV  r*  r   r
  r  r  r   r  rH  rI  )rh   name_to_ancestorsr   r   r   dep_node_namer3  ri   ri   rj   r  w  s   


zScheduler.compute_ancestorsc                   s   i i  j D ]:}|jsd}d}nfdd|jD } fdd|jD }t|}t|}|| < | | < ||_||_qdS )z
        Populate each node's min/max_input_distance with the depth from graph
        inputs, measured as dependency hops before fusion. Nodes whose
        dependencies are all satisfied by graph inputs/constants have depth 0.
        r   c                   $   g | ]} j |j   d  qS r   r*  r   r
  r   )name_to_min_distancerh   ri   rj   r         z5Scheduler.compute_input_distances.<locals>.<listcomp>c                   r  r  r  r   )name_to_max_distancerh   ri   rj   r     r  N)r  rV  rL  rv  r  rF  rG  )rh   r   min_distmax_distdep_min_distsdep_max_distsri   )r  r  rh   rj   r    s&   
z!Scheduler.compute_input_distancesc                 C  sf   t jsd S | jD ](}t|ttfr| st jdkrq| D ]}t|tr*|	 r+q|
  qqd S )Nhalide)r'   r  r  r   r   r   rT   cpu_backendr   r  r3  )rh   r   r  ri   ri   rj   r3    s   


zScheduler.merge_loopsc                 C  s   t ddddR tdD ]6}t|}td|d | | j|dd}t|}td	|d || ||ks8|dkrBtd
|d   nqtjsItjrP| j|dd}|W  d   S 1 s\w   Y  dS )zB
        Combine eligible nodes into FusedSchedulerNodes.
        zScheduler.fused_nodesTr  r  z/===== attempting fusion (%d/10): %d nodes =====r   F)is_reorder_roundz=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====N)	r   r'  r   r  r  fuse_nodes_oncer'   r  loop_index_inversion_in_fusion)rh   r  r  old_lennew_lenri   ri   rj   r    s>   $zScheduler.fuse_nodesc                 C  s8   g }| j D ]}|t|tr| n|g q|| _ dS )zA
        Unpack GroupedSchedulerNode into regular nodes.
        N)r  r  r   rw  r  )rh   ra  r   ri   ri   rj   r    s   

zScheduler.process_grouped_nodesr  tuple[float, str]c                 C  sh   t |dksJ |d  }|| _| |}tdddd ||W  d   S 1 s-w   Y  dS )
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        r   benchmark_fused_nodesTcompile_time_autotune_time_us)r  dynamo_compile_column_usN)r   r   r  r
  r   r  )rh   r  r  r  ri   ri   rj   r    s   
$zScheduler.benchmark_fused_nodesNbenchmark_kernelhint_overrider\  c                 C  sh   t |dksJ |d  }|| _| |}td |j|||dW  d   S 1 s-w   Y  dS )r  r   generate_kernel_code_from_nodesr  N)r   r   r  r
  r   r  )rh   r  r  r  r  r  ri   ri   rj   r    s   


$z)Scheduler.generate_kernel_code_from_nodesmoduler   r  c                 C  sF   || _ | |}td ||W  d   S 1 sw   Y  dS )r  benchmark_codegened_moduleN)r  r
  r   r  )rh   r  r  r  ri   ri   rj   r    s
   

$z$Scheduler.benchmark_codegened_module
multi_noder6  c                 C  s   t jj}|sdS td|| |jD ]B}| }t|ddr(||vs(t|t	j
r)q|j}|| }t|t	jr?||j |j}t|t	jrT||krTtd|||  dS qdS )z
        Check if selecting a Triton template would cause layout conflicts.
        Returns True if there's a conflict and we should fall back to ATen.
        FzNode %s has constraints %sr  NzOLayout conflict detected for %s: template expects %s but layout is frozen to %sT)rY   r   buffer_layout_constraintsr|  r  r  r  rx  r   r*   ReinterpretViewr  FlexibleLayout freeze_layout_with_exact_stridesrh  FixedLayoutr}  )rh   r  constraintsinpinp_namer  expected_layoutri   ri   rj   !_has_layout_conflict_for_template"  s6   


z+Scheduler._has_layout_conflict_for_templatec              	   C  s  t | jD ]\}}t|trt|jtjr|j}tjj	s#|
 \}}ntdd | D }t|tjjjrZ| |rZ| D ]}t|tjjjrM|} nq?t|tjjjsZJ dt|tjjjrtjri }||d< tjD ]!}|j|d}	dd |	 D }
t|
 dd	 d
d }|||< qn|j| n|j| qtj|j | }W d   n1 sw   Y  |j}t|tjsJ |j}t|tjsJ |jrt ||j |j!|_!| "|||| qdS )a  
        Finalize a backing choice for MultiTemplateBuffers which did not already have a
        choice finalized through fusion. In the case of an extern choice, this will result
        in replacing the SchedulerNode.

        If a MultiTemplateBuffer did not have any fusion opportunities, finalizing a choice
        will force completion of compilation and benchmarking.
        c                 s  s$    | ]}t |tjjjr|V  qd S rt   )r   r  r  r  ExternKernelCaller)r   timingri   ri   rj   r   `  s    
z<Scheduler.finalize_multi_template_buffers.<locals>.<genexpr>zZNo extern kernel detected to fallback to when layout constraints fail for Triton templatesNr  c                 S  s    i | ]\}}t |tr||qS ri   r   r   )r   r  r  ri   ri   rj   rb    s    z=Scheduler.finalize_multi_template_buffers.<locals>.<dictcomp>c                 S     | d S r  ri   r  ri   ri   rj   rY    r  z;Scheduler.finalize_multi_template_buffers.<locals>.<lambda>r  r   )#r  r  r   r   r   r*   MultiTemplateBufferr'   r  %force_extern_kernel_in_multi_templateget_min_choicer  choice_timingsr  r  r   r  r  r  multi_kernel_hintsr  rL  finalize_as_triton_callersfinalize_as_triton_callerrk  current_originsr)  output_noder   
StorageBoxOperationBufferorigin_noder9   r  _replace_node)rh   r  r   r  min_node_unfusedr  choicecallershinttimingstriton_timingsout_tensorboxout_storage
out_bufferri   ri   rj   r  M  sx   





z)Scheduler.finalize_multi_template_buffersr  r7  r  r   c                   s   t || | |}|| j|< || j| < || j| < i  t|jj	|j
D ]}| j|jd  }r9|j |< q(d fdd}||j
|_
||jj	|j_	t| | D ]\}	}
|	| j|
 < |
j|	_qX|j|_|j|_|j|_|j|_d S )Nr  rU  r}   c                   s   t  fdd| D S )Nc                 3  s    | ]}|  V  qd S rt   )r  r   r  ri   rj   r     r  z?Scheduler._replace_node.<locals>.rename_deps.<locals>.<genexpr>r   )r  r  ri   rj   rename_deps  r  z,Scheduler._replace_node.<locals>.rename_deps)r  rU  r}   rU  )rA  r  r  r  r  r  r  r  r   r   rV  r  r  r   rH  rd  r*  r  rH  rI  r   rE  )rh   r  r  r  r   new_scheduler_noder   	real_namer  new_outold_outri   r  rj   r    s4   




zScheduler._replace_node	node_listc                 C  s   t dd |D S )Nc                 s  sB    | ]}t |jd o|jduot |jjdo|jjjdkV  qdS )r   Nscatter_moder`  )r&  r   r   r  r  ri   ri   rj   r     s    
z,Scheduler._any_atomic_add.<locals>.<genexpr>)r   )rh   r  ri   ri   rj   _any_atomic_add  s   zScheduler._any_atomic_add&tuple[LambdaFuture | None, ModuleType]c                 C  s^   | j |d|d}t|}tjj }| sd }||fS |jd|d}t	|t
s+J ||fS )NT)r  r  triton_)kernel_namesource_code)r  r   loadr  r  async_compileAsyncCompileuse_process_poolr   r   r   )rh   r  r  src_codemodr  futri   ri   rj   compile_kernel  s   
zScheduler.compile_kernelr{   r|   rb   c                   sb  t dd fD }tjs|stdS  r!t tj	r)
 s)
 r.tdS  }|d  s<J jdkrKtjdkrKtdS  }tt||}|ratdS ddlm  t|d  dusxJ d&fdd|rt dd fD r dur n ttjsJ rtdS i 
g tjD ]}|t dd dD ]1\}}	t|tjjjsqɈ | !|gj"||j#dR  W d   n1 sw   Y  qt$d}
d}i }D ]d\}}}z|dur|%  W n' t&y< } zt'(t)j*r1t'+ds-dnd| W Y d}~qd}~ww  | ,|\}}|||< ||
k rX|}
|}W d   n	1 scw   Y  q|j-|< t|t.swJ |
|< qtj/t0dd j1D }t2 o o|tj3k	t$dt$dd	s 4 \t t56dd}ndd j1D }rՈr͈7|n7|\}nstdS 8 t9ddl:m;} g d}|D ]m\}}t|t.sqst<|d r|j=j=krqr| kr nD|d7 }|tj3kr) n8 |( z!|g"|R  W n |yN   Y W d   qw W d   n	1 sZw   Y  qt>dkrmtdS d'	
fd"d#}t?|d d S "|"|"|d' fd$d#}tj?|d d%S )(
        If config.benchmark_fusion is False, always return True.
        Otherwise, return True if fusion can brings speedup.
        c                 s  s(    | ]}|  ot| tjV  qd S rt   )r  r   r  r*   r  r  ri   ri   rj   r     s    
z.Scheduler.speedup_by_fusion.<locals>.<genexpr>Tr   r  r   CompilationErrorNms_fusedrf  rY  rZ  r}   r  c              	     st   t tjr8| || k r"t d   t|| |  d d S t d   t| ||  d d S d S )Nz9can fuse (benchmark): fusing %s with %s cause %sx speedup.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdown)r  r  r  DEBUGr  r  rD   rE   )r  rY  rZ  r   ri   rj   
log_fusion  s   z/Scheduler.speedup_by_fusion.<locals>.log_fusionc                 s  s    | ]	}|  d uV  qd S rt   r  r  ri   ri   rj   r   )  s    
Fc                 S  r  r  ri   r  ri   ri   rj   rY  =  r  z-Scheduler.speedup_by_fusion.<locals>.<lambda>r  r  infException in compiling %s: %sr  r  c                 s  r  rt   r  r   r  ri   ri   rj   r   h  s    

r   c                 S  s   g | ]}|d fqS r{  ri   r  ri   ri   rj   r   ~  r  z/Scheduler.speedup_by_fusion.<locals>.<listcomp>)	CantSplitallowed_prologue_inpsrl   c                    s(  t d} d }i }r1rttjsJ   \	fddD tfdddD ]\}}}z|d urB| }n sL|j}|	  nd }W n# t
yr } zttjrhtdsddnd	| W Y d }~q3d }~ww  r| |\}}	|||< || k r|} |}W d    n1 sw   Y  q3|kp
	 |  k}
|r|
r|	  |jr|jsJ |jd
 }|j}|j}t	
|j|||jjt}|r|} nq3 r| 	
  r| 	
 k r|d urtjr|d <  n|  r|jd < dS dS )Nr  c                   s   g | ]
}|d   v r|qS r{  ri   )r   
fut_choicer  ri   rj   r     
    zMScheduler.speedup_by_fusion.<locals>.benchmark_when_ready.<locals>.<listcomp>c                   s    | d  S r   ri   r  r  ri   rj   rY    r  KScheduler.speedup_by_fusion.<locals>.benchmark_when_ready.<locals>.<lambda>r  r  r  r  r   TF)rf  r   r*   r  r  r  r  r  r  
precompiler{  r  r  r  r  r  swap_as_triton_callerr  	launchersn_regsn_spillsr^  bmreqrL  rB   r  r'   r  r  r  _choice_timings)min_ms_fusedms_fused_choicenew_timingsr  rf   	mod_fusedresr   r  pathfusible_choicecompiled_kernelrJ  rK  should_fuse_epilogue)bench_epiloguer  r  rt  future_choicesget_choice_timings_async hint_override_best_fusion_choicer  
min_choicerY  rZ  	ms2_fusedr  rh   ri   rj   benchmark_when_ready  s   




	
	


z9Scheduler.speedup_by_fusion.<locals>.benchmark_when_readyc               
     sp  ddl m}  zd 
d 	d fD ]
}|d ur|  qd \ t r3d W dS 
d \trId W dS 	d \tr_d W dS   tdr  krfjvrjf t	d
 fd	d
   k W S  | y   Y dS  y } zdt|v rW Y d }~dS  d }~ww )Nr   )NoTritonConfigsErrorr   z%register spilling of the first kernelFz&register spilling of the second kernelz%register spilling of the fused kernelslow_fusionc                	     s       dS )N)kernel1_pathkernel1_latencykernel2_pathkernel2_latencyfused_kernel_pathfused_kernel_latencyslow_down_ratiori   ri   rY  rZ  r  path1path2
path_fusedri   rj   rY  `  s   
r  Loop-carried variableT))torch._inductor.runtime.triton_heuristicsr  r  r  r  isinfr   r  r  r   r  r   )r  r  r   )r  r  future_and_mod_l1future_and_mod_l1_fusedfuture_and_mod_l2r  rh   r  r  rj   r  -  s`   


rq   )r  rf  rY  rf  rZ  rf  r}   r  r>  )@r   r'   benchmark_fusionrb   ro   r  r   r  r*   TritonTemplateBufferr  r   r   r   r  r   r  r  r  triton.compiler.errorsr  r  r  r  r  r  r  r  r  r  r  TritonTemplateCallerr  r   r  r  rf  r  r{  r  r  r  r  r  r  r  r   benchmark_epilogue_fusionr   choicesr    max_epilogue_benchmarked_choicesr  operator
itemgetterr  rg  rH  r  r  r&  r  r   rr   )rh   r{   r|   is_multi_templatenode_list_1node_list_2node_list_fusedr  r  r  r  r  r  rf   r  r   r  r  num_triton_callerschoice_timings_iterr
  r  triton_choicesunfused_timer  ri   )r  r  r  r  rt  r  r  r  r  r  r  r  r  rY  rZ  r  r  r{   r|   rh   r  rj   speedup_by_fusion  sB  





















(l


HzScheduler.speedup_by_fusionc                 C  s   | j |  S )z0Look up the node in Scheduler name_to_fused_node)r  r  re  ri   ri   rj   r  y  s   zScheduler.get_fused_noder  OrderedSet[BaseSchedulerNode]c                   s   t d| |  | }| |ksJ | ||| || || |  | j	 fdd 
 D  | j|}|d urM|| j <  S )Nzfusing %s with %sc                      i | ]}|   qS ri   r`  r  node3ri   rj   rb    rc  z,Scheduler.fuse_two_nodes.<locals>.<dictcomp>)r  r  r  r   r
  ro   r  r  r  r  r   r  r  )rh   r{   r|   r  r  stream1ri   r&  rj   fuse_two_nodes}  s   



zScheduler.fuse_two_nodes
speedup_fnrp   c                 C  s4   |  ||r| ||s| r| ||| dS dS NTF)r   will_fusion_create_cycler)  )rh   r{   r|   r*  r  ri   ri   rj   fuse_if_speedup  s   

zScheduler.fuse_if_speeduptemplate_fusion_candidates,dict[BaseSchedulerNode, list[PendingFusion]]c                 C  sj  |rg }i }t  }|D ]v}||v rt|| dksJ || d}t|| dkr/|| | \}}	|	|krCt||	s@J |}
n||ksIJ t||	sPJ |	}
| |
|
urZq|jrs|jj}|dusgJ |	| ||f||< q| 
||	|j|r|| qt|D ]}|| \}}| 
| |j| |j|j|r|| q|D ]}|| q|sdS dS )z
        Evaluate pending template fusions for a set of fusion candidate nodes.
        The fusion candidate nodes are pointwise nodes as potential epilogue
        or prologue fusions
        r   r   N)r   r   r  r  r   r|  r~  r  rf   r   r-  rd   r   r{   r|   )rh   r.  r  template_futuresfuture_to_pending_fusionfusions_to_remove	candidatepending_fusionr{   r|   r  fcandri   ri   rj   "_evaluate_pending_template_fusions  sV   






z,Scheduler._evaluate_pending_template_fusionspossible_fusion_pairs1list[tuple[BaseSchedulerNode, BaseSchedulerNode]]pending_fusions&dict[BaseSchedulerNode, PendingFusion]template_fusion_nodesr  c                   s  d	 fdd}|D ]\}}|||  |} |}t||r+||fjv r+q|||r||s||}	|	jd urt|	j|||	jd}
t||rx||fjvs[J j	||f t
||}||vrpg ||< || |
 n|
|< |
|< q|	jsq||  qd S )
Nr{   r^   r|   r}   r  c                   s    | v s |v rk |  |}|d us#J | \}}|j}|d  |d   ||u sAJ  ||u sJJ | rS| |rTq ||   | v s |v sd S d S rt   )r  r  r   rd   r  r,  r)  )r{   r|   r4  	node_key1	node_key2
is_speedupr  r:  rh   ri   rj   resolve_pending_fusions  s$   z<Scheduler._try_fusion_pairs.<locals>.resolve_pending_fusions)rd   r{   r|   rf   r  )r  r  r  r   r,  r#  rd   rz   rf   r  r  r   rc   r)  )rh   r8  r:  r<  r  r  rA  r{   r|   
fusion_resr4  template_pw_noderi   r@  rj   _try_fusion_pairs  sJ   






zScheduler._try_fusion_pairsc                 C  s|   t  }| D ]4}| \}}|j}||v st||rq|| | ||u s*J | ||u s3J | |||| qd S rt   )r   r   r   rd   r  r  r  r-  )rh   r  r:  seen_pair_speedup_fnr4  r=  r>  is_speedup_fnri   ri   rj   _finish_pending_fusions=  s   

z!Scheduler._finish_pending_fusionspossible_fusionsdeferred_prologue_fusionsc                 C  sX   t dd |D }g }|D ]\}}t||r"||v r"|||f q|||f q|S )Nc                 S  s   g | ]\}}t ||r|qS ri   r  )r   n1n2ri   ri   rj   r   ^  s    z6Scheduler._handle_template_overlap.<locals>.<listcomp>)r   r~  r   )rh   rH  rI  epilogue_template_nodesnew_possible_fusionsrJ  rK  ri   ri   rj   _handle_template_overlapU  s   z"Scheduler._handle_template_overlapc           	      C  s   |  | t|}ttjr!td |D ]
}td|  qi }i }g }| ||}t	j
s3t	jr9| ||}| ||||| | || | || |  |rc| ||||| | || t|dd d}| |}|S )a  
        Combine eligible nodes into FusedSchedulerNodes.

        This relies on two key functions to control the logic:
            - self.can_fuse(): checks if a fusion is legal
            - self.score_fusion(): assigns priority to a given fusion
        zfuse_nodes_once, candidates:z  %sc                 S  r  rt   rt  r  ri   ri   rj   rY    r  z+Scheduler.fuse_nodes_once.<locals>.<lambda>r  )r  r   r  r  r  r  r  r  get_possible_fusionsr'   r  r  rN  rD  rG  r7  clearr  r  )	rh   r  r  r  r   r:  r<  rI  rH  ri   ri   rj   r  i  sR   

	
zScheduler.fuse_nodes_oncer  c           
        s^  t | j}d}t| j}td| tt| D ]r\}}t|}t|dk r)q|dur3||kr3 nY| 	|s?td| q|d7 }t
jdk}t|d j|d|d td	t|| |D ]}|| q^|  | j fd
d  D  | j|d }	|	dur|	| j < qt|dd d| _| | j| _td||t| j | | j dS )z'
        Groups parallel nodes
        r   z2ComboKernels: Generating with num_ck_nodes = %s...r   Nz)ComboKernels: Not speeding up %d-th groupr   Tr  z0ComboKernels: Combining %d nodes for %d-th groupc                   r%  ri   r`  r  rn  ri   rj   rb    rc  z7Scheduler.create_combo_kernel_nodes.<locals>.<dictcomp>c                 S  r  rt   rt  r  ri   ri   rj   rY    r  z5Scheduler.create_combo_kernel_nodes.<locals>.<lambda>r  zDGenerated ComboKernel nodes: %d ComboKernels, totally %d -> %d nodes)r   r  r   r|  r  r  r  r  r  speedup_by_combo_kernelr'   r  r  r  r  r  r  r  r   r  r  r  r  r  )
rh   r  r  r  num_nodes_orignumr  r  r   streamri   rn  rj   r    s^   






r  c                 C  s   |D ]}| | j qd S rt   )r  r  )rh   r  r   ri   ri   rj   r    s   zScheduler.prune_redundant_depsc           
        s   g t tttf   d fdd}tt}|D ]}|r$q| D ]	}|| | q(q|	 D ]}|| q7t
jrett}|D ]}t|dd}	|	rY||	 | qH|	 D ]}|| q^jjd	d
 tdt S )z^
        Helper to find all legal fusion opportunities, sorted by self.score_fusion()
        r  r  r}   r  c                   s   t | D ]E\}}| |d |d tj  D ]3}||f}|v r q| || r2| q| s:| rH|| rH||f qqd S r  )r  r'   )max_fusion_buffer_group_pairwise_attemptsr  r   r   r  r  )r  node1_indexr{   r|   r  r  rH  rn  rh   ri   rj   check_all_pairs  s*   
z7Scheduler.get_possible_fusions.<locals>.check_all_pairsr   NT)r  reversezfound %d possible fusionsr  r  r}   r  )r   r   r^   rW  r   r   unfusable_noder   r   r   r'   aggressive_fusionrx  *get_possible_fusions_with_highest_priorityr2  score_fusion_keyr  r  r   )
rh   r  r  rX  buffer_names_groupingr   r   node_groupinggroup_groupingr   ri   rW  rj   rO    s6   




zScheduler.get_possible_fusionsc                   s   t t  d fdd| j | j B |jj |jj B   tfdd D }|rAt||d	 |S )z~
        Finds whether there's a path from node1 to node2 (or vice-versa)
        caused indirectly by other fusions.
        r   r^   r}   rl   c                   s^   t | tr-| vr-|  |   rdS t| j@ p,tfdd| j  D S dS )NFc                 3      | ]
} j | V  qd S rt   ru  r  
found_pathrh   ri   rj   r   :  
    
zIScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>)r   r   r  r   issubsetrl   r   r   r   combined_ancestorscombined_namesrd  rh   visitedri   rj   rd  )  s   

z6Scheduler.will_fusion_create_cycle.<locals>.found_pathc                 3  rb  rt   ru  r  rc  ri   rj   r   H  rt  z5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>zwill create cycleNr   )r   r   r   _dictr   r   r   r  )rh   r{   r|   cycleri   rg  rj   r,    s   
z"Scheduler.will_fusion_create_cyclec              	     s   ddl m  dfdd}||}||}t fd	d
|D }t fdd
|D }||}d}	|D ]}
z
|	t|
d 7 }	W q4 tyK   Y  dS w ||}tjj	
|	d| r^dS dS )a  
        Return true if fusing the two nodes can potentially increasing peak memory.

        The implementation is more like a heuristic since we don't really know if we are at peak
        or not when trying to fuse these two nodes. The order of nodes may change later which makes the
        peak memory estimation hard.

        Here is how we decide the LOWER BOUND of extra memory allocation if we fuse these 2 nodes:
        1. find all buffers read by each node with a single user. These buffers are supposed to
           be reused if we don't fuses these 2 nodes
        2. find the intersection of these buffers for the two node and sum the total buffer size.
           If we don't fuse these two nodes, we can at lease avoid this much memory allocation.
           Note that the extra memory allocation is not necessarily causing peak memory increase.
           This is just a heuristic.

        We return true only if the saving for fusion can not trade off the extra memory allocation.
        r   buffer_reuse_keyr   r^   r}   list[ir.Buffer]c                   sL   g }| j jD ]} j|j}|r#t|jdkr#|j r#|	|j q|S r  )
r   r   r*  r  r   r   r  r   has_tensor_outputr   )r   r^  r  r   rg   ri   rj   _find_single_user_inputsd  s   zKScheduler.can_fusion_increase_peak_memory.<locals>._find_single_user_inputsc                 3  rc  rt   ri   ra  rm  ri   rj   r   r  r  z<Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>c                 3  rc  rt   ri   ra  rm  ri   rj   r   s  r  r   r   FrO  TN)r   r^   r}   ro  )r  rn  r   intersectionr   rq  r  rY   r   r   statically_known_gt)rh   r{   r|   rq  lhs_dep_nodesrhs_dep_nodeslhs_reuse_keysrhs_reuse_keyscommon_reuse_keysmemory_overheadr  	bw_savingri   )rn  rh   rj   can_fusion_increase_peak_memoryM  s$   
z)Scheduler.can_fusion_increase_peak_memory	thresholdc                 C  s  t dd | D dd | D  }t dd |jjD }t dd |jjD }||@ }t  }|jjD ]}	| |	j|rD||	j q5t dd |jjD t dd |jjD B }
t d	d |jjD t d
d |jjD B }|
| }|| }||B }t||kS )Nc                 S  r  ri   r`  r  ri   ri   rj   r     r  zFScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<listcomp>c                 S  r  ri   r`  r  ri   ri   rj   r     r  c                 s  r  rt   r  r   ri   ri   rj   r     r  zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>c                 s  r  rt   r  r   ri   ri   rj   r     r  c                 s  r  rt   r  r   ri   ri   rj   r     ru  c                 s  r  rt   r  r   ri   ri   rj   r     r  c                 s  r  rt   r  r   ri   ri   rj   r     ru  c                 s  r  rt   r  r   ri   ri   rj   r     r  )	r   r   r   ry  r   $can_buffer_be_removed_through_fusionr   r  r   )rh   r{   r|   r|  fused_node_namesnode1_write_namesnode2_read_namesreads_removed_through_fusionwrites_removed_through_fusionr\  all_read_namesall_write_namesunique_readsunique_writesunique_io_buffersri   ri   rj   (fusion_prevent_too_many_reads_and_writes  s:   z2Scheduler.fusion_prevent_too_many_reads_and_writesc                 C  s*   t t|j|j t|j|j }|dkS )aA  
        This function prevents fusion for nodes that can increase memory
        footprint. This problem is more common in horizontal fusion, where nodes
        that are far apart in the original order get fused, lengthening the live
        intervals of tensors. This is very evident in models with activation
        checkpointing, where the recomputed nodes from different checkpointed
        regions get fused and significantly increase the memory footprint.

        The current attempt is a quick, possibly hacky, heuristic to prevent the
        fusion of nodes that are far away in the original order.

        A better but difficult to implement heuristic would be to use live
        intervals of the buffers, find region of peak pressure in the original
        program and prevent fusion that crosses that peak region. We might need
        special care or good approximation in this implementation, as fusion of
        node changes live intervals, and re-computing live intervals and peak
        memory after each fusion can introduce large compilation overhead.
        @   )rv  r  rH  rI  )rh   r{   r|   proximity_scoreri   ri   rj   are_long_distant_nodes  s
   z Scheduler.are_long_distant_nodescommon_buf_names!tuple[str, ...] | OrderedSet[str]c                 C  sb  i }dd |j  D }dd |j  D }|D ]}tj|}|| }	|| }
t|	tr2t|
tsAdt|	 dt|
 ||< q|	 |
 krXd|	  d|
  ||< qt	|	j
t	|
j
krgd||< q|	 }|
 }||kr~d| d| ||< q|	 |
 krd	|	 d|
 ||< qd
}t|tjsd|j }d|	 d|
 d| ||< qt|S )z}
        Try to decide reasons why fusion fail due to no shared memory even though
        there are common buffers.
        c                 S     i | ]}|j |qS ri   r  r   ri   ri   rj   rb    r  z7Scheduler.decide_fusion_fail_reason.<locals>.<dictcomp>c                 S  r  ri   r  r   ri   ri   rj   rb    r  znot MemoryDep: r   zdifferent numel: 	broadcastzdifferent offset: zMismatch loop orders: r  zLayout: zUnknown reason: z. )r   r  rY   r   rG  r   r3   r   r   rX   r   
get_offsetnormalize_with_stride_orderr*   rJ  r  r   )rh   r{   r|   r  reasonsnode1_name2depnode2_name2depr  r   lhs_deprhs_deplhs_offrhs_off
layout_strri   ri   rj   decide_fusion_fail_reason  sD   
z#Scheduler.decide_fusion_fail_reasonc                 C  s  t jsdS tdd ||fD rdS |j }|j }||@ }|s$dS tdd |jD }|| r4dS t|dkr<dS t|jjdksLt|jj	dkrNdS t
t|jj}t
t|jj	}t|trht|tsjdS dd |jj	D }	|j|	vrzdS |	|j }
t|
tsdS |
 }
|
j|jkr|
j|jkrdS |j|jkst|jdkrdS t|jjdkrdS |jjrdS d	|jjv rd
|jjv sJ tdd |j D }t|dkrdS t
t|}||jjd	 krd	}d
}n||jjd
 ksJ d
}d	}ddlm} |jjd }t|dkrdS g }tj|D ]}|tjj !| qt"|}|||d }|du r7dS |jj| |jj|< ||jj|< |#dd | $||}t|t%s[J t&'d| |S )aW  
        Attempts to enable fusion between two nodes by inverting indexing patterns.

        This optimization targets cases where node1 has a contiguous write and
        node2 has a contiguous write but discontiguous read. By inverting the
        indexing in node2's read and write operations, we can make them compatible
        with node1 for potential fusion.

        Args:
            node1: First scheduler node (source)
            node2: Second scheduler node (target for inversion)

        Returns:
            int: Fusion score if successful, 0 if optimization not applicable
        r   c                 s  r  rt   r  r  ri   ri   rj   r   %  r  zAScheduler.shared_data_after_inverting_indexing.<locals>.<genexpr>c                 s  r  rt   r  r   ri   ri   rj   r   1  ru  r   c                 S  r  ri   r  r   ri   ri   rj   rb  F  r  zBScheduler.shared_data_after_inverting_indexing.<locals>.<dictcomp>r   index0index1c                 s  s    | ]}|V  qd S rt   ri   )r   ra  ri   ri   rj   r   l  rE  r   )generate_inverse_formulaNTFz!Shared memory after inversion: %d)(r'   r  r   r   buffer_namesr   rV  r   r   ry  r  r[  r   r3   r   r  r   r   	var_namesr   r   	subblocksget_read_exprs$torch._inductor.invert_expr_analysisr  varsr   Add	make_argsr   rY   r   r   combine_modular_indexing_pairsr   r  r  r   r  r  )rh   r{   r|   node1_buffer_namesnode2_buffer_namescommon_buffer_namesnode2_unmet_dependencies
node2_readnode2_writenode1_writesnode1_writenode2_read_exprs	read_exprread_expr_indexwrite_expr_indexr  rB  simplified_termstermsimplified_read_exprinverse_formulascoreri   ri   rj   $shared_data_after_inverting_indexing  s   

 



z.Scheduler.shared_data_after_inverting_indexingc                 C  s  t jrtdd ||fD rdS | s| rdS |j }|j }||@ }|s,dS dd |j D }dd |j D }g }|D ]#}	||	 }
||	 }|
 | krg|t	j
jj|
 dd|
|f qDt|dkrpdS t|tdd	\}}
}t|
trt|tsdS |
j|jkr|
 | kr| |
S dS d
}| s||
|}n| s|||
}ntd| |  |rtt| ||S dS )a  
        Right now just greedily reorder the loop of node1 to be compatible with node2,
        but ideally we should have some heuristics to reorder the loop for node2
        to be compatible with node1 if that's more efficient.

        Return the amount of shared data re-computed in this method.
        If no such recomputation happens, return -1 (not return 0 since 0 is a valid
        amount of shared data).

        c                 s  r  rt   r  r  ri   ri   rj   r         
z>Scheduler.shared_data_after_reordering_loop.<locals>.<genexpr>r   c                 S  r  ri   r  r   ri   ri   rj   rb    r  z?Scheduler.shared_data_after_reordering_loop.<locals>.<dictcomp>c                 S  r  ri   r  r   ri   ri   rj   rb    r  r   r   r  Fz?Don't reorder loops since both nodes are reductions: %s v.s. %s) r'   r  r   r  r   r  r  r  r   rY   r   r   r   r   r   rv  r  r  r   r3   r4  r  dep_size_hintr   r  r7  r  r  r  r  r   r  )rh   r{   r|   r  r  r  r  r  
candidatesr   r  r  _numel	reorderedri   ri   rj   !shared_data_after_reordering_loop  sj   



z+Scheduler.shared_data_after_reordering_loopc                 C  s^   t |tr|  ot|j S t |tr-t |jtjr"|j  S |  o,t|j S dS )z>
        Is this node unfusable under any conditions.
        F)	r   r  r  rV   r   rQ  r*   r  r  re  ri   ri   rj   r[    s   

zScheduler.unfusable_nodeprologue_noder  r  r  c           
      C  s   |  tjjkr
dS | }| }d}||| kr |d dS tdd | D }|tj	j
jjfkr:|d dS ddd}| }	|	 sV||	jrV| sV|d dS dS )zT
        Heuristics to avoid benchmarking predictably slow prologue fusions
        T皙?z@prologue fusion will not increase amount of bytes read in kernelFc                 s  s:    | ]}|j d ur|j  D ]}|jdkr|jV  qqd S )Ncall_function)r   r#  r	  r$  r   r  r   ri   ri   rj   r   +  s    

zEScheduler.check_prologue_fusion_heuristics_fusable.<locals>.<genexpr>z\prologue fusion will not increase attempt to fuse in padding bc it increases unaligned readsr|  torch.dtyper}   rl   c                 S  s   | j dko| jS )Nr   )itemsizeis_floating_point)r|  ri   ri   rj   low_prec_fp8  r  zGScheduler.check_prologue_fusion_heuristics_fusable.<locals>.low_prec_fpzVprologue fusion that must be upcast to fp32 not profitable for low precision templatesN)r|  r  r}   rl   )r   rY   r   invoke_quant_opsr6  r7  r   r   r  r  r  constant_pad_ndrS  r  rU   r|  r  )
rh   r  r  r  
read_byteswrite_bytesBYTES_THRESHOLD_MULTIPLIERr)  r  template_bufri   ri   rj   (check_prologue_fusion_heuristics_fusable  s:   

z2Scheduler.check_prologue_fusion_heuristics_fusable,tuple[int, SchedulerNode, sympy.Expr] | Nonec                   s  t |tr
t |tsdS t |jtjrt |jtjsdS | s$| r&dS tjdkr-dS |j|j}}|\}}|\}}|	 sP|	 sP||ksPt
|t
|krRdS t
|jjdksbt
|jjdkrddS  tt|jj}	 tt|jj}
t|	|
tjkrdS d fdd	}||s||rdS g }tt||D ]\}\}}||kr|| qt
|dkrdS |d
 }|| || }}tjj||r|||fS tjj||r|||fS dS )ao  
        Fusing two small pointwise nodes significantly reduces kernel overhead
        and launch overhead. However, slightly different sizes would prevent fusion.
        Here, we decide if expanding sizes of one node is profitible by allowing
        fusion, and returns the dimension to expand, node with smaller sizes,
        and new size after expand.
        Nr  r   r   r^   r}   rl   c                   s`   | j jD ])}|j jv r j|j }n j|j}|r-tjj	|| r-t
|jts- dS qdS r+  )r   r   r   r)  r*  r  rY   r   r$  r  r   r  r  )r   r  r  rg   ri   rj   has_reusable_buffer  s   
zIScheduler.get_expand_dim_for_pointwise_nodes.<locals>.has_reusable_bufferr   r   )r   r   r   r*   r   r  r'   r  r  r   r   r   ry  r  r  r[  rv  small_memory_access_thresholdr  rH  r   rY   r   r   statically_known_lt)rh   r{   r|   n1_sizesn2_sizesn1_iter_sizesn1_reduce_sizesn2_iter_sizesn2_reduce_sizesnode1_write_memorynode2_write_memoryr  mismatch_dimensionsidxn1_sizen2_sizemismatch_dimmismatch_size1mismatch_size2ri   rg   rj   "get_expand_dim_for_pointwise_nodesH  s`   
 


z,Scheduler.get_expand_dim_for_pointwise_nodesFTcan_reorderr  c                   s|  u rdS |   r$| j}| j}|dur$|dur$||kr$dS ttr.S ttr5dS t} rK| 	 
rKdS ttsUttr[|d dS ttrj sj|d dS ttr"tjtjs}|d dS j s|d dS tts|d dS tjts|d	 dS tjjts|d
 dS tjjdksJ jjd jtfddjjD r|d dS jj }|D ]}	jj|	}
tdd |
D r dS qtjjdksJ jjd jjjkr
|d dS d+fdd t fdd| j D r"dS tttfr5 s5|d dS ! j"@ rC|d dS  rt#sS|d dS $ s] rc|d dS % }|& }|st|d dS t'dd |j(D | }) |@ r|d dS * s* r|d dS + dd D ]"}|, }|D ]}t-fd d|j.D s|d!   dS qqtt/sӈgnd"d# j0D }t|dksJ |d }td j1dkrtd j1d j.dkrd j1d j.d j|u s|d$ dS | 2|sdS  rU* s1$ s1t3s7|d% dS 4 }|dusBJ |5 rUtjtjsU|d& dS ) t6j7j8@ sg) t6j7j8@ rm|d' dS 	 }	 }||kr|d(|| dS ~| j9|d)}t|t:sJ |r|t;j<k rt;j=r| >}|dkr|}t;j?r| @ }r|\}}}|A|| | 9}t|t:sJ t;jBr|t;j<k r| C}|dkr|}tDEtFjGrtDHd*I I | t6jJK| |s	dS ! j"@ r+| Lo*t6jJL| |o*| |LS t6jJM| |o=| |MS ),zj
        Determine if it is possible to combine node1 and node2 into a
        single fused node.
        FNTz/grouped node must not be fused with other nodesznode1 is nopz'node1 is extern but not a triton kernelz5node1's triton kernel doesn't support epilogue fusionz.node1 is extern but node2 is not SchedulerNodez3node1 is extern but node2.node is not SchedulerNodez4node1 is extern but node2.node.data is not Pointwiser   r   c                 3  s    | ]}|j  kV  qd S rt   r  r   )written_buffer_nameri   rj   r     r  z%Scheduler.can_fuse.<locals>.<genexpr>z9epilogue reads from buffers other than the mutated outputc                 s  s    | ]}|d kV  qdS )r  Nri   )r   usageri   ri   rj   r     r  z*node1 and node2 uses different buf layoutsr   r^   c                   s   |  uo| uo|   v S rt   r  )r   )r{   r|   r  ri   rj   ._is_other_node_that_references_mutation_buffer   s
   
zJScheduler.can_fuse.<locals>._is_other_node_that_references_mutation_bufferc                 3  rc  rt   ri   r  )r  ri   rj   r   	  r  znode2 is extern or nopznode1 must go before node2zprologue fusion turned offz2prologue fusion only supported for pointwise nodesz'template has no allowed prologue inputsc                 s  r  rt   r`  )r   r  ri   ri   rj   r   *  r  z;prologue fusion not implemented for kernel for these inputsz:template prologue can only fuse functional pointwise nodesr   c                 3  s    | ]}|j  v V  qd S rt   r   r@  )prologue_nodesri   rj   r   :  r  z7template prologue can only fuse nodes with a single usec                 S  r  ri   r  r  ri   ri   rj   r   A  r  z&Scheduler.can_fuse.<locals>.<listcomp>zEtemplate prologue can only fuse nodes with a single use into templateztemplate epilogue not satisfiedz6multi-output template epilogue requires ComputedBufferz#fusion for buffer explicit disabledzdevice mismatch (%s vs %s)r  z%s and %s has %s shared data)r   r^   )Nr  r  r  r   r  r  r  r  r
  r   can_fuse_multi_outputs_templaterw  r  rQ  r   r*   r  r  r   r   r   r   r   mutation_outputsr   r   r   r   inner_fn_free_symbolscollect_inner_fn_symbol_usager  r  r  r   r   r{  r   r  get_allowed_prologue_inpsr   r  r  r  r   rd  r   r  r   r   rO  r  rw  r  rU   rY   r   no_fuse_buffer_namesr  r   r'   score_fusion_memory_thresholdr  r  $expand_dimension_for_pointwise_nodesr  r2  r  r  r7  r  r  r  r  r  r  r   can_fuse_verticalcan_fuse_horizontal)rh   r{   r|   r  r  r(  stream2r  node2_inner_fn_free_symbolssymbolusagesr  r  unsupported_prologue_argsr   	node_outsr   template_snodestemplate_snoder  r  device2shared_data_scorenew_shared_data_scoreexpand_analysis
expand_dimsmaller_nodeexpand_sizeri   )r  r{   r|   r  r  rj   r     s~  





	








zScheduler.can_fusec                 C  sd  |  }t||}tt}|jD ]}| j|j|j}t|t	r(| 
|||r(q|| | q|jjD ]C}t|tsAt|tsAq4|| j|j|j}	|	rw|	D ]%}
t|trd| |
|rd|	|
 qQt|trv| |
||jrv|	|
 qQq4tdd tj| D }||@ r|d dS | }|D ]}| j|  }|| j| j@ r|d  dS qdS )a  
        Check if it is legal to fuse a consumer (node2) into a producer (node1).

        We can fuse them if all the reads of node2 either match
        corresponding writes in node1, or are written by nodes that can
        be scheduled before the fusion of node1 and node2.
        c                 s  r  rt   r  r   ri   ri   rj   r     r  z.Scheduler.can_fuse_vertical.<locals>.<genexpr>zmemory deps did not matchFz(intermediate nodes between node1 & node2T)r  r  r   r   rV  rL  r  r   r   r5   r  r   r   ry  r3   r4   fusable_read_and_writer  .fusable_stardep_write_and_read_on_empty_tensorr   r   r  r  rI  r   r   r*  r
  r  r   )rh   r{   r|   node1_buf_namesr  remaining_deps_by_namer   r   cd	remainingr  remaining_depsnode1_op_namesr  ri   ri   rj   r    sV   



zScheduler.can_fuse_verticalweak_depr5   c           	        s   j | vr	dS fdd|jjD }t|dkrdS |d ttr'dS tts.J tj	t
jr7dS tjj	jksBdS | jj  |g}t|trS|j}d}|D ]"} fdd|jjD }|sgqW|d7 }tfdd|D sy dS qW|dkS )	NFc                   s   g | ]
}|j  jkr|qS ri   )r   r"  )r   write)r	  ri   rj   r     r  z.Scheduler.fusable_weak_dep.<locals>.<listcomp>r   r   c                   s   g | ]	}|j  kr|qS ri   r  r   r  )r  ri   rj   r     s
    
c                 3  sB    | ]}t |tot|jtj o|j jko|j jkV  qd S rt   )r   r3   r"   r   r$   TMPr   r  )r
  ri   rj   r     s    



z-Scheduler.fusable_weak_dep.<locals>.<genexpr>)r   r  r   ry  r   r   r4   r3   r"   r   r$   r  r   r  r!   r  r"  r  r   r   r   )	rh   r	  r{   r|   mutating_writesrelevant_reading_nodesnum_concurrent_readsreading_noderelevant_readsri   )r  r	  r
  rj   r    sB   



zScheduler.fusable_weak_depr  r2   r
  r3   c                 C  s  t |trY| j|j|j}||jks!t|jtjs!t|jtjr#dS t	j
r4|j|jkr4| }| }| |jr<dS |j|jkoXt|jt|jkoX|jd t|j |jkS t |tr| j|j|j}| j|j|j}|j|jkr|jd ur||krdS dS r/  )r   r3   rL  r  r   r"   r   r$   r  r'   r  r4  r  rk  r_  r   r   r4   )rh   r  r
  	read_name
write_nameri   ri   rj   r    s4   



z Scheduler.fusable_read_and_writer4   writing_noderM  c                 C  sZ   t |tjsdS | sdS | j|j|j}| j|j|j}t |tr+||kr+dS dS r/  )r   r*   r  r  rL  r  r   r4   )rh   r  r
  r  r  r  ri   ri   rj   r  F  s   z8Scheduler.fusable_stardep_write_and_read_on_empty_tensorr   r  c                 C  s   t j||S rt   )rY   r   get_dep_size_hint)rh   r   r  ri   ri   rj   r  S  r:  zScheduler.dep_size_hintreturn_is_mix_order_reductionint | tuple[int, int, bool]c                   s  fdd}|rt |rt |}||ddS t|jtjr&|j s.| s. rg|j	j
|j	jB }j	j
j	jB }	ddd	}
d}|D ]}|	D ]}|
||r_|t||7 }qKqG||dd
S t|j	j
t|j	j }tj	j
tj	j }t||d t||k r||kr|}fdd|j	j
|j	jB D }|t fdd|D dd
S |j	j
|j	jB j	j
j	jB @ }tfdd|D }d}|dkr|r|}|||d
S )a4  
        The first term in our fusion score that estimates number of saved
        memory operations.

        This function scores fusion candidates based on shared memory access patterns.
        Higher scores indicate better fusion candidates.

        Scoring strategy:
        1. If nodes share exact memory deps (same buffer + same indexing), return
           the sum of shared dep sizes (original behavior).
        2. If no exact matches (score == 0), check for same-buffer reads with
           different indexing (e.g., split operations reading different slices).
           - Give bonus if nodes read from exactly the same set of buffers
           - Score based on overlap ratio: common_buffer_size / total_read_size
           - High overlap (>50%) suggests good cache locality benefit from fusion
        c                   s    r| ||fS | | S rt   ri   )r  buffer_overlap_scoreis_mix_order_reduction)r  ri   rj   _construct_return_valueo  s   
z>Scheduler.score_fusion_memory.<locals>._construct_return_valuer   Tdep1r2   dep2c                 S  s8   | |krdS t | ttfrt |ttfr| j|jkS dS r+  )r   r4   r3   r   )r  r  ri   ri   rj   _match  s   z-Scheduler.score_fusion_memory.<locals>._matchFr  c                   s(   g | ]}| j jv s| j jv r|qS ri   )r   r   ry  r   )r|   ri   rj   r     s
    z1Scheduler.score_fusion_memory.<locals>.<listcomp>c                 3  s    | ]	} | V  qd S rt   r  r   )r  rh   ri   rj   r     r  z0Scheduler.score_fusion_memory.<locals>.<genexpr>c                 3  s    | ]}  |V  qd S rt   r  r   rg   ri   rj   r     r  N)r  r2   r  r2   )r   r   r   r   r   r*   r  r  r  r   r   ry  rv  r  r   rL  r   _can_use_buffer_overlap_scoring&_score_fusion_memory_by_buffer_overlap)rh   r{   r|   r  r  r  r  r  
node1_deps
node2_depsr  	node1_dep	node2_depnode1_dep_lennode2_dep_lenr  common_memory_depsr  ri   )r  r|   r  rh   rj   r  V  s\   

	


zScheduler.score_fusion_memoryc                 C  s  |  s|  r
dS | s| rdS tjstjr| }| }|r&|s(dS tdd |D }tdd |D }t }|D ]<}|jD ]6}	t|	j	t
rz|	j	 rzt|	j	rz|	j	 }
|
durtt|
tjrt|
 }||@ rs||	j	 qD||	j	 qDq?|r|D ]8}|jD ]2}	t|	j	t
r|	j	 r|	j	|v r|	j	 }
|
durt|
tjr|
 }||@ r  dS q  dS qq||fD ] }|jjD ]}| j|j}|dur| rt|r  dS qqdS )a@  
        Check if buffer overlap scoring should be used for this node pair.

        Buffer overlap scoring handles split/cat patterns where nodes read from
        the same buffer at different indices. We skip it when:
        - Either node is a reduction (different memory access patterns)
        - Either node is a template
        - Both nodes are prologue/epilogue candidates for the same template,
          because horizontal fusion would prevent them from being absorbed
          into the template kernel. For example, in:
            q = a[:64, :]; k = a[64:, :]
            return mm(q + 2, k - 2)
          "q + 2" and "k - 2" both read from `a` and would get a high overlap
          score, but fusing them horizontally prevents prologue fusion into mm
          (resulting in 2 kernels instead of 1).

        We allow buffer overlap scoring when:
        - The node outputs are not actually in the template's allowed_prologue_inps,
          meaning they can't be prologue-fused anyway, so horizontal fusion doesn't
          prevent any optimization opportunity.
        FTc                 s  r  rt   r`  ra  ri   ri   rj   r     r  z<Scheduler._can_use_buffer_overlap_scoring.<locals>.<genexpr>c                 s  r  rt   r`  ra  ri   ri   rj   r     r  N)r   r  r'   r  r  rd  r   r  r   r   r^   r{  r  r*   r  r  r  r   r   r  r  r   rw  )rh   r{   r|   node1_outputsnode2_outputsnode1_output_namesnode2_output_names&node1_prologue_eligible_template_usersr   r  r  allowed_inpsr   r   r  ri   ri   rj   r    s~   








	z)Scheduler._can_use_buffer_overlap_scoringc                   s   d d fddt dd	 |jjD }t d
d	 |jjD }||@ s(dS tfdd	|jjD }tfdd	|jjD }t||}|dkrMdS tfdd	|jjD }tfdd	|jjD }	t||	}
|
| }|tjkry|
S dS )a8  
        Score fusion based on buffer name overlap when exact dep matching fails.

        This handles the split/cat fusion case where nodes read from the same buffer
        but at different indices (e.g., different slices from a split operation).

        Scoring logic:
        - If nodes read from exactly the same buffers: high bonus (encourages fusion)
        - For common buffers: score based on overlap ratio
          - overlap_ratio = common_buffer_size /
            max(node1_total_reads, node2_total_reads)
          - If overlap_ratio > threshold (e.g., 0.5): give proportional score
          - If overlap_ratio < threshold: minimal/no score (not worth fusing)

        Note on dynamic shapes:
        - When deps have unbacked symbols (dynamic shapes), dep_size_hint returns 0
        - In this case, we use count * 10 as a proxy for size
        - This ensures fusion still works for models with dynamic batch sizes

        Note on multiple deps from same buffer:
        - A node may have multiple MemoryDep entries for the same buffer name
          (e.g., 4 split reads from arg0_1 at different indices)
        - We sum ALL dep sizes for each buffer, not just take max
        - This ensures overlap ratio is calculated correctly when nodes read
          multiple slices from the same underlying buffer
        r  r   r2   r}   r   c                   s    | }|dkr|S  S r   r  )r   r   )FALLBACK_DEP_SIZErh   ri   rj   get_dep_sizeK  s   
zFScheduler._score_fusion_memory_by_buffer_overlap.<locals>.get_dep_sizec                 s  r  rt   r  r   ri   ri   rj   r   O  r  zCScheduler._score_fusion_memory_by_buffer_overlap.<locals>.<genexpr>c                 s  r  rt   r  r   ri   ri   rj   r   P  r  r   c                 3  rc  rt   ri   r   r/  ri   rj   r   Y  r  c                 3  rc  rt   ri   r   r0  ri   rj   r   \  r  c                 3  "    | ]}|j  v r|V  qd S rt   r  r   common_namesr/  ri   rj   r   g      
c                 3  r1  rt   r  r   r2  ri   rj   r   l  r4  N)r   r2   r}   r   )r   r   r   r   rv  r'   min_overlap_ratio)rh   r{   r|   node1_read_namesr  node1_total_read_sizenode2_total_read_sizemax_total_read_sizenode1_common_read_sizenode2_common_read_sizecommon_read_buffer_sizeoverlap_ratiori   )r.  r3  r/  rh   rj   r   +  s8   

z0Scheduler._score_fusion_memory_by_buffer_overlapc                 C  s   t |dkr|S i }|D ]2\}}| | ksJ | }t| |||}||vr5||fg||< q|| ||f qt| t	ddd }t |dksTJ |S )Nr   r  r   )
r   r   r   r
  get_fusion_pair_priorityr   rL  r  r  r  )rh   rH  "possible_fusions_group_by_priorityr{   r|   r  fusion_pair_priority&possible_fusions_with_highest_priorityri   ri   rj   r]    s.   
z4Scheduler.get_possible_fusions_with_highest_priorityr~   r   c                 C  s   t jj| g|R  S )z-
        Shim for list.sort(key=...)
        )rY   r  score_fusionr  ri   ri   rj   r^    s   zScheduler.score_fusion_keyc                 C  s<   t tj }t| jD ]}||| j ||j	 qdS )zg
        Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
        N)
r   rY   r   r  r   r  r  r  r  rE  )rh   r  r   ri   ri   rj   r    s
   zScheduler.compute_last_usagec                 C  s   t | jtjj tjjj D ]T}|| jv r'| j| }| r&tjj	|j
 q|tjjv rbtjj| }t|tjrAtjj	| qt|tjtjfrKq|j}t|tjrX| sZJ tjj	|j q| j  dS )z*Free any buffers that are no longer neededN)r  r  rY   r   r  r$  freedr*  r2  codegen_freer   rX  r   r*   rJ  r  r  r   r  is_input_bufferrP  )rh   r   r   r  storageri   ri   rj   free_buffers  s4   


zScheduler.free_buffersc                 C  s$   | j  D ]}|  q|   d S rt   )r  r   flushrG  )rh   r  ri   ri   rj   rH    s   
zScheduler.flushscheduler_nodec                 C  s   t |ttfs	J td d  d7  < ttdd |  |  W d    n1 s.w   Y  |	tj
j |   d S )Nr]  extern_callsr   F)increase_kernel_count)r   rQ  r  r   rY   set_kernel_handlerr/   r  r  r  r   r$  rG  )rh   rI  ri   ri   rj   codegen_extern_call  s   
zScheduler.codegen_extern_callBaseSchedulingc                 C  s   t |jr|jd usJ | dtj| t|j}|d u r(td|j t sR|jdkrBt	j
| }jdk rBt|t t |jrR|jdksRtt || S )Nz( should have been normalized in loweringzUnsupported device type: r      r  )rT   r   r   rY   r   add_device_infor.   rv  r%   r  r   get_device_propertiesmajorr6   inspectcurrentframer7   )rh   r  device_schedulingrM  ri   ri   rj   create_backend  s   

zScheduler.create_backendc                 C  s0   |d usJ || j vr| || j |< | j | S rt   )r  rV  r  ri   ri   rj   r
    s   

zScheduler.get_backendc                   s`   dfdd  fdd|  D }t| }|r.t|td	d
\}}tjj	| d S d S )Nr  torch.fx.Noder}   r   c                   s2   |  j vr j dd t| jjD   j |  S )Nc                 S  r#  ri   ri   r  ri   ri   rj   rb    rc  z>Scheduler.enter_context.<locals>.get_order.<locals>.<dictcomp>)r  r  r  r   r  r  rg   ri   rj   	get_order   s   

z*Scheduler.enter_context.<locals>.get_orderc                   s4   i | ]}|j d ur|j  D ]	} ||fd qqS rt   r  r  )rX  ri   rj   rb    s    
z+Scheduler.enter_context.<locals>.<dictcomp>r   r  )r  rW  r}   r   )
r   r   r   rv  r  r  rY   r   r$  enter_context)rh   r   r)  r  lastri   )rX  rh   rj   rY    s   
zScheduler.enter_contextr   r~  rD  c                   sP   z| j | j}W n
 ty   Y dS w t fdd|D o'|| jvo'|| jvS )NFc                 3  s"    | ]}|j p|  v V  qd S rt   )r`  r  r@  r~  ri   rj   r     s     zAScheduler.can_buffer_be_removed_through_fusion.<locals>.<genexpr>)r*  r  KeyErrorr   rL  r  )rh   r   r~  r  ri   r[  rj   r}    s   z.Scheduler.can_buffer_be_removed_through_fusionc                 C  s8  |j }t|tjjjr.|j }r.t|\}}|tj	v s |tj	v r.t|tj
js)J d| S tjjjjs;tjdu r;dS t|trS|jD ]}| |}|rP|  S qCdS |j dusZJ | se|  dS t|j tjrndS t|j tjrwdS t|j ddrdS t|j rd	S | | }r|S tjjrt|rd
S dS )z
        Return the reason why we should partition the inductor graph on this node,
        or None if the node is cudagraphable.
        zcustom partition op: Nz6partition includes all ops when cudagraphs is disabledz opszDeviceCopy opszConditional opsunbacked_bindingszunbacked binding opszCUDAGraph-unsafe custom opszdynamic shape ops)r   r   r  r  r*   r  rR  rO   r'   custom_should_partition_ops_ops
OpOverloadr   r]   rG   r  r   r   should_partitionrT   r   
DeviceCopyrp  rx  rS   &_uses_cudagraph_unsafe_unbacked_symintcudagraph_skip_dynamic_graphsrn  )rh   r   ro  r	  op_overload_packet_nameop_overload_namer  r  ri   ri   rj   ra    sJ   








zScheduler.should_partitionre  c                 C  s   t  }tjs|S | jD ]H}|j}|du rqt|tjjj	sq|j
}|du r&qt|\}}|tjvr7|tjvr7q| D ]}tjj|}t|tjtjfrR|| q;q|S )zc
        Collect output unbacked symints from ops in config.cudagraph_unsafe_unbacked_ops.
        N)r   r'   cudagraph_unsafe_unbacked_opsr  r   r   r  r  r*   r  rR  rO   r(  rY   r   r   r   r#   r$   UNBACKED_INTUNBACKED_FLOATr  )rh   unsafe_symintsr   ro  r	  re  rf  symri   ri   rj   &_get_cudagraph_unsafe_unbacked_symints]  s.   



z0Scheduler._get_cudagraph_unsafe_unbacked_symintsc                 C  sZ   |   }|sd S t|}|D ]}tjj|}|jD ]}||v r)d|     S qqd S )Nz'uses cudagraph-unsafe unbacked symint: )rl  rn  rY   r   r   r   r!   )rh   r   rj  node_symbolsrk  simplified_symfree_symri   ri   rj   rc    s   
z0Scheduler._uses_cudagraph_unsafe_unbacked_symint6dict[str, ir.IRNode | ir.TorchBindObject | sympy.Expr]c                 C  s@   i }| tjj | jD ]}|j D ]	\}}|j||< qq|S )z~
        Return a mapping from name strings to the corresponding graph inputs or
        base scheduler node outputs.
        )r  rY   r   rX  r  rQ  r  r   )rh   r  r   r   scheduler_bufferri   ri   rj   get_name_to_nodes  s   
zScheduler.get_name_to_nodes
signatureslist[GraphPartitionSignature]c           
      C  s   dd t tjjD }dd t tj D }g tj_t |D ]7\}}|jr'qg }|jD ]
}||	| q,g }|j
D ]}	||	|	  q<tjjt||||j qdS )z
        computes a mapping from partition input/output indices to graph input/output
        indices for each partition.
        c                 S  r#  ri   ri   r   r  r   ri   ri   rj   rb    rv  z:Scheduler.compute_graph_partition_maps.<locals>.<dictcomp>c                 S  r#  ri   ri   ru  ri   ri   rj   rb    rv  N)r  rY   r   rX  r  partition_mapsskip_cudagraphinput_nodesr   r  output_nodesr  rP   constant_names)
rh   rs  name_to_graph_input_indexname_to_graph_output_indexpartition_id	signatureinput_mappingr   output_mappingr   ri   ri   rj   compute_graph_partition_maps  s2   


z&Scheduler.compute_graph_partition_maps	partitionr_   rx  c                   s   ddd ddd	}t  jd
d |D  }|j fdd| D   ||}t  }|D ]}tjj|}||j q,t t	|t
ddS )ai  
        Returns all symbol inputs which are required to be in scope to successfully
        perform codegen for this graph partition, including:
        - free symbols used in partition nodes
        - free symbols in partition input/node shapes, strides, and offsets. This is needed
          for recording cudagraphs for tensors with dynamic shapes.
        r   +ir.IRNode | sympy.Expr | ir.TorchBindObjectr}   re  c                 S  s8   t | tjr	t S t | tjrt| S tdt|  )zW
            Gets symbols used in input node shapes, strides, and offsets.
            zUnsupported input node type: )r   r*   rJ  r   rk  rj  r  r   r   ri   ri   rj   get_input_node_symbols  s
   zKScheduler.get_graph_partition_symbol_inputs.<locals>.get_input_node_symbolssymbolsc                 S  s   t dd | D S )z
            Filters a set of symbols that are required for codegen. Skip symbols
            that are always internal to kernels, such as SymT.TMP, SymT.INDEX,
            and SymT.R0_INDEX.
            c                 s  s.    | ]}t |tjtjtjtjfr|V  qd S rt   )r#   r$   SIZEFLOATrh  ri  r  ri   ri   rj   r     s    
zVScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols.<locals>.<genexpr>r   )r  ri   ri   rj   filter_symbols  s   zCScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbolsc                 s  rl  rt   rm  r  ri   ri   rj   r     r  z>Scheduler.get_graph_partition_symbol_inputs.<locals>.<genexpr>c                 3  s    | ]	\}} |V  qd S rt   ri   )r   r  r   r  ri   rj   r     r  r   r  N)r   r  r}   re  )r  re  r}   re  )r   rq  r  r  rY   r   r   r   r!   r  r  
attrgetter)rh   r  rx  r  candidate_symbolsr  r;  symplified_sri   r  rj   !get_graph_partition_symbol_inputs  s   

z+Scheduler.get_graph_partition_symbol_inputs
partitionslist[PartitionType]skip_cudagraphs
list[bool]c                   s  g }t tj } dfddtt|t|D ]\}}t  }|D ]
}||j	  q'|
|}	tjdd |D }
t d	d |
j|
jB D | }t fd
d|D }t   |D ]} |j qafdd | D }|| fdd|D } fdd|D } fdd|D }|	| t fdd|	D }	fdd|	D }dd |D }||}t||||||}|| |||	 }q|ddd S )z
        Gets signature for each graph partition, including input nodes, output nodes, and
        whether deallocating an input within graph partition.
        r  r   r}   rl   c                   sJ   j | d}|du rdS t|jjtr#j| d }r! |S dS dS )z
            Checks if buf_name resolves to a NoneLayout buffer (following mutation_real_name).
            Buffers with NoneLayout are not allocated so graph partition should not
            take them as inputs or outputs.
            NFT)r*  r  r   r   r  r>   r  )r  r   r  )is_unallocated_bufferrh   ri   rj   r    s   zFScheduler.get_graph_partition_signature.<locals>.is_unallocated_bufferc                 S  ri  ri   rj  r  ri   ri   rj   r   >  rk  z;Scheduler.get_graph_partition_signature.<locals>.<listcomp>c                 S  s   g | ]
}t |ts|jqS ri   )r   r5   r   r  ri   ri   rj   r   E  s    c                 3      | ]
} j ||V  qd S rt   r  r  r  rg   ri   rj   r   N  re  z:Scheduler.get_graph_partition_signature.<locals>.<genexpr>c                   s   g | ]}| v r|qS ri   ri   r  r  ri   rj   r   Z  s
    c                   r  ri   ri   r  r  ri   rj   rb  a  r  z;Scheduler.get_graph_partition_signature.<locals>.<dictcomp>c                   s   i | ]}|v r|| v qS ri   ri   r  r  r  ri   rj   rb  f  r  c                   s    g | ]}|v r| vr|qS ri   ri   r  r  ri   rj   r   p  
    c                 3  r  rt   r  r  rg   ri   rj   r   x  re  c                   s   g | ]
} |s| qS ri   ri   r  )r  r  ri   rj   r   }  s    c                 S  s   g | ]
}|t jjv r|qS ri   )rY   r   r  r  ri   ri   rj   r     r
  Nr   )r  r   r}   rl   )r   rY   r   r  rr  rH  r   r  rQ  r   rr  r)   ro  rp  r   ry  rE  r  r;   r   rq  )rh   r  r  rs  unmet_output_namesr  rw  output_namesr   returned_output_namesr   partition_input_namesextra_input_namesrx  input_deallocationextra_output_namesry  rz  symbol_inputspartition_signatureri   )r  r  r  rh   rj   get_graph_partition_signature  s   







	z'Scheduler.get_graph_partition_signaturer~  r;   c                 C  s^   dd |j  D }dd |j D }dd |jD }dd |jD }t|j||||j|S )z
        Updates the partition signature by removing buffers specified in
        V.graph.removed_buffers. See [Note: Removed Graph Partition Arguments]
        c                 S  "   i | ]\}}|t jjvr||qS ri   rY   r   r  )r   r   r  ri   ri   rj   rb    
    zLScheduler.clean_removed_buffer_from_partition_signatures.<locals>.<dictcomp>c                 S  r  ri   r  )r   r   r7  ri   ri   rj   rb    r  c                 S  s    g | ]}|  tjjvr|qS ri   )maybe_get_namerY   r   r  r  ri   ri   rj   r     r  zLScheduler.clean_removed_buffer_from_partition_signatures.<locals>.<listcomp>c                 S  s   g | ]
}|t jjvr|qS ri   r  r  ri   ri   rj   r     r  )rx  r  r  ry  rz  r;   r  rw  )rh   r~  rx  r  ry  rz  ri   ri   rj   .clean_removed_buffer_from_partition_signatures  s(   z8Scheduler.clean_removed_buffer_from_partition_signaturesc                   s  ddl t g  g dd t|D d fd	d
dfdd}|D ]}t|jj|< | dkr=| q)g }d}|t|k rsL rr`\}}|| || sN rt \}}|| ||  sb|d7 }|t|k rsL sL|t|krtd|S )a  
        Reorder nodes to minimize the number of partitions via a bfs
        topological sort. This is the optimal reordering such that the
        number of partitions cannot be reduced further. This may be
        sub-optimal for other metrics such as peak memory. This does not
        change relative orders of two cudagraphable nodes, nor the
        relative order of two non_cudagraphable nodes.
        r   Nc                 S  r#  ri   ri   )r   r  r   ri   ri   rj   rb    rc  z>Scheduler.reorder_for_minimizing_partition.<locals>.<dictcomp>r   r^   r}   r  c                   s6   |  | f} | r| d S  | d S rt   )ra  heappush)r   node_with_index)cudagraphable_nodesheapqnode_to_indexnon_cudagraphable_nodesrh   ri   rj   insert_pending_nodes  s   
zHScheduler.reorder_for_minimizing_partition.<locals>.insert_pending_nodesc                   sF   | j jD ]}| dksJ |  d8  < | dkr  | qd S r  )rJ  
succ_nodes)r   	succ_node)r  node_to_indegreeri   rj   update_indegree  s   zCScheduler.reorder_for_minimizing_partition.<locals>.update_indegreer   z
                Failed to schedule, while loop ran too long when
                reordering for minimizing the num of partitions
                r   r^   r}   r  )	r  rG  r  r   rJ  
pred_nodesheappopr   rv  )rh   r  r  r   r  	num_itersr  ri   )r  r  r  r  r  r  rh   rj    reorder_for_minimizing_partition  sP   

z*Scheduler.reorder_for_minimizing_partitionc           
      C  sp   ddl m}m} ttj }||| j| jttjj	
 |\}}| |}||||\}}	||d k r6|S |S )zx
        Reorder nodes to minimize the number of partitions if this only slightly
        increase peak memory.
        r   )estimate_peak_memoryprepare_planning_infor  )r  r  r  r   rY   r   r  r*  r  rX  r   r  )
rh   r  r  r  r,  default_peak_memoryr^  reordered_nodesreorder_peak_memoryr  ri   ri   rj   r    s    
z0Scheduler.maybe_reorder_for_minimizing_partitionc                 C  sz   g }g }g }d	dd}|D ])}|  |du}|r%t|jdkr%|| q|r1||r1|| q|| q|| | S )
a  
        Reorder a node if it should be partitioned and has simple dependency:
        1. move a partitioned node to the front if it has no dependency
        2. move a partitioned node to the back if it is only used by OutputNode
        3. otherwise do not reorder
        r   r^   r}   rl   c                 S  s2   |   D ]}|jD ]}t|jts  dS q	qdS r/  )rd  r  r   r   r0  )r   r   r1  ri   ri   rj   only_output_user.  s   
zPScheduler.reorder_for_partition_with_simple_dependency.<locals>.only_output_userNr   r   )ra  r   rV  r   )rh   r  frontmiddlebackr  r   ra  ri   ri   rj   r     s   

z6Scheduler.reorder_for_partition_with_simple_dependency9tuple[list[PartitionType], list[GraphPartitionSignature]]c                 C  s  g }d}g }g }| j D ]"}| |du}|r&||kr&|| || g }|}|| q|r:|| || tjj}|dkrktt||D ]!\}\}	}
|
sjtdd |	D }||k rjd||< t	
d||| qI| j||d}| | | || ||fS )z
        Given a list of BaseSchedulerNodes, split into a list of
        graph partitions and compute partition input/output signatures.
        TNr   c                 s  s    | ]
}t |tsd V  qdS r   N)r   r  r  ri   ri   rj   r   `  s    
z,Scheduler.graph_partition.<locals>.<genexpr>zFPartition %d has %d kernels, below minimum size %d, skipping cudagraph)r  r  )r  ra  r   r'   r   cudagraph_min_partition_sizer  rH  r   cudagraphs_logr  r  r  _log_graph_partitions)rh   r  rw  cur_partitionr  r   node_should_partitionmin_sizer  r  skipkernel_countrs  ri   ri   rj   r  @  sJ   





zScheduler.graph_partitionc           
   
   C  s   t tjsd S tdd tjjD }|sd S tdd |D }t	|| }t 
dt	||| tt||D ]*\}\}}t 
d|t	||jrIdndt	|jt	|j |jra|D ]}	| |	 qYq7d S )Nc                 s  rl  rt   )rT   )r   r  ri   ri   rj   r     r  z2Scheduler._log_graph_partitions.<locals>.<genexpr>c                 s  s    | ]}|j sd V  qdS r  )rw  r  ri   ri   rj   r     r  zCCreated %d graph partitions: %d cudagraphable, %d non-cudagraphablez3  Partition %d: %d nodes, %s, inputs=%d, outputs=%dznon-cudagraphablecudagraphable)r  r  r  r  r   rY   r   device_typesr   r   r  r  rH  rw  rx  ry  _log_non_cudagraphable_node)
rh   r  rs  has_gpu_devicecudagraphable_countnon_cudagraphable_countr  r  r~  r   ri   ri   rj   r  w  s6   zScheduler._log_graph_partitionsc           
      C  s   |  |}|s	dS | }|jdur|j nd}d| g}t|jj}|d|  |durK|j dddd |j	D  d}|d	|  t
d
|d| |durr|jdd}|rt| dD ]}	t
d|	 qidS dS dS )z)Log details for a non-cudagraphable node.Nzreason=zir=rl  r  c                 s  rl  rt   )r   r  ri   ri   rj   r     r  z8Scheduler._log_non_cudagraphable_node.<locals>.<genexpr>rg  zfx=z
    %s: %sr  ro  z         %s)ra  r  r   r_  r   ru   r   r$  r9  r  r  r  r%  r  stripsplit)
rh   r   r  r  rb  partsir_typefx_strr  lineri   ri   rj   r    s(   
$z%Scheduler._log_non_cudagraphable_nodec                 C  sL   t d tjjjr|  n| | j	 W  d    S 1 sw   Y  d S )NScheduler.codegen)r   r  r  r'   r  _codegen_partitions_codegenr  rg   ri   ri   rj   r    s   


$r  c           	      C  s   ddl m} tjj}t| j}tj B tjjdd| ||d | 	| t
tjj|s0J | |}|tjj_tjj  tjj}tjjtjj\}}W d   n1 sYw   Y  tjj|| tjj|| tjjjdd |jD  dS )	z,Codegen a partition given its inputs/outputsr   )SubgraphPythonWrapperCodegenT
partition_)is_subgraphsubgraph_nameparent_wrapper_codepartition_signaturesNc                 S  r  ri   r`  r  ri   ri   rj   r     r  z8Scheduler._codegen_partition_wrapper.<locals>.<listcomp>)r  r  rY   r   r$  r  r  set_current_wrapper_codeinit_wrapper_coder  r   r  r  write_prefixr   generateis_inferencedefine_subgraph_launcher_fncodegen_partition_call	allocatedr  ry  )	rh   r  r~  r  r  graph_partition_id
graph_namepartition_coder  ri   ri   rj   _codegen_partition_wrapper  s.   





z$Scheduler._codegen_partition_wrapper'contextlib.AbstractContextManager[None]c                   s   t jd fdd}| S )Nr}   Iterator[None]c                   3  s       jr#tjjr#jjd usJ dtjjjj zd V  W jr7tjjr7tjj	  d _d S jrKtjjrKtjj	  d _w )Ndevice should have an index)
%update_graph_partition_default_devicer  rJ   r   r   rY   r   r$  codegen_device_guard_entercodegen_device_guard_exitri   r  rh   rs  ri   rj   ctx  s.   
z1Scheduler.use_default_device_context.<locals>.ctx)r}   r  )
contextlibcontextmanager)rh   r  rs  r  ri   r  rj   use_default_device_context  s   z$Scheduler.use_default_device_contextc                 C  s   t |dkr|d jsd S ddd}ddd}d }t||D ]\}}|js+||} nq|d u r2d S t||D ]\}}|jrF|||sF d S q7|| _d S )Nr   r   r  r_   r}   r  c                 S  s   | d   }|d usJ |S r   r  )r  partition_deviceri   ri   rj   get_cudagraph_partition_device  s   zWScheduler.update_graph_partition_default_device.<locals>.get_cudagraph_partition_devicetarget_devicerl   c                 S  s$   | D ]}|  }||kr dS qdS r/  r  )r  r  r   r  ri   ri   rj   all_on_target_device  s   zMScheduler.update_graph_partition_default_device.<locals>.all_on_target_device)r  r_   r}   r  )r  r_   r  r  r}   rl   )r   rw  rH  r  )rh   r  rs  r  r  cudagraph_partition_devicer  r~  ri   ri   rj   r    s&   	

	
z/Scheduler.update_graph_partition_default_devicec                 C  s  |   \}}t|dkrtd d  t|7  < | ||0 t||D ]"\}}t|dks7J dt| |jr@| | q$| || q$W d   n1 sQw   Y  t| j	}t
jj| |dkrt
jjdusnJ |tt
jjksJ d| dtt
jj dS dS )	z
        Split nodes into partitions and codegen each partition into separate functions.
        This allows further applying different optimizations (e.g., cudagraph) to
        each function.
        r   r]  cudagraph_partitionsz5Each partition must have at least one node but found Nr   zExpect z partition maps but got )r  r   r   r  rH  rw  r  r  r  r  rY   r   r$  set_all_partition_namesrv  )rh   r  rs  r  r~  num_partitionsri   ri   rj   r  0  s,   
zScheduler._codegen_partitionsc              	   C  sl  t jr@dd l}t }t }t|D ]-}|jdkr#|j|j	j
jkr# n|j|jf}||vs:J d|j d|j d|| q| j| _| jd u sKJ | jrXt jjrXtjj  tjj  |D ]}ttjrztd| |  W n ty   td|  Y nw |  | t j!rtjj"dd	 |j#j$D  |%  }r|| jks|& s|' r| (  || jkr| jrt)| jj*r| j+d ur| ,  tjj-  || _t)|j*r|j.d usJ d
d}	| / rt| j01 }
|
rt2|
d nd}	tjj3|j.|	| j4 | / r| jd ur| 5| tjj6dd	 |j#j$D  || _7| j89|j: |' rL|;t<|= \}}}| >|?||| nt|& rW| @| ni|A rtBCtD|}| >|}ddlEmF} ddlGmH} ddlImJ} tK||||fr|}n	tLdt*| |M| n,tK|tNr| >|O| ntK|tPtQfr| >|R| ntK|tSsJ |T  t jjUr| >|V  | jW9|X  | jY9|Z  tK|tSs|% }|d ur|j*dkr| >|[ r| (  t\dd	 |= D r|| _q`d | _q`| j| jkr-| jd us J t)| jj*r-tjj-  d | _| (  d S )Nr   _compile_innerzDuplicate stack frame :zs; did you add a decorator to one of the functions in this stack trace?  If so, try using a context manager instead.z5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0c                 s  r  rt   r  r   ri   ri   rj   r     ru  z%Scheduler._codegen.<locals>.<genexpr>r  r   c                 s  r  rt   r  r   ri   ri   rj   r     ru  )CUDACombinedSchedulingr  )XPUCombinedSchedulingztype(self)=r%  c                 s  r  rt   )r   r   r  ri   ri   rj   r     r  )]r'   "check_stack_no_cycles_TESTING_ONLYtorch._dynamo.convert_frame	tracebackextract_stackr   r   r   filename_dynamoconvert_frame__file__linenor  r  r  r  r   autotune_at_compile_timerY   r   r$  write_get_raw_stream_headerregister_alignment_check_inputsr|  r  r  r  r  r  rh  r{  rY  size_assertscodegen_deferred_input_assertsr   r   r   r  r  rH  rJ   r   current_stream_idxgenerate_stream_ctx_exitr  r   r  r  r   rv  r  r  generate_stream_ctx_switching!codegen_deferred_alignment_copiesr  r  r  rE  r  r   r   r
  codegen_templaterM  r  r  r  r   codegen.cuda_combined_schedulingr  r  r  #codegen.xpu.xpu_combined_schedulingr   r   ru  codegen_combo_kernelr  codegen_mix_order_reductionr   r   codegen_noder  r  debug_sync_kernelcodegen_syncr  r  r  r   ready_to_flushr   )rh   r  r  stackrn  framer  r   r  num_streamsunique_streamsr  r  r  backend_r  r  r   r  ri   ri   rj   r  P  s   















zScheduler._codegen%tuple[float, float, list[str | None]]c                 C  s<   |d   }| tj_|| _|dusJ | |}|||S )r  r   N)r   rY   r   r  r  r
  benchmark_combo_kernel)rh   r  node_benchmark_resultsr  r  ri   ri   rj   r"    s   
z Scheduler.benchmark_combo_kernelc                   s  |}|d    t fdd|D sJ dtjsdS ddlm} dg }}i }t|D ]Z\}}| }	| |	r?t	
d z| |	\}
}|
|f||< t|
r\t	
d	| W  d
S W n  |y} } zdt|v rxt	
d W Y d}~ dS  d}~ww ||
7 }|| q-z| ||\}}}W n |y } zdt|v rt	
d W Y d}~dS  d}~ww || dk p|dk }t	tjr||ks|rt	
dt|| d nt	
dt|| d || |k p|S )r  r   c                 3  s    | ]	}|   kV  qd S rt   r  r  r  ri   rj   r     r  z4Scheduler.speedup_by_combo_kernel.<locals>.<genexpr>z<All nodes in a combo kernel group must be on the same deviceTr  g        z<ComboKernel: benchmarking may not accurate due to atomic_addz;ComboKernel benchmark: register spilling of %d-th subkernelFr  zCComboKernel benchmark: return True because of loop-carried variableNg333333?z/can fuse (benchmark): fusing causes %sx speedupr  z3cannot fuse (benchmark): fusing causes %sx slowdown)r   r   r'   r"  r  r  r  r   r  r  r  r  r  r  r   r   r  r  r  rD   rE   )rh   r  subkernel_nodesr  rY  
path1_listr#  r  r  r  rz  r  r   rZ  	ms2_clone_path2_listsmall_kernelri   r  rj   rQ    s|   


	z!Scheduler.speedup_by_combo_kernel	ir.Layoutc                 C  s"   | j | }|jd usJ |j S rt   )r*  r   
get_layout)rh   r  r   ri   ri   rj   get_buffer_layout_  s   

zScheduler.get_buffer_layoutc                 C  sr   | j D ]3}| r6|jjD ](}tjj|j}|r5t	|dkr5t
|jttfs5| g kr5tjj|j qqd S r  )r  rT   r   r   rY   r   r  r  r   r:   r   r  r>   r=   r  zero_dim_cpu_tensor_listr  )rh   r   r  r  ri   ri   rj   r  d  s   

z$Scheduler.update_zero_dim_cpu_tensorc                 C  s   | j dur	| j jS dS )z:CUDA Stream index that current scheduler node assigned to.N)r  r  rg   ri   ri   rj   r  s  s   
zScheduler.current_stream_idxc                 C  s   | j  }durt|S dS )z9CUDA Stream name that current scheduler node assigned to.N)r  r    )rh   r  ri   ri   rj   current_stream_name{  s   zScheduler.current_stream_namec                 C  s.   t |trJ | j| }tjjj|d| _dS )z6Code-gen to enter the Stream context assigned to node.)r  N)r   r  r  rY   r   r$  codegen_cuda_stream_enterr  )rh   r   node_streamri   ri   rj   generate_stream_ctx_enter  s
   
z#Scheduler.generate_stream_ctx_enterc                 C  s$   | j dusJ tjj  d| _ dS )z1Code-gen to exit from the current Stream context.N)r  rY   r   r$  codegen_cuda_stream_exitrg   ri   ri   rj   r    s   
z"Scheduler.generate_stream_ctx_exitc                 C  s   || j v sJ t|trdn| j | }| j|krdS | jdur%|du r%dS | jdu r5|dur5| | dS |   | | dS )am  Generate stream entering and exiting to properly run node in a multi-stream scenario.

        Stream context switching is only generated if ``node``'s assigned stream is different from
        the previous node's stream. NopKernelSchedulerNodes have stream=None and inherit the
        enclosing stream context (or do nothing if no context is active yet).
        N)r  r   r  r  r0  r  )rh   r   rT  ri   ri   rj   r    s   
z'Scheduler.generate_stream_ctx_switching)r  r  r}   r  )r}   r  r=  r>  )r  r   r}   r   )r  r   r   r^   r}   rl   r@  )r  r9  r}   r  )r   r   r}   r  )r   r]  r}   r^   )r_  rj  r}   rl   r  )r  r^   r}   r  )r}   r  r  r  r}   r  rt   r  r  r  rl   r  r\  r}   r   )r  r   r  r  r}   r  )r  r6  r}   rl   )
r  r7  r  r6  r  r   r   r   r}   r  )r  r  r}   rl   )r  r  r  r\  r}   r  )r{   r^   r|   r^   r}   rb   )r   r^   r}   r^   )r{   r^   r|   r^   r  r$  r}   r^   )r{   r^   r|   r^   r*  rp   r  r$  )r.  r/  r  r$  r}   r  )
r8  r9  r:  r;  r<  r/  r  r$  r  rl   )r  r$  r:  r;  )rH  r9  rI  r9  )r  r  r  rl   r}   r  )r  r\  r}   r  rZ  )r  r  r  rl   r}   r9  r   )r{   r^   r|   r^   r|  r   r}   rl   )r{   r^   r|   r^   r  r  r}   r   r   r   )r  r^   r  r^   r  r  r}   rl   )r{   r^   r|   r^   r}   r  )FT)
r{   r^   r|   r^   r  rl   r  rl   r}   rl   )r	  r5   r{   r^   r|   r^   r}   rl   )r  r2   r
  r3   r}   rl   )r  r2   r
  r4   r  rM  r}   rl   r  )r   r2   r  rl   r}   r   )TFT)r{   r^   r|   r^   r  rl   r  rl   r  rl   r}   r  )rH  r9  r}   r9  )r  r~   r}   r   )rI  r^   r}   r  )r  r  r}   rN  )r  r9  r}   rN  r  )r   r   r~  rD  r}   rl   )r   r^   r}   rj  )r}   re  )r}   rp  )rs  rt  r}   r  )r  r_   rx  rp  r}   re  )r  r  r  r  r}   rt  )r~  r;   r}   r;   )r}   r  )r  r  rs  rt  r}   r  )r  r_   r~  r;   r}   r  )r  r  rs  rt  r}   r  r  r  r}   r!  )r  r  r}   rl   )r  r   r}   r)  r  )r}   rj  )oru   rv   rw   r   r\  r  r  r  r  r  r  propertyr  setterr  r  r  r  r\   r  r  rk  r  r  ry  r  r  r  r3  r  r  r  r  r  r  r  r  r  r  r#  r  r)  r-  r7  rD  rG  rN  r  r  r  rO  r,  r{  r  r  r  r  r  r[  r  r  r   r  r  r  r  r  r  r  r   r]  r^  r  rG  rH  rM  rV  r
  rY  r}  ra  rH   rl  rc  rr  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r"  rQ  r+  r  r  r-  r0  r  r  r  ri   ri   r  rj   r     s     r8
$  M-#+X)
   FQE58.90> ]=f }=9*ofT ?#)D $A 7$+/  2Pc                      s   e Zd ZdN fddZdOddZdPddZdQddZdQddZdQddZdRddZ	dSddZ
dTd%d&Z	'dUdVd-d.ZdWd1d2ZdXd4d5ZdOd6d7ZdYd8d9ZdOd:d;ZdZd=d>Zd[dAdBZd\dDdEZd]dHdIZ	'dUd^dLdMZ  ZS )_rN  r  Scheduler | Nonec                   s   t    || _d S rt   )r  r\  r  r[  r  ri   rj   r\    s   

zBaseScheduling.__init__r}   r  c                 C  s   | j r
| j   d S d S rt   )r  rG  rg   ri   ri   rj   free_buffers_in_scheduler  s   z(BaseScheduling.free_buffers_in_schedulerr  r  OrderedSet[BackendFeature]c                 C  s   t  S )z0Return a set of .codegen.common.BackendFeature()r   r  ri   ri   rj   get_backend_features  r  z#BaseScheduling.get_backend_featuresr{   r^   r|   rl   c                 C  r  )zO
        Check whether node1 and node2 can be vertically fused or not.
        r  r  ri   ri   rj   r       z BaseScheduling.can_fuse_verticalc                 C  r  )zQ
        Check whether node1 and node2 can be horizontally fused or not.
        r  r  ri   ri   rj   r    r;  z"BaseScheduling.can_fuse_horizontalc                 C  sr   |  }t|tjsdS | sdS t|jtjr7t|jjdko6t|jjd tj	o6|jjd 
 |
 kS dS )av  
        A Multi-Output Template (referenced in #144012) is a template node
        with MultiOutputLayout, and its output buffers are instances of MultiOutput.
        In this context, we verify whether node1 represents the Multi-Output Template
        and node2 corresponds to one of its outputs. If so, we further check if
        backend supports this fusion.

        Fr   r   )r  r   r*   r  rU   r   r<   r   r  rk  r  )rh   r{   r|   r  ri   ri   rj   r    s   z.BaseScheduling.can_fuse_multi_outputs_templater   c                 C  s   |  s|  rt||S t||rt||S t|tr#||S t|tr<t|t	r<t|j
tjs6J t||S t||S )z 
        Fuse two nodes
        )r  r  ro   r   r   r  r   r  rQ  r   r   r*   r  r  r  r   r  ri   ri   rj   ro     s   


zBaseScheduling.fuser#  r  "tuple[tuple[sympy.Expr, ...], ...]c                 C  r  )z[
        Process the iteration sizes in case a transformation needs to be applied.
        r  )rh   r#  ri   ri   rj   r    r;  zBaseScheduling.group_fnr  epilogue_nodesr  r  rj  c                 C  r  )z
        Given a template node, generate a kernel.

        This function is only available for triton now. If the third-party backend behaves as a sub-class
        of TritonScheduling, it can override it or reuse it.
        r  )rh   r  r=  r  ri   ri   rj   r    s   zBaseScheduling.codegen_templateNr  r  r  r\  r   c                 C  r  zD
        Generate a kernel given a list of pre-fused nodes.
        r  )rh   r  r  r  ri   ri   rj   r    s   	z.BaseScheduling.generate_kernel_code_from_nodesr   "FusedSchedulerNode | SchedulerNodec                 C  r  r>  r  re  ri   ri   rj   r       zBaseScheduling.codegen_noder  c                 C  r  rt   r  re  ri   ri   rj   r  !  r  z*BaseScheduling.codegen_mix_order_reductionc                 C  r  )zt
        Generate synchronization code for the kernel. This method depends on the hardware characteristics.
        r  rg   ri   ri   rj   r  $  r@  zBaseScheduling.codegen_syncc                 C  r  )z
        Check whether the backend is requesting the scheduler to flush the generated kernel.
        If not supported, please return False.
        Fri   rg   ri   ri   rj   r  *     zBaseScheduling.ready_to_flushc                 C  r  )z]
        Flush the generated kernel and python wrapper code to the source code file.
        r  rg   ri   ri   rj   rH  1  r@  zBaseScheduling.flushr  c                 C  r  )r  r  r  ri   ri   rj   r  7     z$BaseScheduling.benchmark_fused_nodesr  r   c                 C  r  )z
        Benchmark a compiled module and return the execution time
        in milliseconds on randomly generated inputs.
        r  )rh   r  ri   ri   rj   r  @  rA  z)BaseScheduling.benchmark_codegened_moduler   c                 C  r  )z
        Return an unsigned integer which represents the priority of this fusion pair.
        The smaller is with higher priority.
        r   ri   r  ri   ri   rj   r>  G  rB  z'BaseScheduling.get_fusion_pair_priorityr  r!  c                 C  r  )z
        Benchmark the list of nodes to combine and return the execution time
        and memory copy time in milliseconds on randomly generated inputs.
        r  )rh   r  r#  ri   ri   rj   r"  P  rB  z%BaseScheduling.benchmark_combo_kernelnode_scheduler  c                 C  s2   |rddl m} |||}tjj|| d S d S )Nr   )'set_kernel_post_grad_provenance_tracing)r  rD  rY   r   r$  write_provenance_debug_handle)rh   rC  r  rD  debug_handleri   ri   rj   codegen_commentY  s   zBaseScheduling.codegen_comment)r  r7  r=  )r  r  r}   r9  r   r  )r#  r  r}   r<  )r  r^   r=  r  r  r  r}   rj  rt   r3  )r   r?  r}   r  )r   r  r}   r  r>  r2  )r  r   r}   r  r   r4  )rC  r  r  rj  r}   r  )ru   rv   rw   r\  r8  r:  r  r  r  ro   r  r  r  r  r  r  r  rH  r  r  r>  r"  rG  r  ri   ri   r  rj   rN    s.    














	

	rN  )r}   r  )r  r^   r}   r   )r  r^   r}   r  )r  r^   r}   rR  )r  r   r}   r   )r   r^   r  r  r*  rP  r}   r  )rg  rh  r}   r  )rg  rh  r  r   r   r  r}   r  )ri   )r  r  r#  rC  r  r   r}   r  )r5  r6  r  r7  r}   r  r  )rI  r   rJ  r   rK  r   rL  r   rM  rB   r}   rN  )rY  rf  rZ  rf  rI  r   rJ  r   rK  r   rL  r   rM  rB   r}   rl   r>  )r   rd  r}   re  )r   r^   r}   re  )r  r^   r}   rl   )r{   r^   r|   r^   )
__future__r   rW  r  rA  r/  rS  r  r  r  r  r  r  r  r  r  r   r   concurrent.futuresr   r   r   r   r	   r
   r   typing_extensionsr   torch.utils._ordered_setr   r*   r   r   collections.abcr   r   r   typesr   torch._inductor.codegen.wrapperr   r  r   r   r  torch._inductor.async_compiletorch.utils._pytreer  _pytreer  torch._dynamo.utilsr   r    torch._inductor.autotune_processr   torch._inductor.codecacher   r   torch._inductor.irr   torch._inductor.metricsr   r   torch._inductor.stream_utilsr    %torch.fx.experimental.symbolic_shapesr!   torch.utils._sympy.symbolr"   r#   r$   torch.utils._tritonr%   r  r&   r'   r(   r)   r+   analyze_preserves_zero_maskr,   codegen.commonr-   r.   r/   comm_analysisr0   r1   r2   r3   r4   r5   excr6   r7   fx_utilsr8   r9   r:   r;   r<   r=   r>   r   r?   r  r@   rA   runtime.hintsrB   rC   runtime.runtime_utilsrD   rE   r   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   virtualizedrY   	getLoggerru   r|  _logginggetArtifactLoggerr  r7  r/  r  r   r_   rx   r`   ra   	dataclassrb   rz   r   r   rC  r^   rx  rn  rm  r  rs  r  r  r0  r  rQ  r  r   rr  rx  r   r  r  r  rw  r4  rA  rH  rX  r^  r  r  r  rc  rj  rn  rw  r{  r|  r~  r  r  r   rN  ri   ri   ri   rj   <module>   sJ     P

  k      



&


-%  
X
 o^=   	h
.


 








                                     V