o
    j9:j                    @  s  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZmZmZmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZmZmZmZmZmZmZmZm Z m!Z! d dl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( d d	l)m*Z* d dl+Z+d d
l+m,Z,m-Z-m.Z. d dl/m0  m1  m2Z3 d dl4m5  m6Z7 d dl8Z9d dl:Z9d dl;m6  m<Z= d dl>m?Z? d dl@mAZA d dlBmCZC d dlDmEZE d dlFmGZG d dlHmIZI d dlJmKZKmLZL d dlMmNZNmOZOmPZPmQZQmRZRmSZS d dlTmUZUmVZVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_ d dl`maZa d dlbmcZc d dldmeZe d dlfmgZgmhZhmiZimjZj d dlkmlZl ddlmmnZnmoZo ddlpmqZqmrZrmsZsmtZtmuZu ddlomvZvmwZwmxZxmyZymzZzm{Z{ ddl|m}Z} ddl~mZmZmZmZ ddlmZ dd lmZmZ dd!l6mZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ dd"lmZmZmZ erd d#lTmZ d d$l`mZ d d%lmZ dd&lmZ dd'lmZ dd(lmZ dd)l6mZ neZd*ed+< zd dlZejZd,ZW n ey   dZd-ZY nw e&d.Ze d/Ze d0Ze d1Zee,B Zd*ed2< eeB e,B Zd*ed3< e9jje9jjB Zd*ed4< eeZeje	jd5d6Ze9jjZ	 e!ed7eed7f d8d9ee!eeed7f d7d8d9f dB  f Zd*ed:< dd?d@Zejd,dAG dBdC dCZʐddGdHZːddLdMZ̐ddQdRZ͐ddSdTZΐddZd[Zg d\Zg d]Z	dddbdcZҐddedfZ	dddgdhZedddkdlZe	-dddndlZ	-dddqdlZՐddudvZ֐ddydzZאdd|d}Zؐdd~dZِdddZڐdddZېdddZܐdddZݐdddZdd ZG dd9 d9Zed-dAG dd dZeG dd deZdddZeG dd deZeG dd deZedededededededdZded< 	,dːdddZeG dd deZde-d fdddZeee, ee, gef Zd*ed< G ddń deZG ddǄ deZG ddɄ deZeG dd˄ deZeG dd̈́ deZeG ddτ deZdddфZdddӄZ	,	-		-	dϐddd܄ZdddބZdddZeG dd deZeG dd deZeG dd deZeG dd deZeG dd deZeG dd deZeG dd deZeG dd deZG dd deZeG dd deZeG dd deZ eG dd deZdddZdddZG dd dZeG d d deZG dd deZG dd deZG dd deZG dd	 d	eZ	G d
d deZ
eG dd deZG dd deZed-dAG dd deerZed-dAG dd deeZG dd deZG dd deZG dd deZeG dd deZeG dd deZed-dAG dd deZejd,dAG d d! d!ZG d"d# d#eZG d$d% d%eZeeB eB eB eeeB eB eB  B ZG d&d' d'ZG d(d) d)eZG d*d+ d+eZG d,d- d-eZG d.d/ d/eZG d0d1 d1eZ G d2d3 d3eZ!dՐd7d8Z"ed-dAG d9d: d:eZ#G d;d< d<e#Z$G d=d> d>e$Z%ed-dAG d?d@ d@e#Z&ed-dAG dAdB dBe&Z'G dCdD dDe'Z(G dEdF dFe&Z)G dGdH dHeZ*G dIdJ dJe&Z+G dKdL dLe+Z,G dMdN dNe+Z-G dOdP dPe&Z.G dQdR dRe&Z/G dSdT dTe&Z0G dUdV dVe&Z1G dWdX dXe&Z2G dYdZ dZe2Z3G d[d\ d\e)Z4G d]d^ d^e&Z5G d_d` d`e&Z6G dadb dbe'Z7G dcdd dde&Z8G dedf dfe&Z9G dgdh dhe&Z:G didj dje&Z;ed-dAG dkdl dlZ<G dmdn dne)Z=ed-dAG dodp dpe=Z>G dqdr dre=Z?eG dsdt dteZ@G dudv dve&ZAG dwdx dxeAZBG dydz dzeAZCd֐d}d~ZDG dd de=ZEejG dd deZFG dd7 d7eFZGG dd deFZHed-dAG dd deZIdאddZJed-dAG dd de&ZKed-dAG dd de&ZLdؐddZMed-dAG dd de&ZNG dd de=ZOG dd deZPeG dd dePZQeG dd dePZReG dd dePZSeG dd dePZTG dd de=ZUG dd deUZVG dd deUZWG dd deUZXdِddZYdِddZZdڐddZ[dS (      )annotationsN)Callable	GeneratorIterableIteratorSequence)AbstractContextManagernullcontext)Enum)partial)AnycastClassVarLiteraloverloadSupportsFloatSupportsIntTYPE_CHECKING	TypeAliasTypeVarUnion)assert_neverNeveroverride	ParamSpecSelfTypeIs)patch)ExprIntegerSymbol)identity)GraphModuleSerializer)can_auto_functionalize)metricsget_free_symbols)FakeScriptObject)get_opaque_obj_repris_opaque_value)compute_required_storage_lengthis_boolean_dtype(is_contiguous_for_memory_format_or_falseis_float_dtypemake_channels_last_strides_for
StrideType)&_remove_effect_token_unbacked_bindingscompute_unbacked_bindingsfree_symbolsfree_unbacked_symbolsGuardOnDataDependentSymNodehas_free_unbacked_symbolsIterateExprsrebind_unbackedresolve_unbacked_bindingsShapeEnvSymTypes)Node
OrderedSet)_disable_current_modes)CleanDivFloorDivModModularIndexing)SymT   )configdependencies)BackendFeatureCodegenSymbolget_scheduling_for_deviceindex_prevent_reorderingKernel)Depextract_free_symbols#extract_input_node_reduction_rangesextract_read_writesSymbolUsageCollectorOpsHandlervar_builder)LoopBody)OpCounterCSEOpCountResultReductionType	StoreMode)benchmarker)DevicePropertiesReductionHint)argsortargsort_symcache_on_selfcache_on_self_and_argsceildivconvert_shape_to_inductorconvert_shape_to_symintdeveloper_warningdo_bench_using_profilingdtype_from_sizeget_dtype_sizeget_kernel_metadataGPU_ALIGN_BYTESir_dataclass
is_dynamicis_gpu	sympy_dotsympy_index_symbolsympy_index_symbol_with_prefixsympy_product
sympy_substensor_is_aligned)opsOpsValueV)SympyBoolean)Argument)IntLikeType)CUTLASSTemplate)PythonWrapperCodegen)GraphLowering)IndentedBufferr   rv   TF_P_T_U_V_IntLike_NumLike_OpOverloadsz  prefix	TensorBoxr    IRNode_NodeOrNodesxobjectreturnTypeIs[int | Integer]c                 C  s   t | ttfS N)
isinstanceintr   r    r   Y/home/nk/hobo-godmode/plappi-mvp/.venv/lib/python3.10/site-packages/torch/_inductor/ir.py
_is_static      r   )frozenc                   @  s>   e Zd ZU ded< ded< ded< ded< d	ed
< ded< dS )GraphPartitionSignatureOrderedSet[sympy.Symbol]symbol_inputsz0dict[str, IRNode | sympy.Expr | TorchBindObject]input_nodeslist[IRNode]output_nodeszdict[str, bool]input_deallocationboolskip_cudagraph	list[str]constant_namesN__name__
__module____qualname____annotations__r   r   r   r   r      s   
 r   node_or_nodes_NodeOrNodes | NoneNonec                   s   d fdd  |  d S )Nnodesr   r   r   c                   s   | d u rd S t | ttfr| D ]} | qd S t | tr*|  D ]} | q!d S t | ttttt	j
jjtttttf
sFJ dt|  dd S )NzFound zE, which is not a supported top level IR node. See [Note: Inductor IR])r   listtupledictvalues
ExpandViewDynamicScalarAssertScalarr   sympylogicboolalgBooleanr   r   EffectfulKernelShapeAsConstantBufferOpaqueMultiOutputtype)r   node_check_tensorboxr   r   r      s4   


z%validate_ir.<locals>._check_tensorbox)r   r   r   r   r   )r   r   r   r   validate_ir   s   r   namestrCallable[..., OpsValue]c                   s(   t  tsJ t d fdd}|S )	Nargsr   kwargsr   rq   c                    s   t t | i |S r   )getattrrp   r   r   r   r   r   fn     zops_wrapper.<locals>.fn)r   r   r   r   r   rq   )r   r   r   )r   r   r   r   r   ops_wrapper  s   r   orderSequence[int]&Callable[[Sequence[_T]], Sequence[_T]]c                   s(   t t| tt|  d fdd}|S )NindexSequence[_T]r   c                   0   t  t ks
J  fddtt  D S )Nc                      g | ]} |  qS r   r   .0i)r   	inv_orderr   r   
<listcomp>      z4inverse_reorder.<locals>.reindex.<locals>.<listcomp>lenranger   r   r   r   reindex     z inverse_reorder.<locals>.reindexr   r   r   r   )r   zipr   r   r   r   r   r   r   inverse_reorder  s   r   c                   s   d fdd}|S )Nr   r   r   c                   r   )Nc                   r   r   r   r   )r   r   r   r   r   %  r   z1same_reorder.<locals>.reindex.<locals>.<listcomp>r   r   r   r   r   r   #  r   zsame_reorder.<locals>.reindexr   r   r   r   r   r   same_reorder"  s   r   reindex1&Callable[[Sequence[_U]], Sequence[_V]]reindex2&Callable[[Sequence[_T]], Sequence[_U]]&Callable[[Sequence[_T]], Sequence[_V]]c                   s   d fdd}|S )Nr   r   r   Sequence[_V]c                       | S r   r   r   r   r   r   r   r   .     z fuse_reindexing.<locals>.reindex)r   r   r   r   r   )r   r   r   r   r   r   fuse_reindexing*  s   r   )   r      rD   )   r   r   r   rD   seq#Sequence[int | torch.SymInt | Expr]	shape_envShapeEnv | Nonec                 C  s4   |du st dd | D rt| }|S t|| }|S )z1
    Convert strides to fill order (argsort)
    Nc                 s       | ]}t |ttjfV  qd S r   r   r   r   r   r   sr   r   r   	<genexpr>>      z!get_fill_order.<locals>.<genexpr>)allrZ   r[   )r   r   
sorted_idxr   r   r   get_fill_order8  s
   
r   Sequence[int | Integer]c                   s0   dd t | D   fddtt| D }|S )z
    Convert stride order to fill order
    For channel last format,

    stride order = [3, 0, 2, 1] and fill order = [1, 3, 2, 0]
    c                 S     i | ]\}}||qS r   r   r   idxposr   r   r   
<dictcomp>M      z+stride_order2fill_order.<locals>.<dictcomp>c                      g | ]} | qS r   r   r   lookupr   r   r   N      z+stride_order2fill_order.<locals>.<listcomp>)	enumerater   r   )r   
fill_orderr   r  r   stride_order2fill_orderF  s   r  c                 C  s>   t | |}dd tt| D }t|D ]\}}|||< q|S )z)
    Convert strides to stride order
    c                 S     g | ]}d qS r   r   r   _r   r   r   r   Y      z$get_stride_order.<locals>.<listcomp>)r   r   r   r  )r   r   r   outr   elemr   r   r   get_stride_orderR  s
   

r  replace_symbols_with_hintsr   c                 C     d S r   r   r   r  r   r   r   ir_node_to_tensor_     r  torch.Tensorc                 C  r  r   r   r  r   r   r   r  c  s   IRNode | Nonetorch.Tensor | Nonec                   s   | d u rd S |rt jjj nt  fdd|  D }t| r, fdd|  jD }nt	
|}|  }|  }t|}t|}t jjj  tj||||d }W d    |S 1 s_w   Y  |S )Nc                      g | ]} |qS r   r   r   shape_fnr   r   r   y  r  z%ir_node_to_tensor.<locals>.<listcomp>c                   r  r   r   r   r  r   r   r   |  r  )sizestridedtypedevice)rr   graphsizevarsoptimization_hintr!   get_sizeis_storage_and_layout
get_layoutr  FlexibleLayoutcontiguous_strides	get_dtype
get_devicer`   r   suppress_guardstorchempty_stridedzero_)r   r  r  r  r  r  tr   r  r   r  i  s.   

valueSequence[_T] | NoneSequence[_T | None] | Nonec                 C  s   t | tr
| s
d gS | S r   )r   r   r-  r   r   r   may_convert_to_optional  s   r1  /IRNode | OutputSpec | torch.device | None | str
str | Nonec                 C  sb   t | ts	| d u r| S t | tjr| jS t | ttfr!t|  S t	d|  dt| j
 d d S )Nzget_device_type(: ))r   r   r)  r  r   r   
OutputSpecget_device_typer'  r   r   r   r   r   r   r7    s    r7  "IRNode | torch.device | None | strc                 C  st   t | }|dv rtt| ddkrdS dS |d u s"t| }d u r$dS ddlm} t|ts5J t|t||S )N)cpucudaxpu_backendtritonTFrD   )TritonScheduling)	r7  r   rE   rI   codegen.tritonr>  r   r   
issubclass)r   r  device_schedulingr>  r   r   r   	is_triton  s   
rB  c                 C  s   t | dkS )Nr9  )r7  r   r   r   r   is_cpu  r   rC  Buffer | TensorBox	alignmentr   c                   s   t | tr|  d u st|  st|  rdS tj fdd|  d d D  }tt	|  d dt
|  d d}t||}tjj|S )NFc                 3  s"    | ]}t t| d V  qdS r   N)r   EqrA   r   rE  r   r   r     s     z-is_aligned_realized_tensor.<locals>.<genexpr>rD   )r   r   maybe_get_strider3   
get_strider!  r   AndOrrG  Lerr   r  r  guard_or_false)r   rE  aligned_stridesaligned_last_dim
is_alignedr   rH  r   is_aligned_realized_tensor  s    

$rS  strides1Sequence[_IntLike]strides2shapec                 C  sn   t |t | krt | t |ksJ t|| |D ]\}}}tjj|dr&qtjjt||s4 dS qdS )zP
    Returns true if the strides are equal, ignoring dimensions of size 1 .
    rD   FT)	r   r   rr   r  r  statically_known_leqrO  r   rG  )rT  rV  rW  dims1s2r   r   r   significant_strides_equal  s   $r\  tensorstridesSequence[int | torch.SymInt]c                 C  s   t | s| S tdd t||  D r| S t||  |  s"| S t| \}}g |j}t|  D ]\}}t	j
j|drE|| ||< q3t|j|j|j||j|j}tt||dS )a  
    Tries to match the strides of the tensor to those in the meta_strides. Strides of insignificant
    dimensions - size 0 or 1 - will be updated.

    If there are real stride differences (NHWC vs NCHW), or the tensor is not realized, then the input will be returned
    c                 s  $    | ]\}}t jj||V  qd S r   rr   r  r  statically_known_equalsr   rZ  r[  r   r   r   r     
    
z2try_match_insignificant_strides.<locals>.<genexpr>rD   datalayout)r"  r   r   rK  r\  r!  as_storage_and_layoutr  r  rr   r  r  rX  FixedLayoutr  r  r  offset	is_pinnedr   ReinterpretView)r]  r^  storage
old_layout
new_strider   r   
new_layoutr   r   r   try_match_insignificant_strides  s.   

rq  gmtorch.fx.GraphModulec                 C  sD   | j jddd }dd t|jD |jd< ddlm} ||  d S )Noutputopr   c                 S  s   g | ]\}}|qS r   r   )r   r   r
  r   r   r   r   
  s    z.gm_original_output_strides.<locals>.<listcomp>user_visible_output_idxs)record_original_output_strides)r  
find_nodesr  r   metatorch._inductor.compile_fxrx  )rr  output_noderx  r   r   r   gm_original_output_strides  s   r}  inputsSequence[IRNode]
list[Expr]c                 C  s@   t  }| D ]}|t| ddO }|t| ddO }qt|S )NFunbacked_only)r=   r&   r!  rK  r   )r~  sym_varsinpr   r   r   get_symbolic_inputs  s
   r  c                 C  sH   t | tr| j} t | tr|  } t | tr| j} t | tr"|  S d S r   )r   r   rf  BaseViewunwrap_view
StorageBoxBufferget_namer   r   r   r   try_get_name  s   


r  c                   @  s2  e Zd ZU dZe Zded< dZded< ej	ddZ
d	ed
< ej	ddZded< ej	ddZded< ej	ddZded< ej	ddZded< eejdddZeejdddZedddZdddZdd%d&Zdd'd(Zdd*d+Zdd,d-Zdd.d/Zdd1d2Zdd4d5Zdd6d7Zddd;d<Z	8ddd@dAZddCdDZ ddFdGZ!ddIdJZ"ddLdMZ#ddOdPZ$ddRdSZ%ddTdUZ&ddWdXZ'ddZd[Z(e)dd]d^Z*dd`daZ+ddbdcZ,ddedfZ-dddidjZ.ddldmZ/ddodpZ0ddqdrZ1ddtduZ2ddwdxZ3ddzd{Z4dd|d}Z5dd~dZ6dddZ7dddZ8ddddZ9dddZ:dddZ;dddZ<dddZ=	ddddZ>dddZ?dddZ@	ddddZAdddZBdddZCdddZDdddZE	ddddZFdddZGdddZHdddZIdddZJdddZKdddZLdddZMdddZNeOre)dddZPdS dS )r   zBase class for all intermediate representation (IR) nodes in TorchInductor.

    Note:
        This is an abstract base class. Most methods raise NotImplementedError
        and must be overridden by concrete subclasses.
    zClassVar[OrderedSet[Any]]_current_originsNzClassVar[int | None]_current_stream_idxF)initOrderedSet[Any]originslist[str] | None	tracebacktorch.fx.Node | Noneorigin_nodedict[str, Any]r   
int | None
stream_idxOrderedSet[Node]r   Generator[None, None, None]c                 c  s.    t j}|| B t _z	d V  W |t _d S |t _w r   )r   r  )r  oldr   r   r   current_origins:  s   
zIRNode.current_originsc                 c  s*    t j}| t _z	d V  W |t _d S |t _w r   )r   r  )r  r  r   r   r   current_stream_idxD  s   zIRNode.current_stream_idxr   r   c                 C  s   t | tttttfS r   )r   ComputedBufferInputsKernelInputBufferrl  TemplateBufferr   r   r   r   is_realized_nodeP  s   zIRNode.is_realized_nodec                 C  s
   t | S r   r   createselfr   r   r   wrap_for_lowering]     
zIRNode.wrap_for_loweringattrr   r-  r   r   c                 C  s   t | || d S r   )r   __setattr__)r  r  r-  r   r   r   _post_init_setattr`  s   zIRNode._post_init_setattrc                 C  sZ   t | j}| d| | dtjrt nd  | dd  | di  | d| j d S )Nr  r  r  r   r  )r=   r  r  rE   debug_ir_tracebackr  format_stackr  )r  r  r   r   r   __post_init__f  s   
zIRNode.__post_init__OrderedSet[str]c                 C     t dd |  D S )Nc                 s      | ]}|j V  qd S r   r   r   depr   r   r   r   r      z(IRNode.get_read_names.<locals>.<genexpr>r=   	get_readsr  r   r   r   get_read_namesq     zIRNode.get_read_namesc                 C     | j S r   )r  r  r   r   r   get_tracebackt     zIRNode.get_tracebackc                 C  r  r   r  r  r   r   r   get_origin_nodew  r  zIRNode.get_origin_nodeOperation | Nonec                 C  r  r   r   r  r   r   r   get_defining_opz  r  zIRNode.get_defining_oplist[Subgraph]c                 C     g S )z'Return subgraphs contained in this noder   r  r   r   r   get_subgraphs}     zIRNode.get_subgraphsc                 C  s   t  }| j}t| tr|  }| jrt |g}|D ]:}t|dr*|jr*||j qt	j
jjdi |jg }t|ts>q|D ]}t	j
jj|d }|rR|| q@q|S )Nstack_trace	postToPre)r=   r  r   ExternKernelr  r  hasattrr  addr)  	_inductordebug _inductor_post_to_pre_grad_nodesgetr   r   #_inductor_pre_grad_node_stack_trace)r  stack_tracesr  r  r   pre_grad_nodes	node_namer  r   r   r   get_stack_traces  s6   






zIRNode.get_stack_tracesTshortenSequence[str]c                 C  s   dt | dd }|rt|dkr|d d  d}|  s!|gS g }|  D ]}|d ||d7 }|d	 q'|g| S )
Nzorigins=r   @   =   z...zstack_traces = {
})r   r   r  appendsplit)r  r  r  stack_trace_strr  r   r   r   common_repr  s   

zIRNode.common_reprlinesSequence[object]	multilinec                 C  sb   t |t | | }t tt|}|r&td|}t| j d| dS t| j d| dS )Nz,
z(
z
)(r5  )r   r  mapr   indentjoinr   r   )r  r  r  r  	new_linesr   r   r   
str_helper  s   zIRNode.str_helpertorch.dtypec                 C  r  r   r  r  r   r   r   r&    r  zIRNode.get_dtypetorch.dtype | Nonec                 C      z|   W S  ty   Y d S w r   )r&  NotImplementedErrorr  r   r   r   maybe_get_dtype  
   
zIRNode.maybe_get_dtypeLayoutc                 C     t dt|  d)Nz#get_layout() is not implemented by !r  r   r  r   r   r   r#    r   zIRNode.get_layoutLayout | Nonec                 C  r  r   )r#  r  r  r   r   r   maybe_get_layout  r  zIRNode.maybe_get_layoutr6  c                 C     |   S r   )r#  r  r   r   r   get_output_spec     zIRNode.get_output_specOutputSpec | Nonec                 C  r  r   )r  r  r  r   r   r   maybe_get_output_spec  r  zIRNode.maybe_get_output_specc                 C  s   t |  tS )z4True for single tensor output (excludes MultiOutput))r   r  r  r  r   r   r   has_tensor_output     zIRNode.has_tensor_outputSequence[Expr]c                 C  r  )Nz!get_size() is not implemented by r  r  r  r   r   r   r!    r   zIRNode.get_sizeSequence[_IntLike] | Nonec                 C  r  r   )r!  r  r  r   r   r   maybe_get_size  r  zIRNode.maybe_get_size)_IntLike | sympy.Rel | Sequence[_IntLike]c                 C  r  r   r!  r  r   r   r   rW       zIRNode.shaper   c                 C     t |  S r   )rm   r!  r  r   r   r   	get_numel  r   zIRNode.get_numelc                 C     t jjt|  dS Nr   rr   r  r  statically_known_truer   rG  r  r  r   r   r   is_zero_elements     zIRNode.is_zero_elementsr3  c                 C     t dt|  )a)  
        If the IRNode refers to data which has not been materialized (e.g.,
        it is a Pointwise/Reduction that could potentially have more
        compute fused into it), realize the IRNode into physical memory,
        ending the possibility of fusing into it, but allowing, e.g., multiple
        users to access the data without having to recompute.

        Check StorageBox.realize for a particularly notable implementation.

        TODO(ezyang): I think, in principle, every IRNode should have an
        implementation of this, and most of the time no-op is OK, but you
        really do have to audit each IRNode for this, so for now, raise
        an error if it's not implemented.  Note that some code in graph.py
        will catch this thrown error and suppress it with a warning.
        zrealize NYI on r  r  r   r   r   realize  s   zIRNode.realizewriterIndentedBuffer | Nonec                 C  r  )Nzcodegen_reference NYI on r  r  r
  r   r   r   codegen_reference     zIRNode.codegen_referencetorch.device | Nonec                 C  r  r   r   r  r   r   r   r'    r  zIRNode.get_devicetorch.devicec                 C     |   }|d us
J |S r   r'  r  r  r   r   r   get_device_or_error     zIRNode.get_device_or_errorc                 C     dS NFr   r  r   r   r   has_exceeded_max_reads  r  zIRNode.has_exceeded_max_reads$Callable[[Sequence[Expr]], OpsValue]c                 C     t t| jr   r  r   r   r  r   r   r   make_loader  r   zIRNode.make_loader Callable[[Sequence[Expr]], Expr]c                 C  r  r   r  r  r   r   r   make_indexer  r   zIRNode.make_indexerrU  c                 C  r  r   r  r  r   r   r   rK    r   zIRNode.get_stridec                 C  r  r   )rK  r  r  r   r   r   rJ    r  zIRNode.maybe_get_stridec                 C  r  r   r  r  r   r   r   r    r   zIRNode.get_namec                 C  r  r   )r  r  r  r   r   r   maybe_get_name!  r  zIRNode.maybe_get_namec                 C  s(   z	|   tjjv W S  ty   Y dS w r  )r  rr   r  graph_inputsr  r  r   r   r   is_input_buffer'  s
   zIRNode.is_input_buffer	thresholdc                 C  r  r  r   r  r"  r   r   r   has_large_inner_fn-  r  zIRNode.has_large_inner_fnusersr   c                 C  r  r   r   r  r%  r   r   r   
mark_reuse0  r  zIRNode.mark_reusec                 C  r  r   r   r  r   r   r   realize_hint3  r  zIRNode.realize_hintc                 C  r  r   r  r  r   r   r   r  6  r   zIRNode.unwrap_viewc                 C  r  r   r  r  r   r   r   freeze_layout9  r   zIRNode.freeze_layoutr   r   allow_paddingc                 C  r  r   r  r  r   r*  r   r   r   freeze_layout_with_stride_order<     z&IRNode.freeze_layout_with_stride_orderc                 C  r  r   r  r  r   r   r   r   freeze_layout_with_fill_orderA  r   z$IRNode.freeze_layout_with_fill_orderr  c                 C  r  r   r  r  r  r   r   r   freeze_layout_with_same_orderD  r   z$IRNode.freeze_layout_with_same_orderexact_stridesc                 C  r  r   r  r  r2  r*  r   r   r    freeze_layout_with_exact_stridesG  r-  z'IRNode.freeze_layout_with_exact_stridesdependencies.ReadWritesc                 C  r  r   r  r  r   r   r   get_read_writesL  r   zIRNode.get_read_writesOrderedSet[Dep]c                 C  
   |   jS r   r6  readsr  r   r   r   r  O  r  zIRNode.get_readsc                 C  r   r   )r   r  r  r   r   r   	num_readsR  r   zIRNode.num_readsr~   c                 C  r  r   r  r  r   r   r   get_storage_numelU  r   zIRNode.get_storage_numelr  r   c                 C  r  r   r  r  r  r   r   r   get_free_symbol_usesX  r-  zIRNode.get_free_symbol_usesc                 C  r  r   r  r  r   r   r   get_reduction_type]  r   zIRNode.get_reduction_typec                 C  r  r   r  r  r   r   r   get_reduction_size`  r   zIRNode.get_reduction_sizec                 C  r  r  r   r  r   r   r   	is_externc  r  zIRNode.is_externc                 C  r  r  r   r  r   r   r   is_no_opf  r  zIRNode.is_no_opr  c                 C  r  r   r  r  r   r   r   constant_to_devicei  r   zIRNode.constant_to_devicec                 C  r  r   r  r  r   r   r   get_mutation_namesl  r   zIRNode.get_mutation_namesc                 C  r  r   r  r  r   r   r   get_operation_nameo  r   zIRNode.get_operation_namec                 C  r  r   r  r  r   r   r   get_inputs_that_alias_outputr  r   z#IRNode.get_inputs_that_alias_outputc                 C  r  r   r   r  r   r   r   r  w  r  zIRNode.dtype)r  r  r   r  )r  r  r   r  r   r   r   r   r   r   )r  r   r-  r   r   r   r   r   r   r  )r   r  r   r  r   r  r   r  T)r  r   r   r  )TT)r  r  r  r   r  r   r   r   r   r  )r   r  r   r  )r   r  r   r6  )r   r  r   r   r   r  )r   r  )r   r  r   r   r   r3  r   r
  r  r   r   r   r  r   r  r   r  r   r  r   rU  r   r   r"  r  r   r   r%  r   r   r   Fr   r   r*  r   r   r   r   r   r   r   r  rU  r   r   r2  rU  r*  r   r   r   r   r5  r   r7  r   r   r   r~   r  r   r   r   r  r  r   r   r   r  )Qr   r   r   __doc__r=   r  r   r  dataclassesfieldr  r  r  r   r  staticmethod
contextlibcontextmanagerr  r  r  r  r  r  r  r  r  r  r  r  r  r  r&  r  r#  r  r  r  r  r!  r  propertyrW  r  r  r	  r  r'  r  r  r  r  rK  rJ  r  r  r!  r$  r'  r(  r  r)  r,  r/  r1  r4  r6  r  r;  r<  r>  r?  r@  rA  rB  rC  rD  rE  rF  r   r  r   r   r   r   r   %  s   
 









!







































c                   @  s   e Zd Zd9ddZd:ddZd;d	d
Zd<ddZd=ddZd>ddZd?ddZ	d@ddZ
dAddZdAddZdBd d!ZdCd#d$ZdDd&d'ZdEd)d*ZdFd,d-ZdGd/d0Z	1dHdId3d4ZdJd6d7Zd8S )K	Operationr   r   c                 C  s   d | _ i | _d S r   )operation_name_config_patchesr  r   r   r   r  }  s   
zOperation.__post_init__r  c                 C     t r   r  r  r   r   r   r'    r  zOperation.get_devicer  c                 C     t | dsJ | jS Nr  )r  r  r  r   r   r   r       zOperation.get_origin_noder  c                 C  rw  )Nr  )r  r  r  r   r   r   get_origins  ry  zOperation.get_originsr  c                 C  rw  )Nr  )r  r  r  r   r   r   get_stream_idx  ry  zOperation.get_stream_idxr   c                 C  s   | j d usJ | j S r   )rs  r  r   r   r   rE    ry  zOperation.get_operation_namer  c                 C  r  )zHGet config patches for this operation (e.g., coordinate_descent_tuning).rt  r  r   r   r   get_config_patches     zOperation.get_config_patchespatchesc                 C  s
   || _ dS )z&Set config patches for this operation.Nr|  )r  r  r   r   r   set_config_patches     
zOperation.set_config_patchesr   c                 C  r  r  r   r  r   r   r   rA    r  zOperation.is_externc                 C  r  r  r   r  r   r   r   rB    r  zOperation.is_no_opr5  c                 C  ru  r   rv  r  r   r   r   r6    r  zOperation.get_read_writesr   c                 C  s   ||   v S r   )r  r  r   r   r   r   
is_user_of  r   zOperation.is_user_ofr  c                 C  r  )Nc                 s  r  r   r   r  r   r   r   r     r  z+Operation.get_read_names.<locals>.<genexpr>r  r  r   r   r   r    r  zOperation.get_read_namesr7  c                 C  r8  r   r9  r  r   r   r   r    r  zOperation.get_readslist[Buffer]c                 C  ru  r   rv  r  r   r   r   get_outputs  r  zOperation.get_outputsr   c                 C     t  S r   r<   r  r   r   r   get_unbacked_symbol_defs  r  z"Operation.get_unbacked_symbol_defsFr  c                 C  r  )a  
        When unbacked_only=True:
        Returns the unbacked symbols which are required to be in scope in
        order to successfully perform codegen for this buffer.  For example,
        a buffer that corresponds to an extern kernel call that takes i0 as
        an argument would return {i0} here.  This is used to generate necessary
        dependencies that ensure we actually bind i0 in codegen before you
        try to use it.

        Note that this is NOT transitive; in particular, if this buffer takes
        in as input another buffer with dynamic shape (e.g., (i0,)), we will
        not report it here, because you will already have a dependency
        on that buffer, which will eventually have a dependency on i0 if
        necessary.

        When unbacked_only=False:
        Similar to `unbacked_only=True` but including all free symbols
        instead of only free unbacked symbols.
        r<   r=  r   r   r   r>    s   zOperation.get_free_symbol_usesr   c                 C  r  )z
        Gets extra global memory size needed by this buffer.
        Some algorithms (e.g. group gemm) may require extra global memory in the generated code.
        r   r   r  r   r   r   get_workspace_size  s   zOperation.get_workspace_sizeNrI  rW  rK  )r   r  )r   r  r\  )r   r  )r  r  r   r   rR  rd  )r   r   r   r   rJ  re  r   r  r   r   r_  rh  rf  )r   r   r   r  r'  r  rz  r{  rE  r}  r  rA  rB  r6  r  r  r  r  r  r>  r  r   r   r   r   rr  {  s(    















rr  c                   @  s*  e Zd ZU ded< ded< ded< ded< ed 		dQdRddZdSddZdTddZeZdUddZ	dVddZ
dWddZdWd d!ZedXd&d'ZeejfdYd*d+ZedZd-d.Zd[d0d1ZedTd2d3Zd\d]d7d8ZdQd^d:d;Zd_d?d@Zd`dBdCZdadDdEZdbdGdHZdWdIdJZdcdLdMZdddOdPZd4S )eLoopsr  r  r  r  Callable[..., Any]inner_fnrU  rangesFr  r   r   r   c                   s,   t  jg  fdd| jD |  R  S )Nc                 3      | ]}t | V  qd S r   r%   r   er  r   r   r         z-Loops.get_free_symbol_uses.<locals>.<genexpr>)r=   unionr  inner_fn_free_symbolsr=  r   r  r   r>    s
   zLoops.get_free_symbol_usesnamesr  r   c                   sF     d jj dt j  g fdd|D  d jg S )N'c                   s    g | ]}| d t  | qS =)r   r   r   r  r   r   r     s     z!Loops._to_str.<locals>.<listcomp>origin_node=)r  r  r   r   r  inner_fn_strr  )r  r  r   r  r   _to_str  s   zLoops._to_strc                 C  
   |  dS Nr  r  r  r   r   r   __str__  r  zLoops.__str__r  c                 C  r  r   r  r  r   r   r   r'    r  zLoops.get_devicer  c                 C  r  r   r  r  r   r   r   r    r  zLoops.get_origin_noder  c                 C  r  r   r  r  r   r   r   r!    r  zLoops.get_sizec                 C  r  r   r  r  r   r   r   get_pointwise_size  r  zLoops.get_pointwise_sizer   r   r   r   c                 O  sN   | dd }| dd }| |i |}|d| |d|p |j t|S )Nr  r  )popr  r  r   r  )clsr   r   r  tbrr   r   r   r     s   
zLoops.creater   rC   c                   s    fddt | D S )Nc                   s*   g | ]\}}|d krt jjnt |qS rD   )r   SZerorl   )r   nr   r   r   r   r         z Loops._index.<locals>.<listcomp>)r  )r  r   r   r   r   _index  s   
zLoops._indexrT   c              	   C  s   t t }t|2 ttdd | j|    |	 W  d    W  d    S 1 s0w   Y  W d    d S 1 s@w   Y  d S Nallow_indexingT)
rS   rr   MockHandlerset_ops_handlerr   r   r$  r  inner_fn_argsgetvalue)r  	opcounterr   r   r   inner_fn_opcount  s   RzLoops.inner_fn_opcountSequence[Sequence[_IntLike]]c                 C  s   |  | jfS r   )r  r  r  r   r   r   r    r   zLoops.inner_fn_argsc                 C  s   t jj| jg|  R  S r   )rr   KernelFormatterHandlerir_to_stringr  r  r  r   r   r   r     s
   zLoops.inner_fn_strNr"  r  c                 C  s&   |d u rd}t |tj}|  j|kS r  )maxrE   realize_opcount_thresholdr  num_opsr#  r   r   r   r$  &  s   zLoops.has_large_inner_fnOrderedSet[Symbol]c                 C  s   |  | j}t| j||dS Nr  )r  r  rM   r  )r  r  r   r   r   r   r  ,  s   zLoops.inner_fn_free_symbolssymbolr    r  c                 C  sR   |  | j}t|}t| | | W d    |jS 1 s!w   Y  |jS r   )r  r  rP   rr   r  r  usages)r  r  r   handlerr   r   r   collect_inner_fn_symbol_usage1  s   
z#Loops.collect_inner_fn_symbol_usager7  c                 C  sv   t tdd* |  r t|  |  |  jW  d    S t|  |  jW  d    S 1 s4w   Y  d S r  )	r   r   r$  r?  rO   r  r!  r@  r:  r  r   r   r   r  :  s   $zLoops.get_readsc                 C     t |  jS r   )r=   r  read_buffersr  r   r   r   r  H  r   zLoops.get_read_namesr   c                 C  r  r   )r   r  r  r  r   r   r   r;  K  r   zLoops.num_readsc                 C  r  )Nz+get_reduction_size() is not implemented by r  r  r  r   r   r   r@  N     zLoops.get_reduction_sizer3  c                 C  r  )Nz+get_reduction_type() is not implemented by r  r  r  r   r   r   r?  S  r  zLoops.get_reduction_typer   c                 C  r  )Nz+constant_to_device() is not implemented by r  r  r  r   r   r   rC  X  r  zLoops.constant_to_devicer_  rh  )r  r  r   r   r\  rW  rK  rS  )r   r   r   r   r   r   )r  rU  r   rC   r   r  )r   rT   r   r  r   r]  r  r   r   r  )r  r    r   r  re  rJ  rf  rU  ri  ) r   r   r   r   r]   r>  r  r  __repr__r'  r  r!  r  classmethodr  rn  rC   INDEXr  r\   r  r  r  r$  r  r  r  r  r;  r@  r?  rC  r   r   r   r   r    sB   
 






	

	



r  r   Expr | Sequence[Expr]r  r  rq   c                C  s"   |j rttd|S td|S )Nnanr   )is_floating_pointrp   constantfloat)r   r  r   r   r   nop_loader_fn^  s   r  c                   @  sL   e Zd ZdddZdddZeZdd	d
ZdddZd ddZd!ddZ	dS )"	Pointwiser   r  c                 C  s   |   rtt| jdS | jS Nr  )r  r   r  r  r  r  r   r   r   r  g  s   zPointwise.make_loaderr   c                 C  r  r  r  r  r   r   r   r  n  r  zPointwise.__str__Sequence[sympy.Expr]c                 C  r  r   r   r  r   r   r   r@  s  r  zPointwise.get_reduction_sizer3  c                 C  r  r   r   r  r   r   r   r?  v  r  zPointwise.get_reduction_typeoutput_nameindexer!Callable[[Sequence[Expr]], Never]varsr  r   c                 C  s"   |   }t|p	d||||S Nunnamed)r  rp   storer  r  r  r  loaderr   r   r   store_outputy  s   zPointwise.store_outputr  r  r   c                 C  s.   |   }ttd||}t|| j|| jdS FMove this to a given device. Requires that all reads are to constants.override_devicer  r  r  r  )r  r   r   ConstantBufferr  r  r  r  r  r  r   r   r   rC    s   zPointwise.constant_to_deviceNrY  r\  )r   r  rU  )r  r3  r  r  r  r  r   r   ri  )
r   r   r   r  r  r  r@  r?  r  rC  r   r   r   r   r  e  s    




	r  c                   @  s6   e Zd ZU ded< dZded< dd
dZdddZdS )Scatterr  output_indexerNrV   scatter_moder  r  r   r   c                 C  s6   |   }ttd||}t|| j|| j| j| jdS )r  r  )r  r  r  r  r  r  )	r  r   r   r  r  r  r  r  r  r  r   r   r   rC    s   zScatter.constant_to_devicer  r3  r  r  r  r  r   c                 C  s6   |   }|d u r
d}tj||| |||| jdS )Nr  )mode)r  rp   r  r  r  r  r   r   r   r    s   zScatter.store_outputri  )r  r3  r  r  r  r  r   r   )r   r   r   r   r  rC  r  r   r   r   r   r    s
   
 
r  
logical_ormaximumminimummulr  bitwise_xor)anyr  minprodsumdotxor_sumz"dict[str, Callable[..., OpsValue]]REDUCTION_COMBINE_FNreduction_typearg_break_ties_leftCallable[..., object]c                   sR   t v rt  S dv rd fdd}|S d	kr"ddd}|S td )Nargmaxargminatuple[object, object]br   tuple[OpsValue, OpsValue]c                   s   | \}}|\}}dkrt ||}nt ||}t ||}trCt ||}t ||}	t |t ||	}t |t ||	} rKt ||nt ||}
t |t ||
}t |||t |||fS )Nr  )	rp   ltgteqr-   ner  logical_andwhere)r  r  a_valuea_indexb_valueb_indexmaskequala_isnanb_isnantier  r  r  r   r   argmax_combine_fn  s&   
z3get_reduction_combine_fn.<locals>.argmax_combine_fnwelford_combine#tuple[OpsValue, OpsValue, OpsValue]c                 S  sR   | \}}}|\}}}|| }|| }	||	 }
|||
  || || | |
  |	fS r   r   )r  r  a_meana_m2a_weightb_meanb_m2b_weightdelta
new_weight	w2_over_wr   r   r   welford_combine_fn  s   


z4get_reduction_combine_fn.<locals>.welford_combine_fnzunknown reduction_type=)r  r  r  r  r   r  )r  r  r  r  r   r  )r  r  )r  r  r  r  r  r   r  r   get_reduction_combine_fn  s   
r  c                      s\  e Zd ZU ded< ded< ded< ded< dkddZeZed dldm fddZdnddZdoddZ	dpd d!Z
dqd#d$Zdrd&d'Zdldmd(d)Zdsd-d.Ze	/dtdud:d;Zedvd>d?Zeejd/fdwdBdCZedxdFdGZedxdHdIZedydMdNZedzdPdQZe	/dtd{dWdXZed|d`daZed}dedfZe	/dtd~dgdhZeddidjZ  ZS )	ReductionrU  reduction_rangesrU   r  r  	src_dtyperY   reduction_hintr   r   c                 C  r  )N)r  r  r  r  r  r   r   r   r    r  zReduction.__str__Fr  r   r  c                   s(   t   t j fdd| jD  B S )Nc                 3  r  r   r%   r  r  r   r   r     r  z1Reduction.get_free_symbol_uses.<locals>.<genexpr>)superr>  r=   r  r  r=  	__class__r  r   r>    s   zReduction.get_free_symbol_usesr  c                 C  r  r   )r  r  r   r   r   r@    r  zReduction.get_reduction_sizer3  c                 C  r  r   )r  r  r   r   r   r?    r  zReduction.get_reduction_typer  r  r  r  reduction_varsSequence[Symbol]r   c              	   C  s8   t | j| j| j| ||}t |pd||| d S r  )rp   	reductionr  r  r  r  store_reduction)r  r  r  r  r"  r-  r   r   r   r%    s   
zReduction.store_reductionr   c                 C     t | jt | j S r   )r   r  r  r  r   r   r   index_length#  r   zReduction.index_lengthSequence[Sequence[Expr]]c                 C  s$   |  | j}|  | jtj}||fS r   )r  r  r  rC   R0_INDEX)r  r   rindexr   r   r   r  &  s   zReduction.inner_fn_argsc                 C  s.   |  | j}|  | jtj}t| j|||dS r  )r  r  r  rC   r)  rM   r  )r  r  r   r*  r   r   r   r  +  s
   
zReduction.inner_fn_free_symbolsr  r  r   c              
   C  s>   |   }ttd||}t|| j|| j| j| j| j	t
jdS )r  r  r  r  r  r  r  r  r  r  )r  r   r   r  r  r  r  r  r  r  rY   DEFAULTr  r   r   r   rC  2  s   zReduction.constant_to_deviceN	dst_dtyper  Callable[_P, OpsValue]r  ReductionType | Literal['scan']reduction_numelr   
input_noder  tuple[ReductionHint, _IntLike]c	           #   
   C  s  |t |g}	tjj|	stjdfS tjj|}
tjjt |}|dkp5tj| t	j
 o5|dvo5tj}|dkr?tjdfS t| }|j}d}|r`tjtjj| dd}tjtjj| dd}nddd}|}|dkr||
|}|dkrytj|fS |d urt|trttdd t|\}}W d    n1 sw   Y  |d ur|d urtjjt || }|
|krtd||||| tjdfS tj|fS |
|ks||d d krtjdfS t| |||||dkr|nd|tjd}ddd}||\}}|r||\}}t|dkrtjdfS t !|" |# \\}}}d}d}|D ].}tjj$||} tjj%| |t&|' }!t(dd |!D }"|"rK|d7 }q"|d7 }q"||kr^tj||
|fS tj)||
|fS )NrD   scanr  r      T)inner_reductionFreduction_numel_hintr   
numel_hintr   c                 S  r  NrD   r   )r6  r7  r   r   r   inner_reduction_splitso     z4Reduction.num_splits.<locals>.inner_reduction_splitsr  zUse previous IRNode's range and reduction_ranges instead of split. current ranges: %s, current reduction ranges: %s, current split: %d, new ranges: %s, new reduction ranges: %srI  r   r  r+  r  r  tuple[Sequence[Expr], bool]c           	        s   |   }|d us
J td t||  |  d| d}| }|jd us%J dd |jD }g }d}t|jdd dD ]7 t	 fd	d
|D rq|
 j  jtjjv rqtjj j }t|jdd }|  t|jdd |krqd}q:||fS )Nr  r  r  r   rg  rf  c                 S  s&   g | ]}t |trt |tjs|qS r   )r   r   r   Numberr   r  r   r   r   r     s    
zBReduction.num_splits.<locals>.get_read_indices.<locals>.<listcomp>Fc                 S  r  r   r   r   r   r   r   <lambda>      z@Reduction.num_splits.<locals>.get_read_indices.<locals>.<lambda>keyc                 3  s    | ]	}| j jv V  qd S r   )r   r2   r?  mdr   r   r         zAReduction.num_splits.<locals>.get_read_indices.<locals>.<genexpr>r  T)r'  r  r$  r&  r!  r6  
range_varssortedr:  r   r  r   r   rr   r  name_to_bufferr   rg  decide_layout)	r  r  cbread_writesrG  indiceschangedbuforiginal_strider   rD  r   get_read_indices  s:   	z.Reduction.num_splits.<locals>.get_read_indicesr   c                 s  s     | ]}|d kp|dkV  qdS )r   rD   Nr   r   r   r   r   r     r   z'Reduction.num_splits.<locals>.<genexpr>)r6  r   r7  r   r   r   )r  r  r   r;  )*rm   rr   r  r  all_unbacked_explicitly_hintedrY   r,  r   has_featurerG   REDUCE_TO_SINGLE_ELEMENTrE   split_reductionsrX   r  multi_processor_count	functoolsr   choicesreduction_split_factorINNERr   r   r   r   r$  rN   !replace_backed_symbols_with_hintslogr  r  r   rF   index_vars_squeezer!  r@  simplify_with_rangesstride_hintsr   keysr   OUTER)#r  r-  r  r  r  r  r  r0  r1  exprsr6  r7  should_splitpropsnum_smmin_elements_per_threadr9  outer_reduction_splitsr  
new_rangesnew_reduction_rangesextracted_numel_hintr  rQ  rM  rN  r
  r"  ranges1	num_outer	num_innerr   jr^  outerr   r   r   
num_splitsA  s   













#

zReduction.num_splits<Callable[[Sequence[_IntLike], Sequence[_IntLike]], OpsValue](Callable[[Sequence[_IntLike]], OpsValue]c                   sf   t jjt|| d fdd|dv r/ttdfd
dfddS S )z1Convert inner_fn from a reduction to an pointwiser   rU  r   r   c                   s,   t  fddtjdd D  D S )Nc                 3  s    | ]} |V  qd S r   r   )r   r*  )r   value_fnr   r   r     s
    
z=Reduction._unroll_reduction_fn.<locals>.fn.<locals>.<genexpr>c                 S  s   g | ]}t |qS r   )r   r   r   r   r   r   r      r  z>Reduction._unroll_reduction_fn.<locals>.fn.<locals>.<listcomp>)rW  reduce	itertoolsproductr   )
combine_fnr  rs  r   r   r     s   z*Reduction._unroll_reduction_fn.<locals>.fnr  r  r*  r  c                   s*   dd |D }| |t  |tjfS )Nc                 S     g | ]}t |qS r   r   expandr   r   r   r   r     r   zDReduction._unroll_reduction_fn.<locals>.value_fn.<locals>.<listcomp>)rp   
index_exprr)  int64r   r*  )flatten_indexr  r   r   rs    s   z0Reduction._unroll_reduction_fn.<locals>.value_fnc                   s    | d S r8  r   r   )r   r   r   r@    s    z0Reduction._unroll_reduction_fn.<locals>.<lambda>N)r   rU  r   r   )r   rU  r*  rU  r   r  )rr   r  r  guard_int_seqr  _fixed_indexerr$  r%  )r  r  r  r  r   )rx  r  r   r  r  rs  r   _unroll_reduction_fn  s   
	zReduction._unroll_reduction_fnr  r   c
                   s  t jjtdkrBd# fdd}
|
d|
d|
d|
dd	v s.J  d
d$ fdd}tj|||t|dS dkrcdv rRd$ fdd}nd$fdd}tj| ||dS tt	rt
tjk rt|dkszt|jrdkrtj| | ||dS | | |||		\}}d%fdd}||}|tjkr|}|dkr|	dusJ ttdd t|	\}}W d   n1 sw   Y  |dusJ |dusJ | | |||||
S |dkr<| | |||||	
}d}tjjrt|trd&d d!}||}|r:t|jts*J t|j |jjd |_|_ ||_!|_"|S tt| |||d"}|S )'zy
        Create a reduction node. May split the reduction to multiple layers to expose
        more parallelism.
        r   valr   r   bool | float | intc                   sT    t jkr	t| S  jrt| tsJ t| t| S t| ts&J t| t| S r   )	r)  r   r  r   r   r   r  r   r   r  r-  r   r   py_cnst2  s   
z!Reduction.create.<locals>.py_cnstrD   )r  r  r  r  z* not supported for zero-dimension tensors!r   r   rq   c                   s   t   S r   rp   r  r   )r-  r  rtypes_to_initsr   r   const_fnH     z"Reduction.create.<locals>.const_fnr  ry  c                   s   t d S r  r  r   r  r   r   r   V  r   zReduction.create.<locals>.fnc                      dd D } | |S )Nc                 S     g | ]}t jjqS r   r   r  r  r	  r   r   r   r   \  r  z0Reduction.create.<locals>.fn.<locals>.<listcomp>r   r   reduction_index)r  r  r   r   r   [     
r  r  c                   s$   t  r| S | dkrt| tjS | S r8  )r   r  rE   min_num_split)r  )r0  r   r   _maybe_increase_split  s
   z/Reduction.create.<locals>._maybe_increase_splitrI  Nr  Tcur_noder   ComputedBuffer | Nonec                 S  sd   |   }t|dkrd S tt|}|tjjvrd S tjj| }t|ts'd S |j	
 d us0J |S r8  )r  r   nextiterrr   r  rI  r   r  rf  r?  )r  
read_namesbufnamerO  r   r   r   _find_split_reduction  s   
z/Reduction.create.<locals>._find_split_reductionr+  )r  r   r   r  )r   r   r   rq   )r  r   r   r   )r  r   r   r  )#rr   r  r  simplifyrm   r  r  r   r   r   r   rE   unroll_reductions_thresholdri   r   r  rp  rY   r,  r   r   r$  rN   !create_multilayer_existing_rangescreate_multilayerr=  mix_order_reductionr   rf  r  r  _split_size_original_inner_fn_original_ranges_original_reduction_ranges)r  r  r-  r  r  r  r  r  r  r1  r  r  r   hintr  r  rh  ri  r  split_reductionr  r   )r-  r  r0  r  r  r  r   r    s   

	




zReduction.creater  _NumLike | Sequence[_NumLike]c              	   C  s   | dv rt |rtdS t|rdS t|jS | dv r0t |r$tdS t|r*dS t|jS t|r6dnd}t|r>dnd}||||||||f|||ftd|fd	|  S )
N)r  r  z-infF)r  r  infTr   rD   )r  r  r  r  r  welford_reducer  online_softmax_reduce)r-   r  r+   r)  iinfor  r  )r  r  zerooner   r   r   default_accumulator  s2   
	zReduction.default_accumulatorc                 C  s   | dkrdS t | |S )Nr  r   )r  r  r  r  r   r   r   default_value  s   zReduction.default_valuer  r~   r7  c                 C  sP   | dkr|S | dkr|dkr|t jkrt jS | dkr&|dkr&|t jkr&t jS |S )NrI     i      )rY   ra  
OUTER_TINY)r  r7  r  r   r   r   _multilayer_second_step_hint  s   
z&Reduction._multilayer_second_step_hintr  c                 C  s   |du rdS t jj| |sdS |  zt| W n
 ty&   Y dS w | }t	|dd D ]\}}t jj|drC|  S q3dS )z
        If we are reducing over the full tensor, and it is non-dense in the last dimension,
        reindex so we reduce over the dense dimension. initially just handle complete
        reduction case
        NrI  rD   )
rr   r  r  rb  r  r	  rh  r  rK  r  )r  r0  r1  r^  r   r   r   r   r   $check_for_split_dense_dim_reindexing%  s$   	z.Reduction.check_for_split_dense_dim_reindexingr  r   
block_sizedefaultr  c           
        sV   |  |}t|g|tjjtt	|d d	 fdd}	|	S )
Nr   r   r#  r  r   rq   c                   sl   |\}| ^ }| |  d fdd}r3t }tt |t|}t||S | S )Nr   rq   c                     s    gS r   r   r   )rM  r  	new_indexr   r   r   bodya  r  zCReduction._multilayer_wrap_loader.<locals>.wrapper_fn.<locals>.body)r   rq   )rc   rp   r  r}  masked)r   r  reduction_blockr  index_dtyper  r  r  r  	need_maskr0  r   )rM  r  r   
wrapper_fnZ  s   


z5Reduction._multilayer_wrap_loader.<locals>.wrapper_fn)r   r#  r  r#  r   rq   )
r  Viewdynamic_reshape_indexerrr   r  r  r  r   rG  rA   )
r  r  r  r0  r  r  r  r1  dense_indexr  r   r  r   _multilayer_wrap_loaderE  s   z!Reduction._multilayer_wrap_loader4Callable[[Sequence[Expr], Sequence[Expr]], OpsValue]original_rangesoriginal_reduction_rangesrh  Sequence[Integer]ri  @Callable[[Sequence[sympy.Expr], Sequence[sympy.Expr]], OpsValue]c                   sN   t dd D sJ dt|t|t| d fd	d
}|S )Nc                 s      | ]}|d kV  qdS )rD   Nr   r?  r   r   r   r   y      zDReduction._multilayer_wrap_loader_existing_ranges.<locals>.<genexpr>z8Only enabled for numel_hint == 1, found original_ranges=merged_indexr  new_reduction_indexr   rq   c                   s:   | d t  }| t d  } |t|t| S r   )r   r   )r  r  original_idxr  r  r  r   r   r   r    s   zEReduction._multilayer_wrap_loader_existing_ranges.<locals>.wrapper_fn)r  r  r  r  r   rq   )r   r  r  r   )r  r  r  r  rh  ri  r  r   r  r   '_multilayer_wrap_loader_existing_rangesp  s   	z1Reduction._multilayer_wrap_loader_existing_rangesr  r  list[Integer]c                   s   |t jt jfvr
|nt j}t|||||||	|}|  |  d
 fdd}tj	j
t|}| |
||}||dt| ksDJ tt|||||t|d |	||d	S )a
        Break a large reduction up into multiple smaller reductions
        recursively
        r   rU  r  r   rq   c                   s    g | |S r   r   r  intermediate_loaderr   r   intermediate_fn  s   z;Reduction.create_multilayer_helper.<locals>.intermediate_fnNr+  )r   rU  r  rU  r   rq   )r)  float16bfloat16r  r  r  r	  r  rr   r  r  r   rm   r  r   r   )r  r  r-  r  r  r  r  rh  ri  r  r  r  intermediate_dtypeintermediater  r7  r   r  r   create_multilayer_helper  sD   
z"Reduction.create_multilayer_helperc                 C  sd   t |}t||d  |}| ||}| |||||||
}| ||||||g |||g|||	S )r  rD   )rm   r@   r  r  r  )r  r  r-  r  r  r  r  r  r  r  r1  r0  r  r  r  r   r   r   r    s2   

zReduction.create_multilayerc                 C  s8   |  |||||}| ||||||g ||||	d|
S )r  rI  )r  r  )r  r  r-  r  r  r  r  rh  ri  r  r  r  r   r   r   r    s(   
z+Reduction.create_multilayer_existing_rangesr\  r_  r  rS  rU  )
r  r3  r  r  r  r  r"  r#  r   r   rf  r   r(  ri  r   )r  r  r-  r  r  r  r  r.  r  rU  r  rU  r  r/  r0  r   r1  r  r   r2  )
r  rq  r  rU  r  r   r  r  r   rr  )r  r  r-  r  r  r  r  r  r  r  r  r  r  rU   r  rY   r1  r  r   r   r  r   r  r  r   r  )r  r~   r7  r   r  rY   r   rY   )r0  r~   r1  r  r   r  )r  r   r  rU  r0  r~   r  r~   r  r~   r  r  r1  r  r   r  )r  r  r  r  r  r  rh  r  ri  r  r   r  )r  r  r-  r  r  r  r  r  r  r  r  r  rh  r  ri  r  r  rU   r  r~   r  rY   r   r   )r  r  r-  r  r  r  r  r  r  r  r  r  r  rU   r  r~   r  rY   r1  r  r   r   )r  r  r-  r  r  r  r  r  r  r  r  r  rh  r  ri  r  r  rU   r  rY   r   r   ) r   r   r   r   r  r  r]   r>  r@  r?  r%  r'  r  r  rC  rn  rp  r  r  rY   r,  r  r  r  r  r  r  r  r  r  r  __classcell__r   r   r   r   r    s^   
 







 -+ Q	*?-r  r  r  Sequence[int] | Nonerj  r   r  c                   s   d fdd}|S )	1A closure containing math to read a given elementr   r   r   r   c                   sd   d urt | t ksJ t | t ksJ  }t| D ]\}}}|dkr/|||  }q |S r8  )r   r   )r   resultr   stszrj  r  r  r   r   r  *  s   z_fixed_indexer.<locals>.indexerN)r   r   r   r   r   )r  r  rj  r  r   r  r   r  #  s   	r  INNER_FN_TYc                      s2   e Zd ZU ded< d fddZd ddZ  ZS )!MultiOutputReductionr   output_indexr  r  r-  r  	inner_fns#INNER_FN_TY | Sequence[INNER_FN_TY]r  r  r  r  rU   r  r  rY   c
              
     sX   t  r f t dkr d }
nd fdd	}
t j|||
|||||d
 |	| _d S )NrD   r   r   r  reduction_idxr   tuple[OpsValue, ...]c                   s   t  fddD S )Nc                 3  s    | ]}| V  qd S r   r   r   r   r   r  r   r   r   S  r  z@MultiOutputReduction.__init__.<locals>.loader.<locals>.<genexpr>)r   r  r  r  r   r  P  s   z-MultiOutputReduction.__init__.<locals>.loaderr+  )r   r  r  r  r   r  )callabler   r  __init__r  )r  r  r-  r  r  r  r  r  r  r  r  r   r  r   r  <  s    


zMultiOutputReduction.__init__r  r3  r  r  r  r  r"  r#  r   r   c              	   C  sX   t | j| j| j| ||}t|ttfsJ t	||| j
 }t |p&d|||S r  )rp   r$  r  r  r  r  r   r   r   r   r  r%  )r  r  r  r  r"  r   r-  r   r   r   r%  a  s   

z$MultiOutputReduction.store_reduction)r  r  r-  r  r  r  r  r  r  r  r  rU   r  r  r  rY   r  r   )
r  r3  r  r  r  r  r"  r#  r   r   )r   r   r   r   r  r%  r  r   r   r   r   r  9  s   
 %r  c                   @  s"   e Zd ZeejdfdddZdS )OnlineSoftmaxReductionNr  r  r-  r  r  r  r  r  r  r  
num_outputr   r  rY   r1  r  r   Sequence[TensorBox]c
                   s<   t  fddt|D }
|
D ]}|  q|
S )z>
        Create the reduction disregarding splitting.
        c                 3  s.    | ]}t t d |	V  qdS )r  N)r   r  r  r   
output_idxr  r-  r  r  r  r  r  r   r   r     s"    
z0OnlineSoftmaxReduction.create.<locals>.<genexpr>)r   r   r	  )r  r  r-  r  r  r  r  r  r  r1  resultsr,  r   r  r   r  t  s   
zOnlineSoftmaxReduction.create)r  r  r-  r  r  r  r  r  r  r  r  r  r  r   r  rY   r1  r  r   r  )r   r   r   r  rY   r,  r  r   r   r   r   r  s  s
    
r  c                   @  s<   e Zd ZeejfdddZedddZedddZ	dS )WelfordReductionr  r  r  r  r  Sequence[Callable[..., Any]]r  r  r  r  rU   r  rY   r   r  c              
     sF  dv sJ t jjrJ dtjjt}dfdd}	|d	kr5|	d	}
|	d	}|	d	}|
||fS |d
kr_dfdd dkrT d	 |	d	|	d
fS t fddD S t	j
d	 |d\}}tjkrw||d
kr| |S fddtdD }|D ]}|  q|S )N)r  r  zGwelford reduction usage is explicitly disabled, please check you configr  r   r   r   c                   s&   d fdd}t j|tdS )	Nr   r  r   rq   c                   s   t  S r   r  r   )r  r  r   r   r    s   z8WelfordReduction.create.<locals>.const.<locals>.inner_fnr  r   r  r   rq   r  r  r   )r  r  )r  r  r  r  r   const  s   z&WelfordReduction.create.<locals>.constr   rD   r  r  c                   s&   d fdd}t j|tdS )	Nr   r  r   rq   c                   r  )Nc                 S  r  r   r  r	  r   r   r   r     r  zKWelfordReduction.create.<locals>.copy.<locals>.inner_fn.<locals>.<listcomp>r   )r   r  )r  r  r   r   r    r  z7WelfordReduction.create.<locals>.copy.<locals>.inner_fnr  r  r  )r  r  )r  r  r  r  r  r   copy  s   z%WelfordReduction.create.<locals>.copyr  c                 3      | ]} |V  qd S r   r   r  )r  r   r   r     r  z*WelfordReduction.create.<locals>.<genexpr>)r  r0  c                   s*   g | ]}t t |	qS r   )r   r  r  r  )r  r  r  r  r  r  r  r   r   r   	  s     z+WelfordReduction.create.<locals>.<listcomp>r   )r  r   r   r   )r  r  r   r   )rE   mtiadisable_welford_reductionrr   r  r  r  rm   r   r  rp  rY   r,  r  r   r	  )r  r  r  r  r  r  r  r  r0  r  meanm2weightr  r  r  r,  r   )r  r  r  r  r  r  r  r  r   r    sZ   




zWelfordReduction.creater   r  c                 C  r  )N)r   r   r   r   r  r   r   r   r  	  r:  zWelfordReduction.default_valuer  r~   c	              
     s&  t tjjttd }	|	r:|dkr:dfd
d}
j||d t	|
ddt	|
ddf|d|dS t
d   t|t fdd|D g | g||}|D ]}|  qbdddtjjt |}||}t|tfdd|D |gd|S )r  r   r  r   r  r  r-  r   r   rq   c                   s   t | S r   r  )r   r  r-  r  r   r   r  5	  s   z4WelfordReduction.create_multilayer.<locals>.constantr0  rD   )r  r  r  r  r  r  r  r  c              	   3  s&    | ]}j | d dV  qdS )r   r  N)r  )r   r  )r  r  r0  r  r  r   r   r   M	  s    	
z5WelfordReduction.create_multilayer.<locals>.<genexpr>r   r  r  r  c                 S  s   |g | |S r   r   )r   r  r  r   r   r   intermediate_loader_fn`	  s   zBWelfordReduction.create_multilayer.<locals>.intermediate_loader_fnc                 3  s     | ]}t  | d V  qdS )r  N)r   r  r   )r  r   r   r   p	  
    
N)r   r  r  r  r-  r   r   rq   )r   r  r  r  r  r  r   rq   )rm   rr   r  r  r  r   rG  rA   r  r   r@   r  r  r   r	  r   r  )r  r  r  r  r  r  r  r  r  r  r  intermediatesr   r7  r   )r  r  r  r  r0  r  r  r   r  	  sb   

	


	z"WelfordReduction.create_multilayerN)r  r  r  r  r  r  r  r  r  r  r  rU   r  rY   r   r  r  )r  r  r  r  r  r  r  r  r  r  r  rU   r  r~   r  rY   r   r  )
r   r   r   r  rY   r,  r  rn  r  r  r   r   r   r   r    s    	{r  c                      s   e Zd ZU ded< ded< ded< ded< ded	< d
ed< ded< ded< ed dIdJ fddZdK fddZdLd#d$ZdMd%d&ZdNd'd(Z	dNd)d*Z
dNd+d,ZdOd-d.ZdPd0d1ZdIdJd2d3Zeejfd4d5dQd=d>ZedRdGdHZ  ZS )SScanr  scan_rangesr  =Callable[[tuple[Any, ...], tuple[Any, ...]], tuple[Any, ...]]rx  zFCallable[[Sequence[_IntLike], Sequence[_IntLike]], Sequence[_IntLike]]r   rY   r  r   r  tuple[torch.dtype, ...]dtypestuple[Callable[..., Any], ...]r  Fr  r   r   r  c                   D   t   t j fdd| jD  B t j fdd| jD  B S )Nc                 3  r  r   r%   r  r  r   r   r   	  r  z,Scan.get_free_symbol_uses.<locals>.<genexpr>c                 3  r  r   r%   r  r  r   r   r   	  r  )r  r>  r=   r  r	  r  r=  r   r  r   r>  	  s   
zScan.get_free_symbol_usesr   c                   0   t | jt | j t | jksJ t   d S r   )r   r  r	  r  r  r  r  r   r   r   r  	     "zScan.__post_init__r  r3  r  %Callable[[Sequence[_IntLike]], Never]r  r  	scan_varsr#  r   c                   sR   |  || t fdd| jD }t| j| j|}t|p d| || j S )Nc                 3      | ]}| V  qd S r   r   r   r  r  r   r   r   	  r  z'Scan.store_reduction.<locals>.<genexpr>r  )	r   r   r  rp   r3  r  rx  r  r  )r  r  r  r  r  r   r  r   r  r   r%  	  s   zScan.store_reductionc                 C  r  )Ncustomr   r  r   r   r   r?  	  r  zScan.get_reduction_typec                 C  r  r   )r	  r  r   r   r   r@  	  r  zScan.get_reduction_sizec                 C  r  r   r  r  r   r   r   r!  	  r  zScan.get_sizec                 C  r  r   r  r  r   r   r   r  	  r  zScan.get_pointwise_sizec                 C  r&  r   )r   r  r	  r  r   r   r   r'  	  r   zScan.index_lengthr  c                 C  .   |  | j}|  | jtj}| ||}|fS r   )r  r  r	  rC   r)  r   r  r   r*  r   r   r   r   r  	     zScan.inner_fn_argsc                 C  8   |  | j}|  | jtj}| ||}t| j||dS r  )r  r  r	  rC   r)  r   rM   r  r  r  r   r*  r   r   r   r   r  	     zScan.inner_fn_free_symbolsT)can_fallback_to_atenr  r  +tuple[Callable[[Sequence[Expr]], Any], ...]axisr  r   Sequence[TensorBox | None]c                  s  g d    d d    g	t jtjs$d gt S tdkr9t jtjs9d gt S t jj}
|
t		}ttksNJ |

t|drgfddttD S | jd d  	|d\}t
|dkrtjjd u ptotdkotdk}|s|rd gt S d}nt
d 	fdd	
fddttD }|D ]}|  q|S )NrD   c                   &   g | ]}t j | | d qS r  r  r  r   r  r  r  r  r  r   r   r   	      zScan.create.<locals>.<listcomp>r   )r  r  r  r  pointwise_rangesr	  rx  
scan_numelz3.3.0r   r  
scan_indexr   r  c                   H   t |t ks
J t | t ksJ g | d   ||  d  S r   r   )r   r)  )r  r'  r	  r   r   r   
      zScan.create.<locals>.reindexc                   sB   g | ]}t 	d| | 
 |d qS ))r  r  r  r  r  r  r  r	  rx  r   r  r  r   r  r$  )rx  r  r  r  r   r'  r  r   r	  	scan_typer  r   r   r   
  s*    )r   r  r)  r  r   r  )rr   r  rS  rG   SCANr   TUPLE_REDUCTIONr  r  rm   r  r   rN  r   rp  r  r)  versionhip
has_tritontriton_version	SplitScanr	  )r  r  r  r  r  r  rx  r  r  r   r  r(  rp  supports_splitr  r  r   )r  rx  r  r  r  r   r'  r  r   r	  r-  r  r   r  	  sV    







zScan.creater  r  r  r  r'  r(  r   r2  c	           
   
     s*   d
 fdd}	t j||||	||d|d	S )Nr   r  r  r   rq   c                   s$   g | d   ||  d  S r   r   r  r  r  r   r   r  7
  s   $z#Scan.num_splits.<locals>.wrapper_fnr3  )r  r-  r  r  r  r  r  r0  )r   r  r  r  r   rq   )r  rp  )
r  r  r  r  r  r'  r	  rx  r(  r  r   r6  r   rp  *
  s   zScan.num_splitsr_  r  rI  )
r  r3  r  r  r  r  r  r#  r   r   rU  rS  rf  r  )r  r  r  r  r  r  r  r  r  r   rx  r
  r  rY   r  r   r   r   r   r   )r  r  r  r  r  r  r  r   r'  r  r	  r  rx  r
  r(  r   r   r2  )r   r   r   r   r]   r>  r  r%  r?  r@  r!  r  r'  r  r  r  rY   r,  r  rp  r  r   r   r   r   r  |	  s6   
 






	br  c                   @  s   e Zd ZdS )r4  N)r   r   r   r   r   r   r   r4  G
  s    r4  c                      s   e Zd ZU ded< ded< ded< ded< ded	< d
ed< ded< ded< ded< ed d;d< fddZd= fddZd>d"d#Zd?d$d%Zd@d&d'Z	d@d(d)Z
d@d*d+ZdAd,d-ZdBd/d0Zd;d<d1d2ZeejfdCd9d:Z  ZS )DSortr  sort_rangesr  z:Callable[[Sequence[Expr], Sequence[Expr]], Sequence[Expr]]r   rY   r  r   r  r  r  r  r  r   stable
descendingFr  r   r  c                   r  )Nc                 3  r  r   r%   r  r  r   r   r   b
  r  z,Sort.get_free_symbol_uses.<locals>.<genexpr>c                 3  r  r   r%   r  r  r   r   r   e
  r  )r  r>  r=   r  r8  r  r=  r   r  r   r>  ]
  s   
zSort.get_free_symbol_usesr   c                   r  r   )r   r  r8  r  r  r  r  r   r   r   r  i
  r  zSort.__post_init__r  r3  r  r  r  r  r"  r   c                   sV   |  || t fdd| jD }t| j|| j| j}t|p"d| || j	 S )Nc                 3  r  r   r   r  r  r   r   r   u
  r  z'Sort.store_reduction.<locals>.<genexpr>r  )
r   r   r  rp   sortr  r9  r:  r  r  )r  r  r  r  r"  r   r  r   r  r   r%  m
  s   zSort.store_reductionc                 C  r  )Nr;  r   r  r   r   r   r?  {
  r  zSort.get_reduction_typec                 C  r  r   )r8  r  r   r   r   r@  ~
  r  zSort.get_reduction_sizec                 C  r  r   r  r  r   r   r   r!  
  r  zSort.get_sizec                 C  r  r   r  r  r   r   r   r  
  r  zSort.get_pointwise_sizec                 C  r&  r   )r   r  r8  r  r   r   r   r'  
  r   zSort.index_lengthr(  c                 C  r  r   )r  r  r8  rC   r)  r   r  r   r   r   r  
  r  zSort.inner_fn_argsc                 C  r  r  )r  r  r8  rC   r)  r   rM   r  r  r   r   r   r  
  r  zSort.inner_fn_free_symbolsr  r  'tuple[Callable[[list[Expr]], Any], ...]r  r   r   c	                   s<  g 	d   	 d d  	  g
t jtjs$d gt S t jj}
|
t
}t	j
jr8t	j
j}nd}t	j
joF|
t||}|sPd gt S ttksZJ |
t|drs	fddttD S d 
fd
d	
fddttD }|D ]}|  q|S )NrD   r  c                   r!  r"  r#  r$  r%  r   r   r   
  r&  zSort.create.<locals>.<listcomp>r   r  
sort_indexr   r  c                   r*  r   r+  )r   r=  )r  r'  r8  r   r   r   
  r,  zSort.create.<locals>.reindexc                   sD   g | ]}t td| | 	|
 d qS ))r  r  r  r  r  r  r  r8  r   r  r  r9  r:  r   )r   r  r7  r$  )r:  r  r  r  r   r'  r  r   r  r8  r9  r   r   r   
  s,    )r   r  r=  r  r   r  )rr   r  rS  rG   SORTr   r  r  rm   rE   r=  decompose_sort_opspersistent_reductionsr  r   rN  r   r	  )r  r  r  r  r  r  r9  r:  r  r   r  
sort_numelis_persistent_kernel
max_rblockr  r  r   )r  r:  r  r  r  r   r'  r  r   r  r8  r9  r   r  
  s4    





zSort.creater_  r  rI  )
r  r3  r  r  r  r  r"  r  r   r   rU  rS  rf  r  )r  r  r  r  r  r<  r  r  r  r   r9  r   r:  r   r  rY   r   r   r   r   )r   r   r   r   r]   r>  r  r%  r?  r@  r!  r  r'  r  r  r  rY   r,  r  r  r   r   r   r   r7  L
  s0   
 







r7  c                 C  s(   z	t | dd W dS  ty   Y dS w )NFfreezeT)rh  r  r   r   r   r   r"  
  s   r"  c                 C  s@   zt | dd\}}| r|  | W S  ty   Y dS w NFrD  )rh  should_pad_stridespad_stridesis_contiguousr  )r   _bufferrg  r   r   r    is_contiguous_storage_and_layout
  s   
rK  rE  want_contiguousstride_orderSequence[int | Integer] | Noner*  r2  tuple[StorageBox, Layout]c           	      C  s   t | trt| j|||||dS t | tr)t| j|||||d\}}| | j fS t | trc|r[|r?|   |   s>J n|durK| j	||d n|durW| j
||d n|   t| |  fS t | trvt| j|d\}}|| jfS t)z
    Try to simplify x into a StorageBox and a Layout.

    allow_padding only affect how we apply stride_order. When allow_padding
    is True, we have the freedom to add padding when applying the stride_order.
    rE  rL  rM  r*  r2  Nr*  rD  )r   r   rh  rf  r  r#  r  r)  rI  r,  r4  rJ  rl  rg  r  )	r   rE  rL  rM  r*  r2  r
  rg  bufferr   r   r   rh  
  sR   






rh  c                 C  s2   zt | dd\}}||W S  ty   Y dS w rF  )rh  is_stride_orderedr  )r   rM  rJ  rg  r   r   r   "is_stride_order_storage_and_layout;  s   rT  r   c                 C  sr   t | ttfrt| jS t | tr*| j}tjj	
|jt|j t }t| jp)|S t | tr7|  tjjv S dS r  )r   r   r  is_unalignedrf  rl  rg  rr   r  r  statically_known_multiple_ofrj  rd   r  rf   r  r  unaligned_buffers)r   rg  has_unaligned_layoutr   r   r   rU  E  s   


rU  c                   @  s   e Zd ZU ded< ed dEdFdd	ZdGddZdHddZdIddZe	dJddZ
dKddZdLddZdMddZdNd d!ZdOd#d$ZdPd(d)ZdQd*d+ZdRd-d.ZdSd/d0ZdTd2d3ZdQd4d5ZdQd6d7ZdUd9d:ZdVd<d=ZdWd>d?ZdXdBdCZdDS )Yr  r   rf  Fr  r   r   r  c                 C     | j |S r   rf  r>  r=  r   r   r   r>  [     zBaseView.get_free_symbol_uses*Callable[[Sequence[Expr]], Sequence[Expr]]c                 C  s   t d|  )Nzmake_reindexer NYI on rv  r  r   r   r   make_reindexer_  r   zBaseView.make_reindexerr  c                   &   | j   |  d fdd}|S )Nr   r  r   r   c                   r   r   r   r  innerr   r   r   r  f  r   z&BaseView.make_indexer.<locals>.indexer)r   r  r   r   )rf  r  r]  r  r  r   r_  r   r  b     
zBaseView.make_indexerr  c                   r^  )Nr   r  r   rq   c                   r   r   r   r  r_  r   r   r  o  r   z$BaseView.make_loader.<locals>.loaderr  )rf  r  r]  r  r  r   r_  r   r  k  rb  zBaseView.make_loaderr  c                 C  
   | j  S r   )rf  r&  r  r   r   r   r  t  r  zBaseView.dtyper  c                 C  rd  r   rf  r#  r  r   r   r   r#  x  r  zBaseView.get_layoutr  c                 C  rd  r   rf  r'  r  r   r   r   r'  {  r  zBaseView.get_devicer  c                 C  r  r   r   r  r   r   r   r  ~  r  zBaseView.get_origin_noder   c                 C  rd  r   rf  r  r  r   r   r   r    r  zBaseView.get_namer  c                 C  r  r   r  r  r   r   r   r    r  zBaseView.get_pointwise_sizer%  r   r   c                 C  rY  r   rf  r'  r&  r   r   r   r'    r   zBaseView.mark_reusec                 C  rd  r   rf  r  r  r   r   r   r    r  zBaseView.has_exceeded_max_readsr3  c                 C  rd  r   rf  r	  r  r   r   r   r	    r  zBaseView.realizec                 C  s   | j   d S r   rf  r(  r  r   r   r   r(    r   zBaseView.realize_hintr~   c                 C  rd  r   rf  r<  r  r   r   r   r<    r  zBaseView.get_storage_numelc                 C  rd  r   rf  rA  r  r   r   r   rA    r  zBaseView.is_externc                 C  s$   t | jtsJ t| j| j S r   )r   rf  r  r   is_module_bufferr  r   r   r   rn    s   
zBaseView.is_module_bufferr  c                 C  rd  r   rf  r  r  r   r   r   r    r  zBaseView.get_read_namesr7  c                 C  sF   t tdd t|  |  jW  d    S 1 sw   Y  d S r  )r   r   r$  rO   r  r!  r:  r  r   r   r   r    s   $zBaseView.get_readsc                 C  s"   | }t |tr|j}t |ts|S r   )r   r  rf  )r  r   r   r   r   r    s
   

zBaseView.unwrap_viewr  r  c                 C  s2   |   }ttd||}t||  ||  dS r  )r  r   r   r  r  r&  r!  r  r   r   r   rC    s   zBaseView.constant_to_deviceNr_  r  r   r\  rZ  rY  rO  rP  rW  rK  r\  rS  r^  rR  rU  rI  rg  rJ  re  rH  ri  )r   r   r   r   r]   r>  r]  r  r  rq  r  r#  r'  r  r  r  r'  r  r	  r(  r<  rA  rn  r  r  r  rC  r   r   r   r   r  W  s2   
 


		














r  c                   @  sF   e Zd ZU ded< eddd	ZedddZdddZdddZ	dS )r   r  r  r   r   new_sizerU  r   c                 C  s   t jj}dd |D }|  }dgt|t|  t| }t|t|ks(J tt|D ]_}|| dkrE|| dus>J || ||< q.|| du sTt jj|| rUq.t|st|s|| }|| }|duskJ |dusqJ || }|j	|dddksJ d|   d| d	| q.|S )
zReplace `-1` with correct sizesc                 S  rz  r   r{  r   r   r   r   r     r   z.ExpandView._normalize_size.<locals>.<listcomp>NrI  r   fallbackzBroadcast failed in ExpandView(, z) on dimension )
rr   r  r  r!  r   r   r   is_size_one_or_falser5   r   )r   rq  r  old_sizer   v1v2diffr   r   r   _normalize_size  sB   zExpandView._normalize_sizer  c           
      C  s   |  ||}t|rWt|\}}t|t|j }|dksJ tjjg| }t|j	|jD ]\}}|
tjj|s<|ntjj q-t|j|jt|||j|j}	t||	dS t||dS )Nr   re  )rf  r  )rz  r"  rh  r   r  r   r  r  r   r  r  rr   r  r  ru  ri  r  r  r   rj  rk  rl  r   )
r  r   rq  rm  rn  skipro  r  r  rp  r   r   r   r    s,   zExpandView.createc                 C  r  r   r  r  r   r   r   r!     r  zExpandView.get_sizer\  c                   s6   |   }| j   t|t  d fdd}|S )Nr   r  r   c                   sR   t | d  } t| t ksJ tt D ]} | dkr&tjj| |< q| S r8  )r   r   r   r   r  r  )r   r   actualr{  r   r   r   
  s   z*ExpandView.make_reindexer.<locals>.reindexr   r  r   r  )r!  rf  r   )r  targetr   r   r|  r   r]    s
   
zExpandView.make_reindexerN)r   r   rq  rU  r   rU  )r   r   rq  rU  r   r  rS  rp  )
r   r   r   r   rn  rz  r  r  r!  r]  r   r   r   r   r     s   
 '
r   c                   @  sF   e Zd ZU ded< eddd	ZedddZdddZdddZdS )PermuteViewr  dimsr   r   r   r   r  c                   s   |  |}t|ttt|ksJ t|r>t|\} t j j fdd|D  fdd|D  j	 j
}t||dS t||dS )Nc                      g | ]} j | qS r   r  r   rn  r   r   r   &  r   z&PermuteView.create.<locals>.<listcomp>c                   r  r   r  r   r  r   r   r   '  r   re  )rf  r  )_map_neg_dimsr=   r   r   r"  rh  ri  r  r  rj  rk  rl  r  )r  r   r  rm  rp  r   r  r   r    s   
zPermuteView.create	list[int]c                   s    fdd D S )Nc                   s$   g | ]}|d kr
|nt  | qS r  r+  )r   rY  r  r   r   r   1  s   $ z-PermuteView._map_neg_dims.<locals>.<listcomp>r   )r  r  r   r  r   r  /     zPermuteView._map_neg_dimsr  c                   sD   t | | jt tt| jksJ | j   fdd| jD S )Nc                   r   r   r   r   r  r   r   r   8  r  z(PermuteView.get_size.<locals>.<listcomp>)r=   r  r  r   r   rf  r!  r  r   r  r   r!  3  s
   

zPermuteView.get_sizer\  c                   s`   dd t | jD   fddtt| jD  t ttt| jks'J d
 fdd	}|S )Nc                 S  r   r   r   )r   r   rn  r   r   r   r   =  r   z.PermuteView.make_reindexer.<locals>.<dictcomp>c                   r   r   r   r   invr   r   r   >  r  z.PermuteView.make_reindexer.<locals>.<listcomp>r   r  r   c                   s    fddD S )Nc                   r   r   r   r   r   r   r   r   D  r  z?PermuteView.make_reindexer.<locals>.reindex.<locals>.<listcomp>r   r   r  r   r   r   A     z+PermuteView.make_reindexer.<locals>.reindexr~  )r  r  r   r   r=   )r  r   r   r  r   r]  :  s
   zPermuteView.make_reindexerN)r   r   r  r   r   r  )r  r   r   r  rS  rp  )	r   r   r   r   r  r  r  r!  r]  r   r   r   r   r    s   
 
r  c                   @  s8   e Zd Zeddddd	ZedddZdddZdS )SqueezeViewNrY  r   r   rY  r  r   c                  sL  t |rzt|\}}g }g } d ur*t tsJ t d kr( t|jk s*J tt|j|j	D ]3\}\}}	 d u rOt
jj|sN|| ||	 q3| kr^|| ||	 q3|dksfJ dq3t|j|j|||j|j}
t||
dS  d u rt|dd | D S |   dksJ t| fddt| D S )Nr   rD   zexpected squeezed size to be 1re  c                 S  s   g | ]}t jj|s|qS r   )rr   r  r  ru  r   r   r   r   r   o  s    z&SqueezeView.create.<locals>.<listcomp>c                   s   g | ]
\}}| kr|qS r   r   r   r   r   r  r   r   r   w      )r"  rh  r   r   r   r   r  r  r   r  rr   r  r  ru  r  ri  r  r  rj  rk  rl  r  r  r!  )r  r   rY  rm  rn  rq  ro  r   r  r  rp  r   r  r   r  K  sF   


	"zSqueezeView.creater  r  >tuple[list[int], Callable[[Sequence[Expr]], tuple[Expr, ...]]]c                   s@   dd | D }dd t | D t|  d
 fdd	}||fS )Nc                 S  s   g | ]}|d kr|qS r  r   r   r   r   r   r   }  r   z(SqueezeView.squeezer.<locals>.<listcomp>c                 S  s   g | ]
\}}|d kr|qS r  r   r  r   r   r   r   ~  r  r   r  r   tuple[Expr, ...]c                   sT   t | t ksJ |  d tjjg  }t| D ]\}}|||< qt|S )N )r   r   r  r  r   r   )r   r  r   r   lengthnot_oner   r   r     s
   "
z%SqueezeView.squeezer.<locals>.reindex)r   r  r   r  )r  r   )r  rq  r   r   r  r   squeezery  s
   zSqueezeView.squeezerrf  r   r   c                 C     t d)Nzuse SqueezeView.create())AssertionError)r  rf  r   r   r   r    r  zSqueezeView.__init__)r   r   rY  r  r   r   )r  r  r   r  )rf  r   r   r   )r   r   r   r  r  rn  r  r  r   r   r   r   r  I  s    -r  c                   @  sX   e Zd ZU ded< ded< dddZdd	d
ZdddZeZedddZ	dddZ
dS )GenericViewr  r  r\  r   r   c                 C  r  r   )r   r  r   r   r   r]    s   zGenericView.make_reindexerr   c                 C  sB   dd t t| jD }t| |}ddtt| d| S )Nc                 S     g | ]}t tj|qS r   )rl   rC   r  r   r  r   r   r   r         z+GenericView.reindex_str.<locals>.<listcomp>zlambda rt  r4  )r   r   r  r   r   r  r  r   )r  	index_old	index_newr   r   r   reindex_str  s
   zGenericView.reindex_strc                 C  s$   |  | jd| j d|   gS )Nsize=zreindex=)r  rf  r  r  r  r   r   r   r    s   zGenericView.__str__r   r   rq  r  c                 C  s   | |t ||dS )Nrf  r  r   )r   )r  r   rq  r   r   r   r   r    s   zGenericView.createc                 C  r  r   r  r  r   r   r   r!    r  zGenericView.get_sizeNrp  r\  )r   r   rq  r  r   r\  r   r  rS  )r   r   r   r   r]  r  r  r  r  r  r!  r   r   r   r   r    s   
 


r  c                   @  sb   e Zd ZdZedddZeedddZedddZ	e	dd ddZ
e	dd!ddZdS )"r  z
    This class handles tensor reshaping by computing appropriate index transformations
    to map the new shape back to the original storage layout.
    r   r   r  r   c                 C  s<   t | } t |}tjjjj}|t | dr| | } | S r  )r   r|  rr   r  r  r   evaluate_exprLt)r   r  r  r   r   r   handle_negative_index  s   

zView.handle_negative_indexr   r   rq  r  c                   sr  t tsJ t | \tjjr|S t	t
dkp.t	t
dk}t|}dd	d
d fdd}dv rVdfdd} |t|dS |ra|tS t|si||S t|dd\}}|j}	tjj}
tjj|	}tjj}ddlm} ||
|||d}|d urdd |D }t|j|j||j|j}t||dS ||S )Nr   r  r   rq  r  ro  r   rl  c                 S  s6   t | dd\}}t|j|j|||j|j}t||dS )NT)rL  re  )rh  ri  r  r  rj  rk  rl  )r  rq  ro  rm  rn  rp  r   r   r   create_reinterpret_view  s   z,View.create.<locals>.create_reinterpret_viewr   c              	     sP   z  } | t|dW S  ty'   t| } | t Y S w )a  
            Handle the case where view is not possible with current strides.
            Try dynamic_reshape_indexer first; if it fails with unbacked
            symbols (guard_or_false can't resolve comparisons), fall back
            to making the tensor contiguous.
            r  )r  r   r4   r  require_contiguousr$  r%  )r   r   r  r  rq  rv  r   r   "handle_unbacked_or_dynamic_reshape  s   

z7View.create.<locals>.handle_unbacked_or_dynamic_reshaper   r   tuple[int, ...]c                   s   t dgt  S r  )r   r   r   )rv  r   r   fake_reindex  r  z!View.create.<locals>.fake_reindexr  FrD  )_compute_stridesize_obliviousc                 S  s(   g | ]}t |d r|jjnt|qS r  )r  r   exprr   r   r   r   r   r   r      s    zView.create.<locals>.<listcomp>re  )r  r   rq  r  ro  r  r   rl  r   r   r   r   )r   r   r   r  )r   r   r   resolve_negative_sizer!  rr   r  r  statically_known_list_equalsr   r3   rK  r   r$  r%  r"  rh  r  to_symints_or_intstorch._subclasses.fake_implsr  ri  r  r  rj  rk  rl  )r  r   rq  unbacked_symbols_in_sizesrI  r  r  rm  rn  
old_strideold_size_symintold_stride_symintnew_size_symintr  new_stride_symintro  rp  r   r  r   r    sZ   
zView.createrv  tuple[list[Expr], list[Expr]]c                 C  s   dd |D }dd | D } t |}tt|D ]}|| dkr3tjj||< tt| t|||<  nqtj	j
t| t| | |fS )Nc                 S     g | ]	}t jj|qS r   rr   r  r  r  rt  r   r   r   r   6      z.View.resolve_negative_size.<locals>.<listcomp>c                 S  r  r   r  rt  r   r   r   r   7  r  rI  )r   r   r   r   r  Oner?   rm   rr   r  r  check_equals)rv  rq  r   r   r   r   r  2  s   zView.resolve_negative_sizeNrU  	dense_dimr  r   c              
   C  s\   z
|  |||}W |S  tttfy-   t|g}|  ||}|  ||}t||}Y |S w r   )_dynamic_reshape_indexerr  r4   
IndexErrorrm   r   )r  rv  rq  r  r   flatr   r   r   r   r   r  C  s   
zView.dynamic_reshape_indexerr\  c                   s  t jjj d fdd}dd	 tt|D tt|}t| }|d
uo3|t|d ko3t|dk}|rF|d
us<J ||}|	| g |r|r| }| \}	}
|dkri	t
jj |	|	|
f n|
dkrs|	| n||
|dkr	|	 n{||
|dk r||
|dk r| \}}||
 |	 }	|
| }
||
|dk s	|	 t jj|
| nH||
|dkrt
jj}|}	t|	|| || }||
|dkr| }	t|	|| || }|| }||
|dkst jj|
| nt|r|sL|r| }t jj|d 	t
jj |s|r,| \}	}
t jj|
d |s|d
urGt|dkrG   }|| n  tt| ksVJ dfdd}|S )zG
        Perform a reshape entirely by modifying indexing math
        r  r   r  r   r   c                   st    t | |r
dS  t | |rdS  t | |rdS tjj|| r(dS tjj| |r2dS tt | |)z
            Compare two symbolic sizes, returning -1 if a < b, 0 if a == b, 1 if a > b.

            For unbacked symbols, guard_or_false returns False, so we fall back
            to divisibility checks.
            r   rI  rD   )	r   rG  r  Gtrr   r  r  rV  r4   )r  r  )rO  r   r   compare_sizes_  s   z4View._dynamic_reshape_indexer.<locals>.compare_sizesc                 S  r  r   )rl   rC   VIEWr   r   r   r   r     r  z1View._dynamic_reshape_indexer.<locals>.<listcomp>NrD   r   r   r  c                   sH   t | t ksJ t | t ftt|  t fddD S )Nc                 3  r  r   )rn   rt  replacementsr   r   r     r  zAView._dynamic_reshape_indexer.<locals>.reindex.<locals>.<genexpr>)r   r   r   r   r   )r  	view_exprr  r   r     s   $z.View._dynamic_reshape_indexer.<locals>.reindex)r  r   r  r   r   r   r~  )rr   r  r  rO  r   r   r   r   r  r  r   r  r  r  r  rB   r  reverseinsert)rv  rq  r  r  	stack_new	stack_oldreordering_dense_dimold_dimsize_oldvarsize_newvar2	size_new2divisormodulus
dense_exprr   r   )rO  r  r  r   r  T  s   
	$




zView._dynamic_reshape_indexer)r   r   r  r   r   r   )r   r   rq  r  r   r   )rv  r  rq  r  r   r  r   )rv  rU  rq  rU  r  r  r   r   )rv  r  rq  r  r  r  r   r\  )r   r   r   rk  rn  r  r  r   r  r  r  r  r   r   r   r   r    s    mr  c                      s   e Zd ZU dZded< d3 fddZd4d	d
ZeZd4ddZd5ddZ	d6ddZ
ed7ddZd8ddZd8ddZd9ddZd:dd Zd;d!d"Zd3d#d$Zed 	%d<d=d)d*Zd>d?d.d/Zd@d1d2Z  ZS )Arl  z*Pretend our storage has a different layoutr  rg  r   r   c                   s2   t    t| jtrt| d| j  d S d S )Nrf  )r  r  r   rf  r  r   r  r  r  r   r   r   r    s   
zReinterpretView.__post_init__r   c                 C     |  | j| jgS r   )r  rf  rg  r  r   r   r   r    s
   zReinterpretView.__str__c                 C  rd  r   rg  r  r   r   r   r    r  zReinterpretView.get_namer  c                 C     | j jS r   )rg  r  r  r   r   r   r'    r  zReinterpretView.get_devicer  c                 C  r  r   r   r  r   r   r   r    r  zReinterpretView.get_origin_noder  c                 C  r  r   )rg  r  r  r   r   r   r    r  zReinterpretView.dtyper  c                 C     t | jjS r   )r   rg  r  r  r   r   r   r!    r   zReinterpretView.get_sizec                 C  r  r   )r   rg  r  r  r   r   r   rK    r   zReinterpretView.get_strider  c                      d fdd}|S )Nr   r  r   rq   c                   sF    j  }t  || } j j jjkr!t| j jjS |S r   )rg  r  rp   loadr  r  rf  to_dtype_bitcast)r   r  
tmp_loaderr  r   r   r    s
   
z+ReinterpretView.make_loader.<locals>.loaderr   r  r   rq   r   rc  r   r  r   r    s   zReinterpretView.make_loaderr  c                 C  rd  r   )rg  r  r  r   r   r   r    r  zReinterpretView.make_indexerc                 C  r  r   rg  r  r   r   r   r#    r  zReinterpretView.get_layoutc                 C  r  r   r   r  r   r   r   r)    r  zReinterpretView.freeze_layoutFr  r   r   c                 C  s*   t | jj|t | jj|B t | jj|B S r   )r&   rg  r  r  rj  r=  r   r   r   r>    s   z$ReinterpretView.get_free_symbol_usesNr
  r  c                 C  s@   t jjj| j| jj| jj| jj|d ur|j	nt jjj	| jj
dS r  )rr   r  wrapper_codecodegen_reinterpret_viewrf  rg  r  r  rj  	writeliner  r  r   r   r   r    s   z!ReinterpretView.codegen_referencer   c                 C  r  r8  r   r  r   r   r   r;  %  r  zReinterpretView.num_readsrI  r\  rW  rK  rO  rS  rY  rZ  rP  r_  rh  r   rV  rf  )r   r   r   rk  r   r  r  r  r  r'  r  rq  r  r!  rK  r  r  r#  r)  r]   r>  r  r;  r  r   r   r   r   rl    s,   
 









	rl  c                   @  sX   e Zd ZU dZded< edd	d
ZdddZeZe	dddZ
dddZdddZdS )	DtypeViewz(Pretend our storage has a different typer  target_dtyper   r   	new_dtyper   r  c                 C  sH   t |rt|\}}t|j||j|j|j|j}t||dS t	||dS )Nre  )rf  r  )
r"  rh  ri  r  r  r  rj  rk  rl  r  )r  r   r  rm  rn  rp  r   r   r   r  /  s   zDtypeView.creater   c                 C  r  r   )r  rf  r  r  r   r   r   r  >  r  zDtypeView.__str__c                 C  r  r   )r  r  r   r   r   r  C  r~  zDtypeView.dtyper  c                 C  rd  r   rf  r!  r  r   r   r   r!  G  r  zDtypeView.get_sizer  c                   s   j   d fdd}|S )Nr   r  r   rq   c                   s   t  | jjjS r   )rp   r  r  rf  r  r  r`  r  r   r   r  M  s   z%DtypeView.make_loader.<locals>.loaderr  rf  r  rc  r   r  r   r  J  s   
zDtypeView.make_loaderN)r   r   r  r  r   r  r\  rO  rS  rY  )r   r   r   rk  r   r  r  r  r  rq  r  r!  r  r   r   r   r   r  )  s   
 

r  c                   @  s2   e Zd ZdZedd
dZe		ddddZdS )	SliceViewzqView that represents a slice along a single dimension.

    Corresponds to tensor[..., start:end:step, ...].
    r   r   rY  r   startendr   tuple[int, int]c                   s   t jj| | tdd ||fD rtjtjntdd ||fD r0tjtjnjj	dfd
d d fdd}||dd}|||}||fS )zz
        Normalize start and end such that both are in the range
        [0, x.get_size()[dim]] and start <= end.
        c                 s      | ]}t |V  qd S r   )r3   rt  r   r   r   r   d  r  z0SliceView.normalize_start_end.<locals>.<genexpr>c                 s  s*    | ]}t |tr|tjtjV  qd S r   )r   r   hasr   MinMaxrt  r   r   r   r   g  s    
r   r   lowerr   upperr   c                   s<    | |r| n | |}||r|}|S ||}|S r   )statically_known_geqrX  )r   r  r  clamped_lowerclamped_full)max_funcmin_funcr  r   r   clamps  s   
z,SliceView.normalize_start_end.<locals>.clampr  r  r  
Expr | intc                   s$   | d u r|S  | }  | ||S r   )r  )r  r  r  r  )r  r  dim_sizer   r   
clamp_wrap~  s   z1SliceView.normalize_start_end.<locals>.clamp_wrapr   N)r   r   r  r   r  r   r   r   )
r  r  r  r   r  r   r  r  r   r  )
rr   r  r  r!  r  r   r  r  evaluate_minevaluate_max)r  r   rY  r  r  r  r   )r  r  r  r  r  r  r   normalize_start_endY  s"   	zSliceView.normalize_start_endrD   Tstepr  r   c                   s  t ttsdksJ zdkr"|dkr"dkr"|W S W n	 ty,   Y nw t| |r?| | |\}t| d   < t	|r}t
|\}}t|j}	|	   |	 < t|j|j|	|j|j    |j}
t||
dS d fdd	}t||d
S )Nr   l    rD   re  r   r  r   c                   sD   t | t ksJ d|  d t| } |     |  < | S )Nzwrong ndim r  )r   r   r   rY  rq  r  r   r   r   r     s   $z!SliceView.create.<locals>.reindexr  r~  )r   r|  r   r   	TypeErrorr   r!  r  r@   r"  rh  r  ri  r  r  rj  rk  rl  r  )r  r   rY  r  r  r   r  rm  rn  ro  rp  r   r   r  r   r    s8   


	zSliceView.createN)
r   r   rY  r   r  r   r  r   r   r  )rD   T)r   r   rY  r   r  r   r  r   r   r   r  r   r   r   )r   r   r   rk  r  r  r  r   r   r   r   r  S  s    1r  c                   @  sF   e Zd ZU ded< ded< dddZdd
dZdddZdddZdS )BaseConstantr  r  r  r  r   r  c                 C  r  Nr   r   r  r   r   r   r!    r  zBaseConstant.get_sizer  c                 C  r  r   r  r  r   r   r   r'    r  zBaseConstant.get_devicer  c                 C  r  r   r   r  r   r   r   r    r  zBaseConstant.get_origin_noder7  c                 C  r  r   r<   r  r   r   r   r    r  zBaseConstant.get_readsNrS  rW  rK  re  )r   r   r   r   r!  r'  r  r  r   r   r   r   r    s   
 


r  c                   @  sD   e Zd ZU ded< ded< ded< dd	d
ZdddZdddZdS )Constantr   r-  r  r  r  r  r   r  c                   r  )Nr   r  r   rq   c                      t  j jS r   )rp   r  r-  r  r   r  r   r   r    r  z$Constant.make_loader.<locals>.loaderr  r   rc  r   r  r   r       zConstant.make_loaderr3  c                 C  r  r   r   r  r   r   r   r	    r  zConstant.realizer   c                 C     t | j| j|dS )N)r-  r  r  )r  r-  r  r  r   r   r   rC    r  zConstant.constant_to_deviceNrY  rU  ri  )r   r   r   r   r  r	  rC  r   r   r   r   r    s   
 

r  c                   @  s:   e Zd ZU ded< ded< ded< dd	d
ZdddZdS )IndexingConstantr   r   r  r  r  r  r   r  c                   r  )Nr   r  r   rq   c                   r  r   )rp   r}  r   r  r   r  r   r   r    r  z,IndexingConstant.make_loader.<locals>.loaderr  r   rc  r   r  r   r    r  zIndexingConstant.make_loaderr   c                 C  r  )N)r   r  r  )r	  r   r  r  r   r   r   rC    r  z#IndexingConstant.constant_to_deviceNrY  ri  )r   r   r   r   r  rC  r   r   r   r   r	    s   
 
r	  c                 C  sp   d}d}t tt|| D ](\}}|dkrqtjj||s)tjj||s) dS |td|9 }||9 }qdS NrD   FT)	reversedr   r   rr   r  r  rb  r   r  )r  rW  expected_strideexpected_stride_maxr   yr   r   r   is_contiguous_strides_for_shape  s   
r  c                 C  s   t j| j S r   )rE   padding_alignment_bytesitemsizer  r   r   r   get_align_for_dtype  r   r  c                   @  s2   e Zd ZdZdddZdddZ		ddddZdS )r6  zxAbstract base for Layout, MultiOutputLayout, NoneLayout.
    Represents the memory layout of the output of an Operation.r   r  c                 C  r  r   r  r  r   r   r   r'    r   zOutputSpec.get_devicer   c                 C  r  r   r  r  r   r   r   storage_size  r   zOutputSpec.storage_sizeFr  r   r   c                 C  r  r   r  r=  r   r   r   r>    r-  zOutputSpec.get_free_symbol_usesNrW  rf  r_  rh  )r   r   r   rk  r'  r  r>  r   r   r   r   r6    s    

r6  c                   @  s8  e Zd ZdZdeddfdMddZedNddZejdOddZedNddZ	e	jdOddZ	edPddZ
e
jdQddZ
dRd d!ZeZdSd"d#ZdTd%d&ZdUd'd(ZedVd,d-ZdUd.d/ZdWd2d3ZdUd4d5ZedXd7d8ZdYd9d:ZdUd;d<ZdZd>d?Zd[dAdBZd\dEdFZdPdGdHZed 	d]d^dKdLZdS )_r  zo
    Layout base class

    Carries tensor meta-information including offset and
    whether it is pinned.
    Nr   Fr  r  r  r  r  r  r  Sequence[Expr] | Nonerj  r   rk  r   r   r   c                 C  s   |d u r	t |}|| _|| _t|t|ks!J d| d| tdd |D s,J || _|| _|| _|| _	| j	rE| jj
dksGJ dd S d S )Nr  	, stride=c                 s  s    | ]
}t |ttfV  qd S r   )r   r   r   r   r   r   r   r   7      z"Layout.__init__.<locals>.<genexpr>r9  zOnly CPU tensors can be pinned)r$  r%  r  r  r   r   _size_stride_offsetrk  r   )r  r  r  r  r  rj  rk  r   r   r   r  (  s   	
$zLayout.__init__c                 C  r  r   r  r  r   r   r   r  A  r~  zLayout.sizer-  c                 C  
   || _ d S r   r  r  r-  r   r   r   r  E  r  c                 C  r  r   r  r  r   r   r   r  I  r~  zLayout.stridec                 C  r  r   r  r  r   r   r   r  M  r  c                 C  r  r   r  r  r   r   r   rj  Q  r~  zLayout.offsetc                 C  r  r   r  r  r   r   r   rj  U  r  r   c                 C  s   d}| j dkrd| j  }| jjd u rdnd| jj }d}| jr'd| j }t| j d| jj | d| j d| j d	| j | | d
S )Nr  r   z	, offset=:z, is_pinned=z('z', z, size=r  r5  )	rj  r  r   rk  r   r   r  r  r  )r  rj  device_index_stris_pinned_strr   r   r   r  Y  s"   
"zLayout.__str__c                 C  r  r   r  r  r   r   r   r'  i  r  zLayout.get_devicer  c                 C  sP   t j tjt| jt| j| j| j| j	dW  d    S 1 s!w   Y  d S )N)r  r  
pin_memory)
rr   	fake_moder)  r*  r`   r  r  r  r  rk  r  r   r   r   get_examplel  s   $zLayout.get_examplec                 C  s   t | j| jS r   )r  r  r  r  r   r   r   rI  v  r   zLayout.is_contiguousrW  rU  r^  c                 C  sV   t | }|dvs| d dkrdS t|t| | D ]\}}}|dkr(||kr( dS qdS )N)r      rD   FT)r   r   r.   )rW  r^  ndimleftrightr  r   r   r   is_channels_last_contiguousy  s   z"Layout.is_channels_last_contiguousc                 C  sJ   t | jtttt| j| jD ]\}}}|dkr"||kr" dS qdS r
  )r   r  r  r$  r%  r   r  )r  r'  r(  r  r   r   r   is_transposed  s   zLayout.is_transposedr   r   c                   s   t jt  ksJ dd tjD }fdd|D } fdd|D  ddd	}|  d
gt   }tt  D ]
}|| | | < q=tt  d D ]'}|| ||d  k}t|tsrtjj	j
|| ||d  kdd}|rw dS qPdS )Nc                 S  s*   g | ]\}}t jjj|d ddkr|qS )r   rr  rD   )rr   r  r  r   )r   r   rY  r   r   r   r     s
    z,Layout.is_stride_ordered.<locals>.<listcomp>c                   r  r   r  r   r  r   r   r     r   c                   r   r   r   r   r   r   r   r     r  arrr   r   c                   s   t |   fdd| D S )Nc                      g | ]}  |qS r   r   )r   element
sorted_arrr   r   r     r   zDLayout.is_stride_ordered.<locals>.sorted_indices.<locals>.<listcomp>)rH  )r+  r   r.  r   sorted_indices  s   z0Layout.is_stride_ordered.<locals>.sorted_indicesrI  rD   Tr  F)r+  r   r   r   )r   r  r  r  r   r   r   rr   r  
_shape_envr  )r  r   non_1_indicesr  r0  stride_orderedr   r  r   )r   r  r   rS    s*   

zLayout.is_stride_orderedc                 C  s:   dgt ttdt| jd  }t|g| }| |S Nr   rD   )r   r  r   r   r  rS  r.  r   r   r   is_channels_last_stride_ordered  s   "
z&Layout.is_channels_last_stride_ordered
in_stridesc                   s  t |}t| dkr| S tjst|| r| S t }t|dr)|j	
ddr)| S tdd t| |D  }tjs>|r>| S ttjdrHtjjnddfdd r`t fdd| D r`| S t| }t|}dd tt| D }d||d < d}	t|dd ddD ]?\}
}||
d  }|| ||  }t|ttjfo|tjko|| dkpt|tjotj}|||< |rt||| ||< d}	q|	s| S t jd7  _|S )z
        The padding does not change stride order but makes sure all strides larger
        than the threshold are multiple of align.
        r   rz  dislike_paddingFc                 s  r   r   r   r   r   r   r   r     r  z&Layout._pad_strides.<locals>.<genexpr>r1  Nr  sympy.Expr | intr   r   c                   s4    d u rdS t | tjsdS t fdd| jD S )NFc                 3  s    | ]}  |V  qd S r   )is_unbacked_symintr   r   r   r   r     r  zILayout._pad_strides.<locals>.contains_unbacked_symints.<locals>.<genexpr>)r   r   r   r  r2   r  r:  r   r   contains_unbacked_symints  s
   z6Layout._pad_strides.<locals>.contains_unbacked_symintsc                 3  r  r   r   r   )r<  r   r   r     r  c                 S  r  r  r   r	  r   r   r   r     r  z'Layout._pad_strides.<locals>.<listcomp>rD   )r  T)r  r8  r   r   )r  r   rE   pad_channels_lastr  r)  rr   get_current_noder  rz  r  r   rv  chainpad_dynamic_shapesr  r1  r  r  r  r   r  r   r   r   r   padding_stride_thresholdr   r^   r$   num_comprehensive_padding)r6  r  r  aligncurrent_fx_noderh   rM  r  new_stridespaddedrankr   prev_idxr  require_paddingr   )r<  r   r   _pad_strides  sZ   





zLayout._pad_stridesc                 C  s>   t | tsJ t| | jd usJ | | j| j| j| _d S r   )r   r$  r   r  rJ  r  r  r  r   r   r   rH    s   zLayout.pad_stridesc                 C  s   t jot| tS r   )rE   comprehensive_paddingr   r$  r  r   r   r   rG    r  zLayout.should_pad_stridesri  c                 C  s<   t | tr| S |  r|   t| j| j| j| j| j| j	S r   )
r   ri  rG  rH  r  r  r  r  rj  rk  r  r   r   r   as_fixed  s   
zLayout.as_fixedr  c                 C  s(   t jsJ dt| j d|   S )Nzconvert z to FixedLayout first)r$  r  r   r   rL  r  r  r   r   r   r  $  s   zLayout.make_indexerotherr   c                 C  sR   t |to(| j|jko(| j|jko(| j|jko(| j|jko(| j|jko(| j|jkS r   )r   r  r  r  r  r  rj  rk  )r  rM  r   r   r   __eq__*  s   






zLayout.__eq__c                 C     t | j| j| jS r   )r*   r  r  rj  r  r   r   r   r  5  r  zLayout.storage_sizer  r   c                 C  s$   t | j|t | j|B t | j|B S r   )r&   r  r  rj  r=  r   r   r   r>  8  s   


zLayout.get_free_symbol_uses)r  r  r  r  r  r  r  r  rj  r   rk  r   r   r   rS  r-  r  r   r   rT  r-  r   r   r   r\  rX  )r   r  rR  )rW  rU  r^  rU  r   r   )r   r   r   r   )r6  r   r  r  r  r  r   r   rI  r   ri  rZ  )rM  r   r   r   r_  rh  )r   r   r   rk  r   r  rq  r  setterr  rj  r  r  r'  r$  rI  rn  r)  r*  rS  r5  rJ  rH  rG  rL  r  rN  r  r]   r>  r   r   r   r   r    sN    








#
N




r  c                   @  s   e Zd ZdZdddZdS )ri  z A Tensor layout we cannot changer   r  c                 C  rO  )r  )r  r  r  rj  r  r   r   r   r  F  r  zFixedLayout.make_indexerNrZ  )r   r   r   rk  r  r   r   r   r   ri  C  s    ri  c                      s  e Zd ZdZdZdDddZedEd
dZedFddZedGddZ	edHddZ
edIddZedJddZejdKddZedJdd ZejdKd!d ZedLd#d$ZejdMd%d$Z	dNdOd(d)Z	dNdPd+d,ZdQd-d.ZdRd/d0ZdSd2d3ZdTd7d8Z	9	dUdV fdBdCZ  ZS )Wr$  z|
    A Tensor layout that we are allowed to change

    Assumption: layout change should NOT add or remove free symbols
    Fr   ri  c                 C  s   t |  S )z
        Compute what the strides would be if this layout were frozen,
        without actually modifying the layout. This is used for speculative
        stride computation during Triton template code generation.
        )r  deepcopyrL  r  r   r   r   !get_fixed_layout_without_freezingT  s   z0FlexibleLayout.get_fixed_layout_without_freezingsizesr   r  c                 C  sN   t | dkrg S tjjg}t| dd  D ]}|||d   qtt|S )Nr   rD   rI  )r   r   r  r  r  r  r   )rV  reversed_stridesr  r   r   r   r%  ^  s   
z!FlexibleLayout.contiguous_stridesr   c                 C  s\   t tt| t |ksJ | |ftjj}dgt| }|D ]}|||< || |  }q|S )z
        Create a stride based on the order the dimensions should be filled in.

        In this format, channels last would be:
            [1, 3, 2, 0]
        N)r=   r   r   r   r  r  )rV  r   next_strider^  r   r   r   r   fill_orderedg  s   $zFlexibleLayout.fill_orderedr  c                 C  s0   t tt| t |ksJ t|}t| |S )z
        Create a stride based on the sorted order of a permuted range.

        In this format, channels last would be:
            [3, 0, 2, 1]
        )r=   r   r   r  r$  rY  )rV  r   r  r   r   r   r3  x  s   zFlexibleLayout.stride_orderedmemory_formattorch.memory_formatc                 C  sP   |t jkrt| tS |t jkrt| tS |t jkr t| S t	
d| t)aq  
        Create a stride based on a memory format.

        Memory format is translasted into a stride order,
        so channels_last is the same as:
            FlexibleLayout.stride_ordered(sizes, [3, 0, 2, 1])

        This interface does not support memory_format `torch.preserve_format`
        which should be used to deduce a format from another source
        z>stride_ordered_for_memory_format, unsuppored memory_format: %s)r)  channels_lastr$  r3  NHWC_STRIDE_ORDERchannels_last_3dNHWDC_STRIDE_ORDERcontiguous_formatr%  r\  r  r  )rV  rZ  r   r   r    stride_ordered_for_memory_format  s   



z/FlexibleLayout.stride_ordered_for_memory_formatr  rU  c                 C  sD   t | t |ks
J tjj|}ttt ||jd}t	| |S )z
        Create a stride that has the same stride order as given stride

        For example, if given stride is [1000, 1, 100, 10],
        the fill order should be [1, 3, 2, 0]
        rB  )
r   rr   r  r  guarding_hints_or_throwrH  r   __getitem__r$  rY  )rV  r  r  r   r   r   same_ordered  s   
zFlexibleLayout.same_orderedc                 C  r  r   r  r  r   r   r   r    r~  zFlexibleLayout.sizer-  r   c                 C     |  d| || _d S )Nr  )!assert_free_symbol_uses_unchangedr  r  r   r   r   r       
c                 C  r  r   r  r  r   r   r   r    r~  zFlexibleLayout.stridec                 C  re  )Nr  )rf  r  r  r   r   r   r    rg  r   c                 C  r  r   r  r  r   r   r   rj    r~  zFlexibleLayout.offsetc                 C  re  )Nrj  )rf  r  r  r   r   r   rj    rg  r*  r   c                 C  sH   |  | j|}|  r|r| || j| j}t| j| j| j|| j| jS r   )	r3  r  rG  rJ  r  ri  r  rj  rk  )r  r   r*  ro  r   r   r   as_stride_order  s   zFlexibleLayout.as_stride_orderr2  c                 C  s>   |}|   r|r| || j| j}t| j| j| j|| j| jS r   )rG  rJ  r  r  ri  r  rj  rk  )r  r2  r*  ro  r   r   r   as_exact_strides  s   zFlexibleLayout.as_exact_stridesc                 C  D   |  | j|}|  r| || j| j}t| j| j| j|| j| jS r   )	rY  r  rG  rJ  r  ri  r  rj  rk  )r  r   ro  r   r   r   as_fill_order     zFlexibleLayout.as_fill_orderc                 C  rj  r   )	rd  r  rG  rJ  r  ri  r  rj  rk  )r  r  ro  r   r   r   as_same_order  rl  zFlexibleLayout.as_same_order$dict[tuple[str, bool], sympy.Symbol]c                 C  s<   i }dD ]}dD ]}||f}t tt| ||||< qq|S )N)r  r  rj  TF)r=   r&   r   )r  initial_free_symbolsr   r  rC  r   r   r   get_initial_free_symbol_uses  s   
z+FlexibleLayout.get_initial_free_symbol_usesr   r   r6   c                 C  sF   dD ]}| j ||f }tt||}||ks J d| d| qd S )Nro  z)Expected free symbols unchanged, but got z vs )rp  r=   r&   )r  r   r-  r  old_free_symbolsnew_free_symbolsr   r   r   rf    s   
z0FlexibleLayout.assert_free_symbol_uses_unchangedNr  r  r  r  r  rM  rN  rk  c                   s@   |r	t ||}nt |}t j|||||d |  | _d S )Nrk  )r$  rY  r%  r  r  rq  rp  )r  r  r  r  rM  rk  r^  r   r   r   r    s
   
zFlexibleLayout.__init__rR  )rV  r   r   r  )rV  r   r   r   r   r  )rV  r   r   r   r   r  )rV  r   rZ  r[  r   r  )rV  r   r  rU  r   r  rS  rP  rT  rQ  r_  )r   r   r*  r   r   ri  )r2  rU  r*  r   r   ri  )r   r   r   ri  )r  rU  r   ri  )r   rn  )r   r   r-  r6   r   r   r  )r  r  r  r  r  r  rM  rN  rk  r   r   r   )r   r   r   rk  r  rU  rn  r%  rY  r3  ra  rd  rq  r  rS  r  rj  rh  ri  rk  rm  rq  rf  r  r  r   r   r   r   r$  K  sJ    





r$  c                      sL   e Zd ZdZd fddZdd	d
ZdddZed 	ddddZ  Z	S )NonOwningLayoutz,Is a view into the storage of another tensorviewBaseView | TensorBoxr   r   c                   s,   |  }t |j|j|j|j || _d S r   )r#  r  r  r  r  r  r  rv  )r  rv  rg  r   r   r   r  ,  s   
zNonOwningLayout.__init__r  c                 C     |    S r   )rL  r  r  r   r   r   r  6  r   zNonOwningLayout.make_indexerr   c                 C  s4   | j  j}|dkrdS ddlm} tjj||S )Nr   TrD   )	ALIGNMENT)	rv  r#  rj  utilsry  rr   r  r  rV  )r  rj  ry  r   r   r   maybe_guard_aligned9  s
   z#NonOwningLayout.maybe_guard_alignedFr  r   c                 C  sV   t | jtsJ | jj}t |tsJ t||j}t |ts%J t||j|S r   )	r   rv  rl  rf  r  r   r  rg  r>  )r  r  boxinput_bufferr   r   r   r>  A  s   z$NonOwningLayout.get_free_symbol_uses)rv  rw  r   r   rZ  rR  r_  rh  )
r   r   r   rk  r  r  r{  r]   r>  r  r   r   r   r   ru  )  s    


ru  c                   @     e Zd ZdZdS )CommBufferTypesymm_memN)r   r   r   SYMM_MEMr   r   r   r   r  M      r  c                      s4   e Zd ZU dZded< ded< d
 fdd	Z  ZS )CommBufferLayoutax  
    A layout that signifies the buffer is a comm buffer.
    In terms of striding, the layout is identical to `FixedLayout`.

    Buffers with this layout do not participate in in-place reuse - it can be
    neither the source nor the target for in-place reuse.

    For detailed motivation and usage of this layout, see
    NOTE [lowering-time collective optimization].
    r  comm_buffer_typer   
group_namerg  FlexibleLayout | FixedLayoutc                   sJ   t |tr	| n|}t j|j|j|j|j|j	|j
d || _|| _d S )Nr  r  r  r  rj  rk  )r   r$  rL  r  r  r  r  r  r  rj  rk  r  r  )r  rg  r  r  fixedr   r   r   r  `  s   
zCommBufferLayout.__init__)rg  r  r  r  r  r   )r   r   r   rk  r   r  r  r   r   r   r   r  Q  s
   
 r  c                   @  sd   e Zd ZU ded< ejdd dZded< ejdd dZded	< dddZdddZ	dddZ
dS )
NoneLayoutr  r  c                   C     dgS r  r   r   r   r   r   r@  ~  rA  zNoneLayout.<lambda>default_factoryr  r  c                   C  r  r  r   r   r   r   r   r@    rA  r  r   r   c                 C  r  r  r   r  r   r   r   r    r  zNoneLayout.storage_sizer6  c                 C     | S r   r   r  r   r   r   rL    r  zNoneLayout.as_fixedc                 C  r  r   r  r  r   r   r   r'    r  zNoneLayout.get_deviceNrf  rQ  rW  )r   r   r   r   rl  rm  r  r  r  rL  r'  r   r   r   r   r  s  s   
 


r  c                      s   e Zd Zd# fddZed$dd	Zejd%dd	Zd&ddZd'ddZd(ddZ	e
	d)d*ddZd+ddZd,d!d"Z  ZS )-MutationLayoutSHOULDREMOVEr  r   r   r   c                   s@   t  | | | d  || _|   }tj	
| d S r   )r  r  r  r&  r!  r  
get_bufferr  rr   r  mark_buffer_mutated)r  r  r   r   r   r   r    s   z#MutationLayoutSHOULDREMOVE.__init__r  c                 C  r8  r   )real_layoutr  r  r   r   r   r    r  z!MutationLayoutSHOULDREMOVE.strider-  r   c                 C  r  r   r   r  r   r   r   r    r  r   c                 C  rx  r   )r  r  r  r   r   r   r    r   z'MutationLayoutSHOULDREMOVE.storage_sizer  c                   s2   d fdd  | j }t|tsJ t||S )Nr  r   r   c                   sB   t | tr
 | jS t | tr |  S t | tr | jS | S r   )r   r  r  r  r  
MutableBoxrf  )r  unwrap_viewsr   r   r    s   




z;MutationLayoutSHOULDREMOVE.get_buffer.<locals>.unwrap_views)r  r   r   r   )r  r   r  r   )r  r  r   r  r   r    s   
	z%MutationLayoutSHOULDREMOVE.get_bufferr  c                 C  s   |   j}t|tsJ |S r   )r  rg  r   r  )r  rg  r   r   r   r    s   
z&MutationLayoutSHOULDREMOVE.real_layoutFsrcdstunsafe_aliasr   c              	   C  s   |   tj|  t|tr|j}|  |sAt	j
| | | dd t| | D d}t|ttfs>J |j}|   t|dsNJ |t|jjts]J t|jjt||j_|jS )Nc                 S      g | ]\}}t jj||qS r   rr   r  r  check_equals_and_simplifyr   r  r  r   r   r   r         z;MutationLayoutSHOULDREMOVE.realize_into.<locals>.<listcomp>r  rf  )r	  rr   r  r  r  r   r   rf  r(  r  r  r'  r&  r  r   r!  r  r  r  rg  r$  r   r  )r  r  r  r  r   r   r   r   realize_into  s*   
	z'MutationLayoutSHOULDREMOVE.realize_intor   c                 C  r  r   r   r  r   r   r   rL    r  z#MutationLayoutSHOULDREMOVE.as_fixedr  c                 C  rd  r   )r  r  r  r   r   r   r    r  z'MutationLayoutSHOULDREMOVE.make_indexer)r  r   r   r   rS  )r-  r   r   r   rT  )r   r  rP  r_  )r  r   r  r   r  r   r   r   )r   r   rZ  )r   r   r   r  rq  r  rS  r  r  r  r  r  rL  r  r  r   r   r   r   r    s    



'r  c                      sl  e Zd ZU ded< ded< d] fddZd^d
dZd_ddZd`ddZdaddZdbddZ	e
dcddZddddZdedd Zdfd"d#Zdgd%d&Zdhd'd(Zdid*d+Zdjd-d.Zd]d/d0Z	1dkdld5d6Zdmd7d8Zdnd:d;Z	1dkdod=d>Zdjd?d@ZdpdBdCZdqdrdGdHZd]dIdJZdsdLdMZdsdNdOZdtdQdRZed 	1dkdudUdVZ dvdWdXZ!dwdYdZZ"djd[d\Z#  Z$S )xr  r3  r   r6  rg  r   r   c                   s   t    | dd  d S rx  )r  r  r  r  r   r   r   r    s   
zBuffer.__post_init__r  c                 C  rx  r   )r#  r  r  r   r   r   r    r   zBuffer.make_indexerr   c                 C  s   | j sJ | | j S r   r   r  r   r   r   r    ry  zBuffer.get_nametorch.Tensor | torch.SymIntc                 C  s&   t | jtr| j S tt| jjr   )r   rg  r  r$  r  r   r   r  r   r   r   r$    s   
zBuffer.get_exampler  c                 C  rx  r   )r  r'  r  r   r   r   r'    r   zBuffer.get_devicer  c                 C  r  r   r   r  r   r   r   r    r  zBuffer.get_defining_opr  c                 C  r8  r   )r#  r  r  r   r   r   r    r  zBuffer.dtyper  c                 C     g |   jS r   )r#  r  r  r   r   r   r!    r   zBuffer.get_sizer  c                 C  r  r   )r#  r  r  r   r   r   rK    r   zBuffer.get_strider   c                 C  r8  r   )r#  rj  r  r   r   r   
get_offset  r  zBuffer.get_offsetr  c                 C  s"   t | jtr	| jS tt| jjr   )r   rg  r  r  r   r   r  r   r   r   r#    s   zBuffer.get_layoutc                 C  r  r   r  r  r   r   r   r    r  zBuffer.get_output_specr   c                 C  r  r   )r  r  r   r   r   r<    r  zBuffer.get_storage_numelr   c                 C  r8  r   )r#  rk  r  r   r   r   get_is_pinned  r  zBuffer.get_is_pinnedc                 C  s0   t | jtrt | jts| j | _d S d S d S r   )r   rg  r  ru  rL  r  r   r   r   r)    s
   zBuffer.freeze_layoutFr   r   r*  c                 C  0   t | jtsJ t| j| jj||d| _d S NrQ  )r   rg  r$  r   rh  r+  r   r   r   r,  %  s   z&Buffer.freeze_layout_with_stride_orderc                 C  ,   t | jtsJ t| j| j|| _d S r   )r   rg  r$  r   rk  r.  r   r   r   r/  +     z$Buffer.freeze_layout_with_fill_orderr  c                 C  r  r   )r   rg  r$  r   rm  r0  r   r   r   r1  /  r  z$Buffer.freeze_layout_with_same_orderr2  c                 C  r  r  )r   rg  r$  r   ri  r3  r   r   r   r4  3  s   z'Buffer.freeze_layout_with_exact_stridesc                 C  r  r  r  r  r   r   r   r  ;  r  zBuffer.is_zero_elementsr  c                   s*      rtt  dS d fdd}|S )	Nr  r   r  r   rq   c                   s      }t jp
d|| S r  )r  rp   r  r   r   r  r  r   r   r  C  s   z"Buffer.make_loader.<locals>.loaderr  )r  r   r  r&  rc  r   r  r   r  >  s   zBuffer.make_loaderNr
  r  c                 C  r  r   r  r  r   r   r   r  I  r  zBuffer.codegen_referencec                 C  r  r   r   r  r   r   r   rJ  L  r  zBuffer.decide_layoutr  c                 C     t | jtr| jj gS dS r  )r   rg  ru  rv  r  r  r   r   r   rF  O     z#Buffer.get_inputs_that_alias_outputc                 C  r  r  )r   rg  r  r  r  r  r   r   r   rD  T  r  zBuffer.get_mutation_namesr  c                 C  s   t |  gS r   )r=   r  r  r   r   r   r  Y  r   zBuffer.get_read_namesr  r   c                 C  r  r   r<   r=  r   r   r   r>  \     zBuffer.get_free_symbol_usesc                 C  r  r   r<   r  r   r   r   r  b  r  zBuffer.get_unbacked_symbol_defsc                 C  r  r   r   r  r   r   r   r	  e  r  zBuffer.realizec                 C  r  r  r   r  r   r   r   should_allocateh  r  zBuffer.should_allocaterI  rZ  r\  )r   r  rW  rL  rO  rS  )r   r  rT  rP  rQ  rf  rR  r_  r`  ra  )r  r   r   r   )r2  r   r*  r   r   r   rY  r   rV  rj  rJ  rh  r  rU  )%r   r   r   r   r  r  r  r$  r'  r  rq  r  r!  rK  r  r#  r  r<  r  r)  r,  r/  r1  r4  r  r  r  rJ  rF  rD  r  r]   r>  r  r	  r  r  r   r   r   r   r    sL   
 






















r  c                   @  s0   e Zd ZdddZdddZejZdd	d
ZdS )OperationBufferr   r  c                 C  s   | gS r   r   r  r   r   r   r  p  r  zOperationBuffer.get_outputsrr  c                 C  r  r   r   r  r   r   r   r  s  r  zOperationBuffer.get_defining_opr   c                 C  s   t |  t|  d S r   )r  r  rr  r  r   r   r   r  y     
zOperationBuffer.__post_init__Nr  r   rr  rI  )r   r   r   r  r  rr  rE  r  r   r   r   r   r  m  s
    

r  c                   @     e Zd ZdddZdS )r  r   r   c                 C  r  r8  r   r  r   r   r   r;    r  zInputBuffer.num_readsNrf  )r   r   r   r;  r   r   r   r   r  ~      r  c                   @  r~  )DonatedBufferaY  
    Represents a donated buffer which is a saved tensor that is not alias to any
    fwd inputs, fwd user outputs, and bwd outputs. We generally cannot inplace
    reuse the input tensor memory during backward since it might be used in another
    function. However, donated buffer can be inplace reused during backward
    to save memory.
    N)r   r   r   rk  r   r   r   r   r    r  r  c                   @  s.   e Zd ZU dZded< dddZdddZdS )r  Nr  r  r   r  c                   r  )Nr   r  r   rq   c                   s,       }ttj   j|| S r   )	r#  r  rp   r  rr   r  constant_namer  r  r  r  r   r   r    s
   z*ConstantBuffer.make_loader.<locals>.loaderr  r   rc  r   r  r   r    s   zConstantBuffer.make_loaderr  r  r   c                 C  s   t tj|  || jdS Nr   rg  )r  rr   r  r  r  rg  r  r   r   r   rC    s   z!ConstantBuffer.constant_to_devicerY  ri  )r   r   r   r  r   r  rC  r   r   r   r   r    s   
 

r  c                   @  sL   e Zd ZdddZed 	ddd	d
ZddddZdddZdddZdS )NoneAsConstantBufferr   r7  c                 C  r  r   r<   r  r   r   r   r    r  zNoneAsConstantBuffer.get_readsFr  r   r   c                 C  r  r   r<   r=  r   r   r   r>    r  z)NoneAsConstantBuffer.get_free_symbol_usesNr
  r  r   c                 C  s
   t jjjS r   )rr   r  r  none_strr  r   r   r   r    r  z&NoneAsConstantBuffer.codegen_referencer6  c                 C  s
   t d dS Nr  )r  r  r   r   r   r    r  z$NoneAsConstantBuffer.get_output_specc                 C  r  r  r   r  r   r   r   r    r  z&NoneAsConstantBuffer.has_tensor_outputre  r_  rh  r   rV  rQ  rR  )	r   r   r   r  r]   r>  r  r  r  r   r   r   r   r    s    

r  c                   @  sB   e Zd ZU ded< ed 	dddd	ZddddZdddZd
S )r   r   r  Fr  r   r   r   c                 C     t | j|S r   )r&   r  r=  r   r   r   r>       z*ShapeAsConstantBuffer.get_free_symbol_usesNr
  r  r   c                 C  s   t jj| jS r   )rr   r  r  codegen_sizevarr  r  r   r   r   r    r  z'ShapeAsConstantBuffer.codegen_referencec                 C  r  r  r   r  r   r   r   r    r  z'ShapeAsConstantBuffer.has_tensor_outputr_  rh  r   rV  rR  )r   r   r   r   r]   r>  r  r  r   r   r   r   r     s   
 r   c                      sh  e Zd ZU dZded< dZded< dZded	< dZd
ed< dZded< dZ	ded< e
jd[ddZee
jd[ddZd\ddZd]ddZd^ddZd_dd Zd`d"d#Zed 	dadbd'd(Zdc fd*d+Zddd,d-Zded/d0Zdfd2d3Zdgd5d6Zedhd8d9Z		didjd>d?Ze	dkdldIdJZdmdLdMZdmdNdOZ d\dPdQZ!dddRdSZ"dddTdUZ#dndYdZZ$  Z%S )or  zb
    Represents a buffer that is computed during kernel execution rather than being an input.
    r  rf  FzClassVar[bool]_force_realizeNr  r  Callable[..., Any] | Noner  r  r  r  r   Iterator[None]c              
   c  s    | j d usJ | jd usJ | jd usJ | jd usJ t| jts+J t| j | j}| j}z3t|j	|j
| j| j| j|j|j|jd}|| _t|j	|j
| j| _| j|  d V  W || _|| _d S || _|| _w )Nr+  )r  r  r  r  r   rf  r  r   rg  r  r  r  r  r  ri  get_default_sizes_bodyclear_cache)r  old_datarn  new_datar   r   r   with_original_inner_fn  s>   

z%ComputedBuffer.with_original_inner_fnc                  c  s*    t j} zdt _d V  W | t _d S | t _w NT)r  r  )	old_valuer   r   r   force_realize  s   zComputedBuffer.force_realizer3  c                 C  s(   | j dur| j S t| jdr| jj S dS )z
        Returns self.name if it exists, otherwise returns the name of the data node if that exists.
        If neither exist, returns None.
        Nr   )r   r  rf  r  r   r   r   get_computed_buffer_name  s
   
z'ComputedBuffer.get_computed_buffer_namer   c                 C  rd  r   rf  r;  r  r   r   r   r;    r  zComputedBuffer.num_readsr7  c                 C  rd  r   rf  r  r  r   r   r   r    r  zComputedBuffer.get_readsr  c                 C  rd  r   ro  r  r   r   r   r    r  zComputedBuffer.get_read_namesr5  c                 C  s   t | jttttfstjt t t dS t	
tdd, | j r7t|  | j | j W  d    S t|  | j W  d    S 1 sKw   Y  d S )Nr:  writesindex_exprsr  T)r   rf  r  r  r7  r  rF   
ReadWritesr=   r   r   r$  r?  rO   get_store_functionr  r@  r!  r  r   r   r   r6    s&   
$zComputedBuffer.get_read_writesr  r   r   c                 C  s6   | j || j|B }|  r||  |O }|S r   )rg  r>  rf  has_store_functionr6  )r  r  r  r   r   r   r>  +  s   
z#ComputedBuffer.get_free_symbol_usesr  c                   s<   |   s| jtjjvr|  dkr| js| j S t	  S r  )
r?  r   rr   r  mutated_buffersr;  r  rf  r  r  r  r   r   r   r  E  s   

zComputedBuffer.make_loaderc                 C  s   t | jttttfS r   )r   rf  r  r  r7  r  r  r   r   r   r  P  r   z!ComputedBuffer.has_store_functionCallable[..., None]c                 C  s`   |     }t| jtttfrt| jj	| j
|S t| jts'J t| jt| jj| j
|S r   )r#  rL  r  r   rf  r  r  r7  r   r%  r   r  r   r  ra  r   r   r   r  S  s
   z!ComputedBuffer.get_store_functionlist[int] | Nonec                   s   t | jtrYt| j | j \\}}|  j	}t
dd |D s&J fdd|D }|rYt | jttfrA| j| n|  fdd|D }ddlm} |||  S dS )	al  
        If our layout is still flexible, try to determine the stride order based on stride orders of reads.

        TODO(jansel): A better algorithm here would look at downstream consumers of this
                      value and try to do global graph-level layout optimization.
                      This is also something just begging to be autotuned.
        c                 s  s"    | ]}t |tjtjfV  qd S r   )r   rF   StarDep	MemoryDepr?  r   r   r   r   j  
    
z0ComputedBuffer.get_fill_order.<locals>.<genexpr>c                   s.   g | ]}t |tjrt|jd d  D qS )c                 S  s   i | ]}|d kr|t jjqS r  r  r   vr   r   r   r   o      z<ComputedBuffer.get_fill_order.<locals>.<listcomp>.<dictcomp>)r   rF   r  rn   r   r?  )r"  r   r   r   n  s    
z1ComputedBuffer.get_fill_order.<locals>.<listcomp>c                   s   g | ]
}t jj| qS r   rr   r  r  r_  r   r  )rM  r   r   r   y  s    rD   pick_loop_orderN)r   rg  r$  rF   r]  rf  r  r@  r6  r:  r   r  r7  r   	schedulerr  r!  )r  
index_varsr
  r:  stride_lengthsr  r   )rM  r"  r   r   [  s*   


zComputedBuffer.get_fill_orderr   c                 C  s6   t | jtr|  }|r| | d S |   d S d S r   )r   rg  r$  r   r/  r)  r.  r   r   r   rJ    s   zComputedBuffer.decide_layoutMtuple[tuple[list[Expr], list[Expr]], LoopBody, tuple[list[Expr], list[Expr]]]c           
      C  s   t j|  |  dd\}}ttd|   t| 	 | 
 r"|n|d d |g|R  }W d    n1 s8w   Y  g }g }g }g }| D ]+\}}	||d v rb|rWJ || ||	 qI||d v sjJ || ||	 qI||f|||ffS )Nqr   r  rD   r   )rF   r]  r  r@  r   r   r  r'  rR   r  r?  itemsr  )
r  r   
var_rangesr  r  reduce_vars
index_sizereduce_sizer  r   r   r   r   r    s2   



z%ComputedBuffer.get_default_sizes_bodyextra_indexing_constraints'tuple[dict[Any, Any], list[Any]] | Nonerecompute_sizes_body_func5tuple[tuple[list[Expr], list[Expr]], LoopBody | None]c                   s    \\}}}\}}|r|||f|||f\\}}}\}}g |j  |durut|tr4t|dks6J |\}}	t|tsEJ t|t|	tsPJ t|	t	dd |	D s[J |j
}
|
|kshJ |
|f fdd|	D }	 |	7  g | tjtjs|  d fdd}|| }tt ptj }|||||\}}}|||||\}}}tj||dd\\}}}t|||||g|||}||f|fS )an  
        This is a main place where we do loop transformations in a
        backend-agnostic way.

        Here we:
            1) Remove any 1 dimensions
            2) Fuse contiguous dimensions together
            3) Reorder dimensions based on stride orders

        Optional argument extra_indexing_constraints can be used to append additional
        indexing expressions to existing ones derived from buffer's body. This can be useful
        to fuse scheduler nodes with compatible ranges, e.g. (s0*s1*...,) and (s0, s1, s2, ...)
        on CPU by preventing indexing simplifications and obtaining index/reduce ranges for
        the scheduler node compatible with other nodes.
        Optional argument recompute_sizes_body_func can be used to recompute sizes and body
        on the default body. This can be useful to append additional loop transformations.
        Nr   c                 s      | ]}t |tV  qd S r   )r   r   )r   fr   r   r   r     r  z6ComputedBuffer.simplify_and_reorder.<locals>.<genexpr>c                   s   g | ]}| vr|qS r   r   r  )index_formulasr   r   r         z7ComputedBuffer.simplify_and_reorder.<locals>.<listcomp>x_varsSequence[sympy.Symbol]support_varsrV  r   simplify_loopsr   r   dtuple[list[int], Callable[[Sequence[int]], Sequence[int]], Callable[[Sequence[int]], Sequence[int]]]c                   s    | | \}}} dkr8t dkr8ttt }||d dkr8 fdd|D }t|}t|}|| } |rTtjj	
| |t| |\}}}	t||}
n|}
||
|fS )Nr  r   r   c                   r   r   r   r   rV  r   r   r     r  zUComputedBuffer.simplify_and_reorder.<locals>.simplify_and_reorder.<locals>.<listcomp>)_apply_loop_reorderingr?  r   r   r   r   r   rr   r  r  _simplify_loopsrJ   r   )r  r  rV  r  newsizesreindex0r   r   r   _pruner   r  memory_addrsr  r  r   simplify_and_reorder  s&   




zAComputedBuffer.simplify_and_reorder.<locals>.simplify_and_reorderpr   )
r  r  r  r  rV  r   r  r   r   r  )r  indexing_exprsr   r   r   r   r   r   r   r   r  get_write_exprsrr   r  rS  rG   PREFER_STORE_LOOP_ORDERextendget_read_exprsri   r7  rE   loop_ordering_after_fusionrF   index_vars_no_squeezerR   )r  r  r  r  r  r  r  r  extra_indexing_rangesextra_indexing_exprexpected_var_rangesr  r  should_merge_loopsiter_rangesiter_reindexr
  reduce_rangesreduce_reindex	iter_varsr  r   r  r   r    sx   

6

z#ComputedBuffer.simplify_and_reorderr  r  r  rV  r   r  list[sympy.Expr]priority_idxr  c              
     s   ddl m} |du rg }z* fdd|D }t|t|kr)t|d t ks+J tt|||}W n  tyV   tjrLt	dt
t | ttt}Y nw fdd|D t|t|fS )	zU
        Shuffle the order of loops around to hopefully improve performance.
        rD   r  Nc                   s   g | ]}t jj| qS r   r  r  )r  r  r   r   r   W      z9ComputedBuffer._apply_loop_reordering.<locals>.<listcomp>r   z%Did not simplify complex index:
%s
%sc                   r   r   r   r   r  r   r   r   g  r  )r  r  r   r   r  	ExceptionrE   r  r\  warningr   r   r   r   r   )r  r  rV  r  r  r  r^  r   r   )r  rV  r  r   r  B  s,   
z%ComputedBuffer._apply_loop_reorderingr  c                 C  rd  r   )rf  r  r  r   r   r   r  j  r  z!ComputedBuffer.get_pointwise_sizec                 C  rd  r   rf  r@  r  r   r   r   r@  m  r  z!ComputedBuffer.get_reduction_sizec                 C  rd  r   rf  r?  r  r   r   r   r?  p  r  z!ComputedBuffer.get_reduction_typec                 C  rd  r   )rf  r  r  r   r   r   rB  s  r  zComputedBuffer.is_no_opc                 C  r  r  r   r  r   r   r   r  v  r  zComputedBuffer.should_allocater  r  r   c                 C  rY  )r  rf  rC  r  r   r   r   rC  y  r[  z!ComputedBuffer.constant_to_device)r   r  rU  rf  re  rJ  rd  r_  rh  rY  rR  )r   r  )r   r  rI  )r   r  NN)r  r  r  r  r   r  r   )r  r  r  r  rV  r   r  r  r  r  r   r  rS  ri  )&r   r   r   rk  r   r  r  r  r  r  ro  rp  r  rn  r  r  r;  r  r  r6  r]   r>  r  r  r  r   rJ  r\   r  r  r  r  r@  r?  rB  r  rC  r  r   r   r   r   r    sR   
 "








'" 
'



r  c                   @  s2   e Zd ZU dZded< ded< ded< ded< dS )	FinalizeCodegenResultzNStructured result from TemplateBuffer._finalize_codegen for external backends.r   sourcer   importscall_preamble	call_argsN)r   r   r   rk  r   r   r   r   r   r  ~  s   
 r  c                      s   e Zd ZdZ			dKdL fddZedMddZdNddZdOddZdPdQdd Z	dRd"d#Z
dSd%d&ZdTd'd(Z		dUdVd-d.ZdTd/d0ZdWd2d3ZdXd7d8ZedYd<d=Zedddd>dZdIdJZ  ZS )[r  z
    Base class for template operators that support epilogue and prologue fusion.
    Subclasses: TritonTemplateBuffer (built-in Triton templates),
    HelionTemplateBuffer (Helion kernels), etc.
    Nrg  r6  r~  r  make_kernel_renderr  mutated_inputsIterable[IRNode] | Noneallowed_prologue_inpsOrderedSet[str] | Nonenamed_inputsdict[str, IRNode] | Noner   r   c                   s   t  jd |d t|_|_tj_	tj
 i _i _i _|r-t|ni _|_g _|d urYjd }t|tsJJ t||   fdd|D _|p]t _d _d _d S )Nr  r   c                      g | ]}t t d |qS r  MutationOutputr  r   rO  r  r  r   r   r     r  z+TemplateBuffer.__init__.<locals>.<listcomp>)r  r  r  unwrap_storager~  r  rr   r  register_bufferr   register_operationr   epilogue_fusable_outputs_multi_output_childrenr   _named_inputsr  mutation_outputsr   r   r   r'  r=   r  allow_epilogue_fusionallow_prologue_fusion)r  rg  r~  r  r  r  r   first_inputr   r'  r   r    s.   	

zTemplateBuffer.__init__r  c                 C  s   t | jtr
td|  jS )Nz1Multi-output templates do not have a single dtype)r   rg  MultiOutputLayoutr  r#  r  r  r   r   r   r    s
   
zTemplateBuffer.dtyper5  c                 C  s   | j ddS )NT	normalize)rO   r  r   r   r   r6    r   zTemplateBuffer.get_read_writesr4  r   OrderedSet[dependencies.Dep]c                   s   t  }| jD ]9}t|ttfsJ t|| t jts$J t j j d fdd}|t	j
|  d|d	jO }q|S )z(Build read dependencies from all inputs.r   Sequence[Any]r*  r   r   c                   s$   t |dksJ t  | S r  )r   rp   r  r  r  r  inp_indexerr   r   dummy  s   z4TemplateBuffer._read_deps_from_inputs.<locals>.dummyr   r3  Nr   r6  r*  r6  r   r   )r=   r~  r   rl  r  r   rg  r  r  rF   rO   r!  r:  )r  r4  r:  inp_rawr9  r   r7  r   _read_deps_from_inputs  s   

z%TemplateBuffer._read_deps_from_inputsFc                   s   t | jtr%ttj|  tddddg}tj	| 
||t dddS |  |    d fdd}tj||  d|d}| j| 
|O  _|S )a  Extract read/write dependencies for this TemplateBuffer.

        When the layout is MultiOutputLayout (multi-output templates), the
        buffer itself has no data layout, so we cannot build an indexer.
        Instead, synthesize a trivial write dep and derive read deps from
        the named tensor inputs (``_named_inputs``).  For single-output
        templates with a concrete layout, fall through to the standard path.
        r   r   )	var_namesr  N)r:  r  r  rG  r  r   r6  r*  r   r   c                   "   t |dksJ t | dS Nr   faker   rp   r  r  r  r   r   r   r9       z1TemplateBuffer.extract_read_writes.<locals>.dummyr3  r:  )r   rg  r2  r=   rF   r  r  r   r   r  r<  r#  r  rO   r!  r:  )r  r4  r  r9  depsr   rB  r   rO     s,   	z"TemplateBuffer.extract_read_writesr  c                 C  s   t jjS r   )r   r  r  r  r   r   r   r@    r  z!TemplateBuffer.get_reduction_sizer3  c                 C  r  r   r   r  r   r   r   r?    r  z!TemplateBuffer.get_reduction_typec                 C  r  r  r   r  r   r   r   r  
  r  zTemplateBuffer.should_allocater  r  r  9tuple[tuple[Sequence[Expr], list[Expr]], LoopBody | None]c                 C  s   |   g fd fS r   r  )r  r  r  r   r   r   r    s
   z#TemplateBuffer.simplify_and_reorderc                 C  s   t | jtS )zFWhether this template produces multiple outputs via MultiOutputLayout.)r   rg  r2  r  r   r   r   is_multi_outputs_template  r[  z(TemplateBuffer.is_multi_outputs_templater  c                 C  r  r   )r  r  r   r   r   get_allowed_prologue_inps  r  z(TemplateBuffer.get_allowed_prologue_inpshook_outputsdict[str, str]FinalizeCodegenResult | Nonec                 C  r  )a  Called after epilogue/prologue subgraph codegen with rendered hook outputs.

        ``hook_outputs`` maps placeholder keys (e.g. ``<STORE_OUTPUT_0>``,
        ``<LOAD_INPUT_x>``) to the Triton code generated by Inductor for
        each fused subgraph.

        Return a ``FinalizeCodegenResult`` to provide custom source code and
        call metadata, or ``None`` to use the default codegen path.
        Nr   )r  rH  r   r   r   _finalize_codegen!  s   z TemplateBuffer._finalize_codegenr  r   r   c                 C  sN   t |trt |jtr|jS t|}t |tr|j}t |jtr%|	  |S )zWRealize a TensorBox, preserving MultiOutput layout (unlike ExternKernel.realize_input).)
r   r   rf  MultiOutputr  realize_inputr  rg  r$  r)  )r  r  r  r   r   r   realize_template_input/  s   

z%TemplateBuffer.realize_template_input)direct_alias_at_leafon_tensor_leafon_non_tensor_leaftemplate_buf
structuredr   rO  dict[int, IRNode] | NonerP  FCallable[[str, MultiOutput, list[tuple[type, int]], int], None] | NonerQ  Callable[[int], None] | Nonetuple[TensorBox, ...]c                  s4   i t  d
 fddt|g S )zLWalk a structured output tree, creating MultiOutput nodes for tensor leaves.rt  r   rM  list[tuple[type, int]]r   list[TensorBox]c           	        s   t | ttfr$g }t| D ]\}}||g |t| |f q|S t}t | tjrq r<| v r<t	
 | gS t| }|v rI| gS tt| |}|j| < d urf| ||| t	|}||< |gS d ury| g S r   )r   r   r   r  r   r   r  r)  Tensorr   r  idrL  FallbackKerneltensor_to_layoutr,  r  )	rt  rM  r  r   itemleaf_idxtidmor  rO  leaf_counterrQ  rP  seen_outputsrR  walkr   r   re  J  s0   "
z0TemplateBuffer.build_multi_outputs.<locals>.walkN)rt  r   rM  rX  r   rY  )rv  countr   )r  rR  rS  rO  rP  rQ  r   rb  r   build_multi_outputs;  s   z"TemplateBuffer.build_multi_outputs)NNN)rg  r6  r~  r  r  r  r  r  r  r  r   r!  r   r   rO  rd  )r4  r   r   r5  r_  )r4  r   r   r5  rS  rU  rR  r  )r  r  r  r  r   rE  rJ  )rH  rI  r   rJ  )r  r   r   r   )rR  r  rS  r   rO  rT  rP  rU  rQ  rV  r   rW  )r   r   r   rk  r  rq  r  r6  r<  rO   r@  r?  r  r  rF  rG  rK  r  rN  rg  r  r   r   r   r   r    s6    2


&




r  c                      sR   e Zd Z		dd fddZed 	dd fddZd ddZd!ddZ  ZS )"TritonTemplateBufferNrg  r  r~  r  r  Callable[_P, _T] | Noner  r  r  r  r   r   c                   sB   t  j|||||d | jdusJ | j| ji| _d| _d| _dS )a  
        NOTE:[TritonTemplates with multiple outputs]
        We want the ability for TritonTemplates to output multiple tensors. Triton
        kernels have no notion of outputs and this is done by creating tensors that
        are then mutated by the kernel. Currently our STORE_OUTPUT codegen doesn't
        support creating multinode outputs for triton templates.
        We work around this by creating an extra input buffer during the lowering
        and we mark them as mutated inputs.
        )r  r  N)r  r  r   r+  subgraph_inpssubgraph_outs)r  rg  r~  r  r  r  r   r   r   r  i  s   
zTritonTemplateBuffer.__init__Fr  r   r   c                   s   t  |}| jr| jng }| jr| jng }|D ]%}t|tjr)|t|| qt|t	r7||| q|d u s=J q|D ]}t|t	rP||| q@|d u sVJ q@|S r   )
r  r>  rk  rj  r   r   r   updater&   r   )r  r  resrk  rj  r  r  r   r   r   r>    s   

z)TritonTemplateBuffer.get_free_symbol_usesr  c                 C     | g| j S r   r.  r  r   r   r   r    r   z TritonTemplateBuffer.get_outputsr   c                 C  s   d| j  d}|S )NzTritonTemplateBuffer(layout=r5  r  )r  r  r   r   r   r    s   zTritonTemplateBuffer.__str__r  )rg  r  r~  r  r  ri  r  r  r  r  r   r   r_  rh  r  r\  )	r   r   r   r  r]   r>  r  r  r  r   r   r   r   rh  h  s    
rh  c                      s|   e Zd ZdZd' fddZd(ddZd)ddZd*ddZd)ddZd)ddZ	d+ddZ
d,d!d"Zd)d#d$Zd-d%d&Z  ZS ).ChoiceCallera1  
    Represents a possible choice used in autotune_process.py.
    During autotuning, self.benchmark() is first called to get benchmark result,
    and if this choice is selected, self.output_node() is called to get the output_node.

    Children classes: TritonTemplateCaller, CUTLASSTemplateCaller.
    r   r   r   r  rg  r  descriptionr   r   c                   sP   t    || _|| _|| _|| _d| _d| _i | _d | _	d | _
i | _i | _d S r  )r  r  r   rg  r   rq  failed_benchmark_with_cudagraphsr   rr  decompositiondecomposition_kwargsconfig_patches)r  r   r   rg  rq  r   r   r   r    s   

zChoiceCaller.__init__r   r   r  r  r  c                  sP   |    | jrt fddS tjrt fddS tj d|id dS )Nc                          S r   r   r   algor   r   r   r@        z(ChoiceCaller.benchmark.<locals>.<lambda>c                     rw  r   r   r   rx  r   r   r@    rz  r  r  )to_callablers  rW   benchmark_gpu_with_cuda_graphrE   /profile_bandwidth_with_do_bench_using_profilingrb   	benchmark)r  r  r   r   rx  r   r~    s   zChoiceCaller.benchmarkc                 C  ru  r   rv  r  r   r   r   	call_name  r  zChoiceCaller.call_namer  c                 C  ru  r   rv  r  r   r   r   r{    r  zChoiceCaller.to_callablec                 C  r  )z
        Hash key for the underlying kernel. By default, we assume there are no
        runtime params, so kernel hash key defaults to choice caller's hash key.
        )hash_keyr  r   r   r   kernel_hash_key  s   zChoiceCaller.kernel_hash_keyc                 C  ru  r   rv  r  r   r   r   r    r  zChoiceCaller.hash_keyr   c                 C  ru  r   rv  r  r   r   r   r|    r  zChoiceCaller.output_node6dict[str, PrimitiveInfoType | list[PrimitiveInfoType]]c                 C  s   i S )zRInformation returned here is logged to the autotune log file when that is enabled.r   r  r   r   r   	info_dict  r  zChoiceCaller.info_dictc                 C  r  )Nunsupported_choicer   r  r   r   r   autoheuristic_id  r  zChoiceCaller.autoheuristic_idc                 C  s
   d| _ dS )z
        Mark the choice as failed so that it can be
        removed later. Useful for when we decouple
        compilation and tuning.
        TN)rr  r  r   r   r   mark_failed  s   
zChoiceCaller.mark_failed)
r   r   r   r  rg  r  rq  r   r   r   )r   r   r  r  r   r  r\  )r   r  )r   r   )r   r  rI  )r   r   r   rk  r  r~  r  r{  r  r  r|  r  r  r  r  r   r   r   r   rp    s    







rp  c                   @  r  )TritonTemplateCallerBaser   r   c                 C  ru  r   rv  r  r   r   r   get_make_kernel_render  r  z/TritonTemplateCallerBase.get_make_kernel_renderN)r   r   )r   r   r   r  r   r   r   r   r    r  r  c                      s~   e Zd ZdZd) fddZed*ddZed+ddZ	d,d-ddZe	j
d.ddZd/d d!Z	d,d0d#d$Zd1d'd(Z  ZS )2MultiTemplateBufferaG  
    Represents a Buffer with multiple backing implementation choices.

    Choices can be TritonTemplates or ExternKernels. During scheduling if there is a potential
    epilogue we will benchmark each of the choices with the epilogue to determine an implementation.
    Otherwise, the fastest base choice will be chosen.
    rg  r  r~  r  choice_timings_fn1Callable[[int | None], dict[ChoiceCaller, float]]unfiltered_choiceslist[ChoiceCaller]r  r  r   r   c                   sJ   t  j||d |d || _i | _|| _|| _tdd |D | _i | _d S )N)rg  r~  r  r  c                 s  s0    | ]}t |tpt |tjjjo|jV  qd S r   )r   r  r)  r  select_algorithmExternKernelCallerhas_out_variant)r   choicer   r   r   r     s    

z/MultiTemplateBuffer.__init__.<locals>.<genexpr>)	r  r  _choice_timings_fn_choice_timings_choicesoriginal_inputsr   _output_plannable_make_kernel_renders)r  rg  r~  r  r  r  r   r   r   r    s   

zMultiTemplateBuffer.__init__r   c                 C  r  )z^
        Are all possible choices TritonTemplates or Extern Kernels with out variants
        )r  r  r   r   r   output_plannable#  s   z$MultiTemplateBuffer.output_plannablec                 C  r  r   )r  r  r   r   r   rX  *  r~  zMultiTemplateBuffer.choicesNhint_overrider  dict[ChoiceCaller, float]c                 C  s$   || j vr| || j |< | j | S r   )r  r  )r  r  r   r   r   choice_timings.  s   

z"MultiTemplateBuffer.choice_timingscallerr  r  c                 c  sZ    t |tjjjsJ t|| j|jksJ | j}| | _z	d V  W || _d S || _w r   )	r   r)  r  r  TritonTemplateCallerr   rg  r  r  )r  r  renderr   r   r   swap_as_triton_caller5  s   

z)MultiTemplateBuffer.swap_as_triton_callerc                 C  sR   t |tjjjsJ t||  |jjksJ | 	 |jj
ks"J | | _d S r   )r   r)  r  r  r  r   r!  rg  r  rK  r  r  r  )r  r  r   r   r   finalize_as_triton_callerC  s   
z-MultiTemplateBuffer.finalize_as_triton_callertuple[ChoiceCaller, float]c                 C  s&   | j |d}t||jd}||| fS )N)r  rB  )r  r  r  )r  r  timings
min_choicer   r   r   get_min_choiceK  s   z"MultiTemplateBuffer.get_min_choicecallers*dict[int | None, TritonTemplateCallerBase]c                 C  s0   |  D ]\}}| | j|< q| jd | _dS )z;Finalize with multiple callers for different hint overridesN)r  r  r  r  )r  r  r  r  r   r   r   finalize_as_triton_callersR  s   z.MultiTemplateBuffer.finalize_as_triton_callers)rg  r  r~  r  r  r  r  r  r  r  r   r   rR  )r   r  r   )r  r  r   r  )r  r  r   r  )r  r  r   r   )r  r  r   r  )r  r  r   r   )r   r   r   rk  r  rq  r  rX  r  ro  rp  r  r  r  r  r  r   r   r   r   r    s    
	r  c                      s2   e Zd Zd fddZdddZdddZ  ZS )CUTLASSTemplateBufferrg  r  r~  r  r  Callable[_P, _T]workspace_sizer   templaterv   supports_epilogue_fusionr   r   r   c                   s&   t  ||| || _|| _|| _d S r   )r  r  r  r  r  )r  rg  r~  r  r  r  r  r   r   r   r  ^  s   	
zCUTLASSTemplateBuffer.__init__c                 C  s   | j d ur| j S dS r  r  r  r   r   r   r  m  r   z(CUTLASSTemplateBuffer.get_workspace_sizec                 C  s$   |   D ]}t| d d  qd S r   )r  rp   r  r  )r  rt  r   r   r   emulate_store_fnp  s   z&CUTLASSTemplateBuffer.emulate_store_fn)rg  r  r~  r  r  r  r  r   r  rv   r  r   r   r   rf  rI  )r   r   r   r  r  r  r  r   r   r   r   r  ]  s    
r  c                      s,   e Zd Zd fddZd fddZ  ZS )CppTemplateBufferrg  r  r~  r  r  r  r  rv   r  r   r   r   c                   s&   t  ||| || _|| _d | _d S r   )r  r  r  r  outputs)r  rg  r~  r  r  r  r   r   r   r  v  s   
zCppTemplateBuffer.__init__c                   sp   t | jtr3t | jtsJ t| j| jd }t |ts#J t||j}t |ts1J t||S t 	 S r  )
r   rg  r2  r  r   r   r  r  r  r#  )r  first_outputrg  r   r   r   r#    s   

zCppTemplateBuffer.get_layout)rg  r  r~  r  r  r  r  rv   r  r   r   r   rP  )r   r   r   r  r#  r  r   r   r   r   r  u  s    r  c                      s0   e Zd ZdZ	dd fddZdddZ  ZS )CuteDSLTemplateBufferz
    Buffer for CuteDSL (CUTLASS Python DSL) template kernels.
    Similar to other template buffers but specialized for CuteDSL operations.
    Nrg  r  r~  r  r  r  r  r   r  r  r   r   c                   s   t  ||| |_|_g_|d ur?tjd ts'J tjd jd 	   j fdd|D 7  _d S d S )Nr   c                   r"  r#  r$  r&  r'  r   r   r     r  z2CuteDSLTemplateBuffer.__init__.<locals>.<listcomp>)
r  r  r  r  r  r   r~  r   r   r'  )r  rg  r~  r  r  r  r   r'  r   r    s   "zCuteDSLTemplateBuffer.__init__r  c                 C  r  r   r  r  r   r   r   r    r  z!CuteDSLTemplateBuffer.get_outputsr   )rg  r  r~  r  r  r  r  r   r  r  r   r   r  )r   r   r   rk  r  r  r  r   r   r   r   r    s
    r  c                      sP   e Zd ZdZ					d"d# fddZd$ddZd%ddZ	d&d'd d!Z  ZS )(NVUniversalGemmBufferz
    Buffer for NVIDIA Universal GEMM kernels.

    Unlike CuteDSL templates which use Jinja templates, this generates
    simpler Python code that directly calls the cutlass_api library.
    r   Nrg  r  r~  r  kernelr   accumulator_typevariantr  r   scale_type_a
Any | Nonescale_type_bswizzle_type_aswizzle_type_br   r   c                   sj   t  j||d d || _|| _| g| _|| _|| _|| _|| _|	| _	|
| _
|jj|jjd| _| j| _d S )N)r  )kernel_namemin_cc)r  r  r  r  r  r  r  r  r  r  r  metadatar  r  kernel_metadata_make_kernel_renderr  )r  rg  r~  r  r  r  r  r  r  r  r  r   r   r   r    s   zNVUniversalGemmBuffer.__init__c                 C  r  )z#Return the workspace size in bytes.r  r  r   r   r   r    r~  z(NVUniversalGemmBuffer.get_workspace_sizer  c                 C  r  r   r  r  r   r   r   r    r  z!NVUniversalGemmBuffer.get_outputsout_noder  r  tuple[Any, Any]c           	        s   ddl m} ddlm} g }| jD ]}t|tr|j}t|tr#|j}|	| qt
|j}||||| j| j| j| j| j| j| j| jd  fdd} |fS )z
        Create a kernel renderer for code generation.

        Returns (kernel, render) tuple where:
        - kernel: NVUniversalGemmKernel object with call_kernel() method
        - render: function that returns source code string
        r   )NVUniversalGemmKernel)Placeholder)r  r   r|  r  r  r  r  r  r  r  r  c                     s      S r   )r  r   render_kernelr   r   r    r  z9NVUniversalGemmBuffer._make_kernel_render.<locals>.render)Btorch._inductor.codegen.nv_universal_gemm.nv_universal_gemm_kernelr  torch._inductor.utilsr  r~  r   r   rf  r  r  r   KERNEL_NAMEr  r  r  r  r  r  r  r  )	r  r  r  r  r  r   r  r  r  r   r  r   r    s2   




z)NVUniversalGemmBuffer._make_kernel_render)r   NNNN)rg  r  r~  r  r  r   r  r   r  r   r  r   r  r  r  r  r  r  r  r  r   r   rf  r  r   )r  r   r  r  r   r  )	r   r   r   rk  r  r  r  r  r  r   r   r   r   r    s    
!
r  r   #Sequence[IRNode | Sequence[IRNode]]TypeIs[Sequence[IRNode]]c                 C  s   t dd | D S )Nc                 s  r  r   r   r   r  r   r   r   r     r  z#is_node_sequence.<locals>.<genexpr>)r   )r   r   r   r   is_node_sequence  r  r  c                   @  sz   e Zd ZU ded< d!ddZd"d
dZd#ddZed$ddZe	d%ddZ
d&ddZd'ddZed 	d(d)ddZd S )*r  r  r~  r   r   r   r   c                 C  s    | j | }t|tsJ | S r   r~  r   r   r  )r  r   inputr   r   r   
input_name  s   
zInputsKernel.input_namer5  c                   s   t tj  }tj | jD ]#}t|tr | fdd|D  qt|tr&q|	 |
  qt tj  fdd|  D }tj||t  dS )Nc                 3      | ]	} |  V  qd S r   r  rt  r  r   r   r      rF  z/InputsKernel.get_read_writes.<locals>.<genexpr>c                 3  r  r   r  r&  r  r   r   r   '      
r  )r=   rF   rL   r  r~  r   r   rl  r   r  r  r  r  )r  r:  r  r  r   r  r   r6    s    


zInputsKernel.get_read_writesr7  c                 C  r8  r   r9  r  r   r   r   r  1  r  zInputsKernel.get_readsr   r   c                 C  s~   t |tr|j}t |tr|j}t |trt |tst|}t |tr)| |S t |t	r0|S t |t
tfs=J t||S r   )r   r   rf  r  r  rl  r  rM  unwrap_storage_for_inputTorchBindObjectr  r   r  r   r   r   r   r  4  s   





z%InputsKernel.unwrap_storage_for_inputlist[IRNode | Sequence[IRNode]]c                 C  s@   g }| D ]}t |trdd |D }nt|}|| q|S )Nc                 S  rz  r   )r  r  r   r   r   r   r   N  r   z/InputsKernel.unwrap_storage.<locals>.<listcomp>)r   r   r  r  r  )r~  
inputs_newr   r   r   r   r(  G  s   

zInputsKernel.unwrap_storager   c                 C  r  r  r   r  r   r   r   rA  T  r  zInputsKernel.is_externc                 C  r  r8  r   r  r   r   r   r;  W  r  zInputsKernel.num_readsFr  r   c                 C  sN   t tj  }| jD ]}t|tr|||O }q	|D ]	}|||O }qq	|S r   )r=   r   r    r~  r   r   r>  )r  r  r  r  	inner_inpr   r   r   r>  Z  s   

z!InputsKernel.get_free_symbol_usesN)r   r   r   r   rd  re  r  )r~  r  r   r  rR  rf  r_  rh  )r   r   r   r   r  r6  r  r  r  rn  r(  rA  r;  r]   r>  r   r   r   r   r    s   
 




r  c                   @  s    e Zd Zd	ddZd
ddZdS )	NopKernelr   r   c                 C  r  r  r   r  r   r   r   rB  i  r  zNopKernel.is_no_opr7  c                 C  r  r   r<   r  r   r   r   r  l  r  zNopKernel.get_readsNrR  re  )r   r   r   rB  r  r   r   r   r   r  h  s    
r  c                   @  s^   e Zd ZdZeddd	Ze	
ddddZed 	ddddZed ddZ	d!ddZ
d
S )"ConcatKernelzn
    There isn't actually a real kernel for concat, we just change the
    storage for the upstream data.
    r~  r  rY  r   r   r  c                 C  sz  |d   }|d  }t|d  }dg}|| g}d|  kr)t|k s,J  J tdt|D ]Z}||  }	|||  t|	t|ksLJ ||  |ksVJ ||   |ks`J tt|D ]}
|
|krw||
 |	|
  ||
< qftjj	
||
 |	|
 ||
< qf|||  q3t|}tjrt|||d j}tt|D ]!}|| }t|r| }t|trt|j|jrt|} nqtdd |D }tjjjd }|du rt|trtdd |D rt|}tdd |D }|dusJ tdt|||||d	g d
}t|}g }t |D ]{\}}t|t!t"fs'J t#|| $|t%j&|||| || dd}t|t'sEJ t#|t|j(tsSJ t#|j(|j(| t|j)t!rf|j)* }n|j)}t|tr|+ r|   }durt,|j#rt-|s||.  qt|dkrtj/|t0j1rtj2| tj3||_4| 5|j(|_(tj6| |S )z6
        Create the concat kernel from inputs
        r   rD   c                 s  r  r   )r"  rt  r   r   r   r     r  z&ConcatKernel.create.<locals>.<genexpr>Fc                 s  sB    | ]}d |j v o|j d  jtjdp|j d  jtjdV  qdS )r  rZ  N)rz  rI  r)  r\  r^  r   argr   r   r   r     s    


c                 s  s"    | ]}t |o| jV  qd S r   )r"  r#  rk  rt  r   r   r   r     s    
N)r  r  r  r  rk  r   rg  r~  r  )7r'  r&  r   r!  r   r   r  rr   r  r  r  r$  r%  rE   rK  r  rJ  r  r"  r#  r   ri  r)  r  r  r.   r  current_noder   r   r  r  r  r  r  r   r  r  r  r  r~  rf  r  r!  ri   rh   rE  rS  rG   FOREACHregister_operation_listr)  r   r(  r*  )r  r~  rY  r  r  rq  offsets_startoffsets_endr   
input_sizern  output_strider   rg  any_input_is_storage_and_layoutfx_node_argsrk  concat_kernelr  op_namesr  r}  input_unwrappeddevr   r   r   r  v  s   
 

 zConcatKernel.createNr  r   r  r  r   c                 C  s   t |tr| |j|S t |ttfsJ t|t |jtrPt |jjt	r*|jj
s,dS |d u r2dS t| t| kr@dS tdd t| | D S t|jdoct |jjtoct |jt S )NFTc                 s  r`  r   ra  rc  r   r   r   r     rd  z=ConcatKernel.can_realize_into_without_copy.<locals>.<genexpr>rg  )r   r   can_realize_into_without_copyrf  r  r  r   r  rg  ri  r  r   rK  r   r   r  r$  ExternKernelAlloc)r  r  r  r   r   r   r    s*   
z*ConcatKernel.can_realize_into_without_copyFr  r   c                 C  s   t | |S r   )r  r>  r=  r   r   r   r>    r  z!ConcatKernel.get_free_symbol_usesc              	   C  s   t |tst|rt|\}}t||d}t |ts J t|t |tr,| |j|S t |trL|	  t
|jds=J | ||rLt||j_|jS tj| | | dd t| | D d}| ||S )Nre  rg  c                 S  r  r   r  r  r   r   r   r   3  r  z-ConcatKernel.realize_into.<locals>.<listcomp>r  )r   rl  r"  rh  r   r   r  rf  r  r	  r  r  ru  rg  r  r  r'  r&  r  r   r!  )r  r  r  rm  rg  pwr   r   r   r    s,   


	zConcatKernel.realize_intoc                 C  r  r  r   r  r   r   r   r  :  r  zConcatKernel.should_allocate)r~  r  rY  r   r   r  r   )r  r   r  r  r   r   r_  rh  )r  r   r  r   r   r   rR  )r   r   r   rk  r  r  r  r]   r>  r  r  r   r   r   r   r  p  s    w#!r  c                      s  e Zd ZU dZdZded< ejedZ	ded< dZ
d	ed
< dZded< dZded< ejedZded< dZded< dZded< ejedZded< dZded< ejedZded< ejedZded< 							dd fd$d%Zdd'd(Zdd*d+Zdd,d-Zdd.d/Z	ddd3d4Zdd5d6Zddd7d8Zdd9d:Zdd;d<Zdd>d?Ze ddCdDZ!e"ddJdKZ#e"ddMdNZ$e"ddOdPZ%e"ddQdRZ&e"			SdddZd[Z'e"	Sddd]d^Z(e"	Sddd`daZ)e"ddbdcZ*e"ddddeZ+e"ddfdgZ,e"ddhdiZ-ddjdkZ.ddldmZ/dddqdrZ0ddsdtZ1ddvdwZ2dddydzZ3dd{d|Z4dd}d~Z5dddZ6dddZ7dddZ8dddZ9e:d 	SddddZ;dddZ<e<Z=  Z>S )r  z
    A class that represents Kernels which are not directly lowered to Inductor
    Loop Level IR, such as custom operators, or aten operators which we fallback to.
    r   r6  constant_argsr  r  r   NReinterpretView | Noneoutput_viewr3  python_kernel_namecpp_kernel_nameIterable[str]ordered_kwargs_for_cpp_kernel_OpOverloads | Noneop_overloadzlist[dict[str, Any]] | Nonearg_propertieszdict[str, dict[str, Any]]allarg_propertiesz dict[str, dict[str, Any]] | Nonekwarg_propertiesz"dict[sympy.Symbol, pytree.KeyPath]unbacked_bindingszlist[MutationOutput]r.  r   rg  r6  r~  r  dict[str, Any] | Noner   r   c                   st   t  j|||d || _|r|ni | _|| _|
| _| | | | |	| _| 	  i | _
g | _tjj| _i | _d S Nr  )r  r  r  r   r  r  set_cpp_kernel_nameset_python_kernel_namer  collect_arg_kwarg_propertiesr  r.  rr   r  r  fx_noder   )r  r   rg  r~  r  r   r  r  r  r  r  r   r   r   r  Z  s"   



zExternKernel.__init__r  c                 C  rn  r   ro  r  r   r   r   r  z  r   zExternKernel.get_outputsr   c                 C  r  r   r<   r  r   r   r   r  }  r  z%ExternKernel.get_unbacked_symbol_defsc                 C  s   t | jtjjrdd | jjjD ndd tt| j	D | _
t | jtjjr1dd | jjjD ni | _t | jtjjrW| jsJdd | jjjD | _dd | jjjD | _d S g | _d S )Nc                 S  s$   g | ]}|j s|j|j|jd qS ))r   r   r  )
kwarg_onlyr   	real_typer  rt  r   r   r   r     s    z=ExternKernel.collect_arg_kwarg_properties.<locals>.<listcomp>c                 S  s   g | ]}i qS r   r   r   r   r   r   r     r  c                 S  s   i | ]}|j |j|jd qS ))r   r  )r   r  r  rt  r   r   r   r     r  z=ExternKernel.collect_arg_kwarg_properties.<locals>.<dictcomp>c                 S     g | ]}|j r|jqS r   r  r   rt  r   r   r   r     
    c                 S  s   g | ]}|j r|qS r   )r  rt  r   r   r   r     s
    )r   r  r)  _ops
OpOverload_schema	argumentsr   r   r~  r  r  r  schema_kwargsr  r   r   r   r    s*   


z)ExternKernel.collect_arg_kwarg_propertiesc                 C  s$   t | jtr|   |   d S d S r   )r   rg  r$  apply_constraintr)  r  r   r   r   rJ    s   zExternKernel.decide_layoutwrapperrw   r  c                 C  sZ   t | |\}}|r|| |s|  }|r+ddlm} || |dd}||| d S d S )NrD   )'set_kernel_post_grad_provenance_tracingT)rA  )re   make_commenttry_get_kernel_namer  r  write_provenance_debug_handle)r  r  r  
origin_str_detailed_origin_strr  debug_handler   r   r   codegen_comment  s   
zExternKernel.codegen_commentc                 C  ru  r   rv  r  r  r   r   r   codegen  r  zExternKernel.codegenc                 C  s   || _ tjjrt| jtjjsd S | j}| j d u rh|j	dkra|j
dkr+|jdd n|jdd}ddlm} |d|j i }tdd	 |D d
d}|d
krX| d| }d| d| _ d S |jj| _ d S d S )Natenr  .r   r
  inductor_fallback_opszaten.c                 s  s*    | ]}| d rt|dd V  qdS )r  rD   N)
startswithr   r  r   r   r   r     s   ( z3ExternKernel.set_cpp_kernel_name.<locals>.<genexpr>rD   r  _vz
at::_ops::z::call)r  rr   r  cpp_wrapperr   r  r)  r
  r  	namespace_overloadnamer   r  replacetorchgen.aoti.fallback_opsr  r  r  r  r   )r  r  r  opnamer  version_infolatest_versionr   r   r   r    s.   




	z ExternKernel.set_cpp_kernel_namec                 C  sd   || _ |d ur	d S | j}|d u rd S t|tjjr"d|j | _ d S |jdd d|j | _ d S )Nztorch.ops.higher_order.._ops..ops.r  )	r  r  r   r)  r
  HigherOrderOperatorr   r   r$  )r  r  r  r   r   r   r    s   z#ExternKernel.set_python_kernel_namec                 C  s   ddl m} |   }r|jntjj}tjjr| jS tjj	r=t
tjj|s-J ttjj| jd u r4d S tjj| j|S | jS )NrD   )CppWrapperCpu)codegen.cpp_wrapper_cpur,  r'  r   rr   r  device_type
fx_wrapperr  r!  r   r  r  get_c_shim_func_name)r  r,  dr  r   r   r   r    s   
z ExternKernel.try_get_kernel_namer   c                 C  r  r   )r  r  r   r   r   get_kernel_name  r  zExternKernel.get_kernel_namer   r   r   c                 C  s:   t j|  |  |  |  |  |  d}|  |S )N)r  r  r  r  r  r  )	r  r  r'  r&  r  r!  r  r  r	  )r   r  r   r   r   
copy_input	  s   zExternKernel.copy_inputr  r   r   r   ftuple[Any, list[Any], list[Any], Callable[[Any, Any], Any], dict[sympy.Symbol, pytree.KeyPath] | None]c                   s  ||d}t |\}g  g }g }g }|D ]}	|	 td r:  tjjjj|	dd}
 d ||
 ||
 q  t	d rh   d ||	 |	j
j}|	j
jdkrZ|dus\J |tjj|   q  td r   d ||	 ||	j q td r  d ||	 q 	  d ||	 ||	 qd fdd}fdd|D }|D ]}t|rt|dd qg }|D ]}}t|ts| tjjv r|tjj|   qt|ts| tjjv r|tjj|   qt|tr
||  qt|tr||j qt|tjj j	r>|j
j}|j
jdkr0|dus2J |tjj|   q|t!| q|||\}}||i |}d}tj"j }rtj#j$%d}t& }tj#j'tj(j)j*u rz|d }t+tj#}| t,|tj#| W d   n	1 sw   Y  t-|||}t|t.t/fs|gn|}|D ]+}t|tj0r|j1rt2j3sd}tjj#j$%dd }r| d| }|tj_4q|||||fS )a  Partition kernel args into tensor and non-tensor, realize tensor inputs,
        re-run fake tensor propagation with the realized strides, and return
        (example_output, tensor_args, non_tensor_args, unflatten_args, unbacked_bindings).

        unflatten_args(new_tensor_args, new_non_tensor_args) reconstructs the
        original (args, kwargs) tree from replacement lists.
        r   r   N)r  Fr:  Tnew_tensor_argsr   new_non_tensor_argsr   tuple[list[_T], dict[str, _T]]c                   sd   g }t | }t |} D ]}|r|t| q|t| qt|}|dg |di fS )Nr   r   )r  r  r  pytreetree_unflattenr  )r5  r6  r  
it_tensorsit_non_tensors	is_tensorr  )args_flat_is_tensor	args_specr   r   unflatten_argsU  s   z3ExternKernel.process_kernel.<locals>.unflatten_argsc                   r,  r   rM  rt  r  r   r   r   c  r   z/ExternKernel.process_kernel.<locals>.<listcomp>rD  r  rD   zEsparsity not handled. Please file issue for sparse inference weights.r  z Found from : 
 )r5  r   r6  r   r   r7  )5r8  tree_flattenr   rr   r  r  r   create_symintnoder  GeneratorStater  r   r   r)  r:  default_generatorsclone_stateOpaqueObjectStater-  r   r"  rh  r   r  r  	constantstorchbind_constantsr  	get_valuer   opaque_example_valuer  irr  r#  r  rz  r  r	   r  _higher_order_opseffectswith_effectsr0   r7   r1   r   r   rZ  	is_sparserE   graph_partitiondisable_cudagraphs_reason)r  r  r   r   binded_args	args_flattensor_argsnon_tensor_argsreal_non_tensor_argsr  r   device_indexr?  r   example_argsnew_args
new_kwargsexample_outputr  r   node_meta_valctxexample_out_lir,  msgr  r   )r=  r>  r  r   process_kernel  s   




	








zExternKernel.process_kernelrl  c              
   C  sp  t |tsJ t|t |tr|S | }tj| }|dus$J |	 }|dur]d|j
v r]t |tttfr]t |jtr]t|j
d tjdsSt|j
d tjdr]|t|  n|  tj| dd\}}|d }| |}tjj||}tjj||}	tjj||}
t||	|
 }||krt d|	|
| t!t|j"t#|$ |% | |	|
dd	d
S )z
        In order to pass this to an extern kernel we need a
        ReinterpretView not a View.  This allows us to avoid some
        unneeded copies.
        Nr  r  r  r   r   z@convert_to_reinterpret_view failed: stride=%s offset=%s index=%sFr  re  )&r   r  r   rl  r  rr   r  r  r  r  rz  r  r  rg  r$  r,   r)  r\  r^  r1  r.   r!  r)  rF   r]  r  r  r^  stride_vars
offset_varrj   r\  r  r  rf  ri  r  r&  )r  r   x_unwrap_viewrO  x_unwrap_view_fx_node
index_argsr  rG  r   r^  rj  expectedr   r   r   convert_to_reinterpret_view  sn   




z(ExternKernel.convert_to_reinterpret_viewc                 C  s8  |d u rt  S t|ttjjjtfrt|dS t|t	r@t
  tjtj|j| | dW  d    S 1 s;w   Y  t|trG|S t|trR| |jS t|trct| |j| dS t|tr|  t| rz| |W S  ty   Y nw t|tr|  |S t|t tt!fr|S | "|S )Nr;  )r  r  re  )#r  r   r   r   r   r   r   r   r   r  r>   rr   r  add_tensor_constantr)  r]  r-  r&  r'  r  r   rM  rf  rl  r#  r  r	  r"  r  rh  r  r  NonTensorObjr   r3  r  r   r   r   rM    s@   

 





zExternKernel.realize_inputc                 C  sD   t |rt| dkr|S | D ]
}|dkr|  S q| |S r4  )r"  r   rK  r3  )r  r   r  r   r   r   require_stride1#  s   
zExternKernel.require_stride1Fr   r  r2  r  r*  r   c              	     s  |d us
 d us
J   dv r sS trt trS|rGt|o-t j }tdd|r@t	t
jj jn||d S tddd | d S t ttfr}|re |sr r}t  j r} d ur{t S S t  }trt|  }trtdt|tr|r||s rt |j rS ttr|r |sǈ rt  j rɈS ttrtjtrtjtst  }rt|drt|jtsz!| j_|r| j ||dW S  r| j! |dW S W n
 t"y   Y nw d }	 }
 d urOt
jj fd	d
t#t$ D }	|	D ]}t%j&j'(|ddqA| )tdd|| d |rkt|siJ S |	r|
d urx d uszJ t%j&j'*|
t S S )N)r   rD   TF)rE  rL  rM  r*  rP  zHthe MutationLayoutSHOULDREMOVE's real layout shouldn't be FlexibleLayoutrf  rQ  c                   s4   g | ]}  | d r | dr|qS )r   r   )rb  r  r!  r   r2  r  r   r   r   r     s    z0ExternKernel.require_strides.<locals>.<listcomp>r   rD   )+r  r"  r   r#  r$  rT  r3   r  rh  r  rr   r  r  rb  ri  ru  rS  r\  r!  rq  r  r  r  r  r   rf  r  rl  r  r  r  rh  require_stride_orderrequire_exact_stridesr  r   r   r)  r  loweringslice_r3  r|  )r  r   r   r2  r*  use_current_stride_ordermutation_layoutr  r  expanded_dims	orig_sizerY  r   rl  r   require_strides-  s  	
	
	


	


zExternKernel.require_stridesrU  c                 C  s   | j |dd |D |dS )Nc                 S  s$   g | ]}t |tjr|jjn|qS r   )r   r)  SymIntr   r  r   r   r   r   r     s    z6ExternKernel.require_exact_strides.<locals>.<listcomp>)r2  r*  ru  )r  r   r2  r*  r   r   r   rn    s   z"ExternKernel.require_exact_stridesr   c                 C  s   | j |||dS )N)r   r*  rw  )r  r   r   r*  r   r   r   rm    s   z!ExternKernel.require_stride_orderc                 C     |  |tS r   )rm  r]  r  r   r   r   require_channels_last  r[  z"ExternKernel.require_channels_lastc                 C  rx  r   )rm  r_  r  r   r   r   require_channels_last_3d  r[  z%ExternKernel.require_channels_last_3dc                 C  s,   ddd}||r|S |  |t| S )Nr   r   r   r   c              	   S  s@   z|   }W n ttfy   Y dS w |tjjv otjj| jS r  )r  AttributeErrorr  rr   r  rH  	is_mkldnn)r   r   r   r   r   is_mkldnn_tensor  s   z9ExternKernel.require_contiguous.<locals>.is_mkldnn_tensorr   r   r   r   rn  r$  r%  r!  )r  r   r}  r   r   r   r    s   
	zExternKernel.require_contiguousc                 C  s   |  |t| S r   r  r  r   r   r   require_contiguous_strides  s   z'ExternKernel.require_contiguous_stridesc                 C  r  r   r   r  r   r   r   r    r  zExternKernel.apply_constraintc                 C  s   t |tsJ t|t |tst|}| jsJ dt|}t| j}||k rStd| j||  t	||D ]}| j| d }|
||v rJ|| n| j| d  q7|S )Nz/ExternKernel.arg_properties should not be emptyzv%s has %d unprovided positional arguments. Will check if they are in the keyword arguments or will use default values.r   r  )r   r   r   r   r  r   r\  r  r  r   r  )r  r   r   n_args
n_pos_argsr   arg_namer   r   r   fill_non_provided_args  s(   

z#ExternKernel.fill_non_provided_argsr  r  r   c           	      C  s   t jjrog }d }|r"| jr"t| jt|ksJ ddd | jD }t| jD ]E\}}|d urF|d us5J ||| }|rC|dnd }nt| j| }| jr_|t| jk r_| j| dnd }|	t jj
|| q'|S dd | jD S )NzDnames passed to codegen_const_args does not match self.constant_argsc                 S  s   i | ]}| d |qS r   )r  r  r   r   r   r   8  r  z3ExternKernel.codegen_const_args.<locals>.<dictcomp>r   c                 S  r  r   rr   r  r  val_to_arg_str)r   r  r   r   r   r   K  r  z3ExternKernel.codegen_const_args.<locals>.<listcomp>)rr   r  r!  r  r   r  r  r  r~  r  r  r  )	r  r  r  name_to_arg_propertiesr   r   proptype_r   r   r   r   codegen_const_args,  s2   
zExternKernel.codegen_const_argsc                 C  s   t jjr| jd ur| g | j| j| j}d}n| j}d}g }t|D ]4\}}t jjrN| j	r6|t
| j	k s:J d| j	| d}|t jj|| q$|t jj| q$|rb||   |S )NFTz-Invalid access to ExternKernel.arg_propertiesr   )rr   r  r!  r  r  r~  r  r   r  r  r   r  r  r  r  r   r  )r  r~  need_codegen_constant_argsr   r   r   r  r   r   r   codegen_argsM  s&   zExternKernel.codegen_argsr  c                 K  sT   ||v r	| |S || jv r| j |S | j | }dur#| dS t| d)zGiven an argument name, queries for values in (in order):
        1. any provided kwargs for this function.
        2. the class self.kwargs member.
        3. any available default arguments in self.allarg_properties.Nr  z not in self.allarg_properties)r  r   r  r  )r  r  r   r  r   r   r   get_kwargs_valueg  s   


zExternKernel.get_kwargs_valueskip_outc                 C  s   t jjrO| jd urt| jdkrg S g }| jD ]5}|r |dkr q| |}t|t	r0|
| q| jd us7J | j|i d}|
t jj|| q|S dd | j D }|S )Nr   r  r   c                 S  s(   g | ]\}}| d t jj| qS r  r  )r   kr  r   r   r   r     s    z/ExternKernel.codegen_kwargs.<locals>.<listcomp>)rr   r  r!  r  r   r  r  r  r   r   r  r  r  r  r  r   r  )r  r  r   r  r  r  r   r   r   codegen_kwargst  s$   


zExternKernel.codegen_kwargsc                 C  sT   | j d ur&| j j}t|dd}|dd}|ddd }| d| }|S d}|S )	Nr   unknown_namespacer)  r*  r  rD   r   
unknown_op)r  r  r   r$  rsplit)r  r  op_namespaceop_namer   r   r   get_op_name  s   
zExternKernel.get_op_namec                 C  s   t jr=tjjs?t|  dkrd S tjj|  }tjj| 	 }| 
 }|d|   d| d| d|d	 d S d S d S )Nr   zassert_size_stride(rt  r5  )rE   size_assertsrr   r  r!  rm   r!  r  codegen_shape_tuplerK  r  r  r  )r  r  r  r  r  r   r   r   codegen_size_asserts  s    z!ExternKernel.codegen_size_assertsc              	   C  st   t jr6tjjs8|  }|tjjv}|  }|r(|d| dt	 d|d d S |d| d| d d S d S d S )Nzassert_alignment(rt  r5  z	# buffer z (op: z) is assumed to be not aligned)
rE   alignment_assertsrr   r  r!  r  rW  r  r  rf   )r  r  r   alignedr  r   r   r   codegen_alignment_asserts  s   z&ExternKernel.codegen_alignment_assertsc                 C  s@   t jjrtjjr
dS |  |  }|d| d| d dS )zc
        Track outputs of fallback operators if config.test_configs.track_memory_lifecycle
        Nztrack_tensor(z, 'z'))	rE   test_configstrack_memory_lifecyclerr   r  r!  "write_memory_track_allocation_oncer  r  )r  r  r   r   r   r   codegen_memory_tracking  s
   z$ExternKernel.codegen_memory_tracking'tuple[list[Sequence[Expr]], list[Expr]]c                 C  s   |   }|  }|g g|fS )zD
        get output sizes and strides, for template_codegen
        )r!  rK  )r  r  r  r   r   r   get_group_stride  s   zExternKernel.get_group_stridetuple[Expr, Sequence[Expr]]c                   s  t jj|  }|  }fdd|D }dd tt|D ttt||jdd}dd t	|D fddttD }fd	d|D | 
 }|}t jj||g\}}}	td
\}
 tt| fdd|D }tt||}|t|fS )zC
        Manually get canonicalization of the output index
        c                   r,  r   )r   rt  )r  r   r   r     r   z-ExternKernel.canonicalize.<locals>.<listcomp>c                 S  s   g | ]	}t d | qS )r1  )rk   r   r   r   r   r     r  T)rC  r  c                 S  r   r   r   r   r   r   r   r     r   z-ExternKernel.canonicalize.<locals>.<dictcomp>c                   r   r   r   r   r  r   r   r     r  c                   r   r   r   r   )r  r   r   r     r  cc                   r  r   r   rt  )add_varr   r   r     r  )rr   r  r  r!  rK  r   r   rH  rc  r  r  r  rQ   r   r   rn   r   r|  r   )r  rV  r^  index_orderr   r  r   	new_sizesr   r  r
  replacementr   )r  r  r  r  r   canonicalize  s$   
 zExternKernel.canonicalizer  c                 C  sP   |rt nt}t| |}| jD ]}|||O }q| j D ]}|||O }q|S r   )maybe_free_unbacked_symbolsmaybe_free_symbolsr  r>  r  r   r   )r  r  maybe_get_symbolsr  r  r   r   r   r>    s   

z!ExternKernel.get_free_symbol_usesc                   sP   t  dd }d|g}| fddt D 7 }|d j  |S )Nr  zpython_kernel_name=c                   s$   g | ]}|j  d t |j  qS r  )r   r   )r   rm  r  r   r   r     s    z(ExternKernel.__str__.<locals>.<listcomp>r  )r   rl  fieldsr  r  r  )r  r  r  r   r  r   r    s   
zExternKernel.__str__r   NNNNr   N)r   r3  rg  r6  r~  r  r  r6  r   r  r  r  r  r3  r  r3  r  r  r  r  r   r   r  r  rI  r   )r  rw   r  r3  r   r   r  rw   r   r   r  r3  r   r   )r  r3  r   r   rU  r\  )r   r   r   r   )r  r   r   r   r   r   r   r4  )r   r   r   rl  r  )NNF)
r   r   r   r  r2  r  r*  r   r   r   r_  )r   r   r2  rU  r*  r   r   r   )r   r   r   r   r*  r   r   r   )r   r6  r   r  r   r6  )r  r  r   r   r   r   )r  r   r   r   r   r   )r  r   r   r   )r   r  )r   r  rh  )?r   r   r   rk  r  r   rl  rm  r   r   r  r  r  r   r  r  r  r  r  r  r.  r  r  r  r  rJ  r  r  r  r  r  r2  rn  r3  r  ra  rh  rM  rk  ru  rn  rm  ry  rz  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r]   r>  r  r  r  r   r   r   r   r  >  s   
 
 


%

%

 !G#	 $

$
!






	!
r  c                      sB   e Zd ZdddZ							dd  fddZd!ddZ  ZS )"ExternKernelOutr  rw   r   r   c                 C     | |  d S r   )generate_extern_kernel_outr  r   r   r   r  	  r   zExternKernelOut.codegenr   Nrg  r  r~  r  r  r6  r   r  r  r  r  r3  r  r  r  r  c
                   s`   |  |}
t|
tsJ t|
t d ||
||pi d ||||	
 tj| | _	tj
|  d S r   )r(  r   r   r   r  r  rr   r  r)  r   r*  )r  rg  r~  r  r   r  r  r  r  r  unwrapped_inputsr   r   r   r    s    
zExternKernelOut.__init__r   c                 C  r  r  r   r  r   r   r   r  )  r  zExternKernelOut.should_allocater  r  )rg  r  r~  r  r  r6  r   r  r  r  r  r3  r  r3  r  r6  r  r  r   r   rR  )r   r   r   r  r  r  r  r   r   r   r   r    s    
r  c                         e Zd Zd	 fddZ  ZS )
RandomSeedsrf  r   r  r  r   r   c                   sF   t t j}t jt|t j|gdg |j|j|ggddtj	j
d d S )Nr<  zaten.randint.low_outzat::_ops::randint_low_out::call)rg  r~  r  r  r  r  )r)  r  r~  r  r  ri  r  r  r  randintlow_out)r  rf  r  limitsr   r   r   r  .  s   
zRandomSeeds.__init__)rf  r   r  r  r   r   r   r   r   r  r  r   r   r   r   r  -      r  c                      sJ   e Zd ZdddZ						dd  fddZd!ddZd"ddZ  ZS )#r  r  rw   r   r   c                 C  r  r   )generate_extern_kernel_allocr  r   r   r   r  B  r   zExternKernelAlloc.codegenr   Nrg  r6  r~  r  r  r6  r   r  r  r3  r  r  r  r  c	           
        sp   |  |}	tdd |	D sJ t d |ttt |	||pi d ||||
 g | _tj	
| | _tj	|  d S )Nc                 s  r  r   r  r   r   r   r   r   Q  r  z-ExternKernelAlloc.__init__.<locals>.<genexpr>)r(  r   r  r  r   r   r   r  rr   r  r)  r   r*  )
r  rg  r~  r  r   r  r  r  r  r  r   r   r   r  E  s"   
zExternKernelAlloc.__init__r   c                 C  r  r  r   r  r   r   r   r  e  r  z!ExternKernelAlloc.should_allocatec                 C  ru  r   rv  r  r   r   r   r  h  r  z"ExternKernelAlloc.apply_constraintr  )r   NNNr   N)rg  r6  r~  r  r  r6  r   r  r  r3  r  r3  r  r6  r  r  r   r   rR  rI  )r   r   r   r  r  r  r  r  r   r   r   r   r  A  s    

 r  c                      sJ   e Zd ZdZd fd
dZdddZdddZdddZdddZ  Z	S )r%  zP
    An output buffer that represents the mutation of a pre-existing buffer
    rg  r6  mutated_noder   mutating_noderr  r   r   c                   sD   t  jd |d | }tj| |g| _|| _tj| | _	d S r  )
r  r  r  rr   r  r  mutation_namesr  r)  r   )r  rg  r  r  mutated_node_namer   r   r   r  q  s   zMutationOutput.__init__c                 C  r  r   )r  r  r   r   r   r  {  r  zMutationOutput.get_defining_opr  c                 C  r  r   )r  r  r   r   r   rD  ~  r  z!MutationOutput.get_mutation_namesr   c                 C  r  r  r   r  r   r   r   r    r  zMutationOutput.should_allocater  c                 C  s    |   }dd dd |D D S )Nc                 S     g | ]}|d ur|qS r   r   r&  r   r   r   r     s
    z7MutationOutput.get_mutation_buffers.<locals>.<listcomp>c                 s  s    | ]	}t j|V  qd S r   )rr   r  try_get_bufferr  r   r   r   r     rF  z6MutationOutput.get_mutation_buffers.<locals>.<genexpr>)rD  )r  r  r   r   r   get_mutation_buffers  s   z#MutationOutput.get_mutation_buffers)rg  r6  r  r   r  rr  r   r   r  rj  rR  r   r  )
r   r   r   rk  r  r  rD  r  r  r  r   r   r   r   r%  l  s    



r%  c                      s`   e Zd ZU dZi Zded< edd	d
ZedddZd fddZ	dddZ
dddZ  ZS )TMADescriptorad  
    An IR node representing a generic host-side TMA descriptor in the Triton API
    Mostly useful for user-defined Triton kernels relying on host-side TMA;
    but can, in principle, be used for Inductor's Triton templates, too.

    See TMADescriptorExperimental and TMADescriptorStable for the two implementations
    (the old API and the new API)
    zdict[Any, TMADescriptor]_CACHEr]  r   tma_metatuple[str, tuple[Any, ...]]r   c                 C  sT   t |dksJ |d dkrt|g|d R  S |d dks J t|g|d R  S )Nr   r   experimentalrD   r9  )r   TMADescriptorExperimentalTMADescriptorStable)r  r]  r  r   r   r   _create_impl  s
   zTMADescriptor._create_implc                 C  s2   t ||f}|| jvr| ||| j|< | j| S r   )r[  r  r  )r  r]  r  rC  r   r   r   r    s   

zTMADescriptor.creater~  r6  r  r   c                   sV   t  d tt|| dttt |t|d  || _	t
j| | _t
j|  d S )Nre  )r  r  ru  rl  r#  r   r   r  r   r]  rr   r  r)  r   r*  )r  r]  r~  r  r   r   r   r    s   zTMADescriptor.__init__r  rw   c                 C  r  r   )generate_tma_descriptorr  r   r   r   r    r   zTMADescriptor.codegenc                 C  r  r   )r]  r  r   r   r   
get_tensor  r  zTMADescriptor.get_tensor)r]  r   r  r  r   r  )r]  r   r~  r6  r  r6  r   r   r  rH  )r   r   r   rk  r  r   r  r  r  r  r  r  r  r   r   r   r   r    s   
 

r  c                      s&   e Zd ZdZ	dd fddZ  ZS )r  z
    the new host-side TMA Descriptor API:
    (the ones obtained via create_{1d,2d}_tma_descriptor calls).

    See also TMADescriptorStable for the new API.
    Nr]  r   r  list[int | torch.SymInt]
block_dimselement_sizer  r   r   c                   s   t |dv sJ t |t |ksJ |d u r| j}|| _|| _|| _t | j| _|g}g | j| j| j}t j|||d d S )N)rD   r   r]  r~  r  )	r   r&  r  r  r  r  rG  r  r  )r  r]  r  r  r  r~  r  r   r   r   r    s*   

z"TMADescriptorExperimental.__init__r   )
r]  r   r  r  r  r  r  r  r   r   r   r   r   rk  r  r  r   r   r   r   r    s    r  c                      s"   e Zd ZdZd fddZ  ZS )	r  z
    the new host-side TMA descriptor API
    (the ones obtained via TensorDescriptor.from_tensor).

    See also TMADescriptorExperimental for the old API.
    r]  r   block_shaper  c                   s   || _ t j||g|d d S )Nr  )r  r  r  )r  r]  r  r   r   r   r    s   
zTMADescriptorStable.__init__)r]  r   r  r  r  r   r   r   r   r    s    r  c                      s,   e Zd Z	dd fddZdddZ  ZS )SubgraphBufferNrg  r  r   r  rr  rs  example_inputs	list[Any]subgraph_namer   rv  r  c              	     sb  t  d || || _|| _tj| | _tj|  tj	| j||| _
t| js,J t| j}|D ]}|| j
j|j< | j
j|j q3dd |D | _dd lm  m}	 t| j
K dddd}
i |
|pii }|	| | j
j| j  W d    n1 sw   Y  |r| j
jD ]}||  qW d    d S W d    d S 1 sw   Y  d S )Nc                 S     g | ]}|j qS r   r   )r   sym_varr   r   r   r          z+SubgraphBuffer.__init__.<locals>.<listcomp>r   FATEN)max_autotunemax_autotune_gemmmax_autotune_gemm_backends)r  r  rr  r  rr   r  r)  r   r*  make_subgraphsubgraphr  r~  r  r   graph_input_namesr  
sym_inputstorch._inductor.configr  rE   set_graph_handlerr   run
operationsr  r  )r  rg  r   rr  r  r  rv  r  sym_inpinductor_configbase_patchesmerged_patchesrv  r   r   r   r    s:   	
"zSubgraphBuffer.__init__r  rw   r   r   c                 C  sR   G dd d}t | jsJ dd | jD }||| jg | j|| jg d S )Nc                   @  r  )z,SubgraphBuffer.codegen.<locals>.CodegenGraphr  rx   c                 S  s   || _ |j| _d S r   )r  r   )r  r  r   r   r   r  8  s   z5SubgraphBuffer.codegen.<locals>.CodegenGraph.__init__N)r  rx   )r   r   r   r  r   r   r   r   CodegenGraph7  r  r  c                 S     g | ]}|  qS r   r  r   r,  r   r   r   r   =  r  z*SubgraphBuffer.codegen.<locals>.<listcomp>)r  r~  'codegen_subgraph_with_flattened_outputsr  r  r   )r  r  r  outer_inputsr   r   r   r  6  s   zSubgraphBuffer.codegenr   )rg  r  r   r  rr  rs  r  r  r  r   rv  r  r  r   r   r   r  r  r  r   r   r   r   r    s    .r  c                      s   e Zd ZdZd.ddZd/ddZed0ddZd1ddZd2ddZ	e
d 	d3d4 fddZd5ddZd6 fd#d$Zed7 fd&d'Zd8d)d*Zd9d,d-Z  ZS ):UserDefinedTritonKernelz>
    A user-defined triton kernel (e.g. via @triton.jit).
    r   (tuple[Kernel, Any, list[str], list[str]]c                   s   ddl m} ddlm} || j g }g }g }t |ret dr0| fdd j	D  nt ds7J | j
 t drR jD ]}| jj|  qEnt d	sYJ | j  j} j  |||fS )
Nr   )	Autotuner)kernel_side_tablerestore_idxc                 3  s    | ]	} j j| V  qd S r   )r   	arg_namesr   r  r   r   r   X  r  zBUserDefinedTritonKernel.get_kernel_and_metadata.<locals>.<genexpr>restore_value	reset_idxreset_to_zero)triton.runtime.autotunerr  *torch._higher_order_ops.triton_kernel_wrapr  
get_kernel
kernel_idxr   r  r   r  r  r  r  r   r  r  configs)r  r  r  r  restore_value_argsreset_to_zero_argsr   r   r  r   get_kernel_and_metadataK  s,   




z/UserDefinedTritonKernel.get_kernel_and_metadatar   c                 C  s   t jsdS | jjsdS t| jjdkrdS t| jdksJ t| jd t	s(dS t| jd j
ts3dS t| jd j
j
ts?dS t| jd j
j
j
tsLdS tdd | jd j
j
j
jD s^dS dS )aE  
        For kernels like

        @triton.jit
        def add_kernel(in_ptr0, in_ptr1, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
            pid = tl.program_id(0)
            offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
            mask = offs < n_elements
            x = tl.load(in_ptr0 + offs, mask=mask)
            y = tl.load(in_ptr1 + offs, mask=mask)
            tl.store(out_ptr + offs, x + y, mask=mask)

        @torch.compile
        def fn(a, b):
            out = torch.empty_like(a)
            grid = (triton.cdiv(a.numel(), 1024),)
            add_kernel[grid](a, b, out, a.numel(), BLOCK_SIZE=1024)
            return out.relu()

        We can potentially fuse the relu epilogue into the add_kernel.
        We do this by pruning the `out` tensor allocation and directly writing the relu-output.
        FrD   r   c                 s  r  rF  r   r?  r   r   r   r     r  z<UserDefinedTritonKernel.can_fuse_epilogue.<locals>.<genexpr>T)rE   *epilogue_fusion_user_defined_triton_kernelarg_accessescan_fuse_epiloguer   kernel_storesstoresmutable_argsr   r   rf  r  r  r  r   r  r  r   r   r   r  k  s$    z)UserDefinedTritonKernel.can_fuse_epiloguer  rw   r   c                 C  s   | j |d dS )N)epilogue_fusion_codegenr  r   r   r   r    r  zUserDefinedTritonKernel.codegenr  tuple[ComputedBuffer, str]c                 C  s   |  ||S )zQ
        epilogue_fusion: (fused epilogue node, modified kerel src code)
        r  )r  r  r  r   r   r   codegen_with_epilogue_fusion  s   z4UserDefinedTritonKernel.codegen_with_epilogue_fusion!tuple[ComputedBuffer, str] | Nonec                   s$  ddl m}  \}}}}|||j||j|\}}	}
}fddjD }|rNtjj	j
dks6J ttjj	j
j}||v sFJ |\}}|||< dd |jD  dd |jD }t fd	d
|D }g }g }g }g }t| ttd|D ]w\}}||v r| rq|| || t|tr||  ||  qt|ttttjfr|| |t| q||v r|d |t q|du r	 | r|d |t q|   |   qt!dt| d| "|| |j#||||||	|
d$ j%jd
 dS )YOverrides the parent member.
        See https://github.com/pytorch/pytorch/issues/151692r   )triton_version_uses_attrs_dictc                   s   i | ]}|  |qS r   r  r   r  r  r   r   r     r  z4UserDefinedTritonKernel._codegen.<locals>.<dictcomp>rD   c                 S  r  r   r   r   r  r   r   r   r     r  z4UserDefinedTritonKernel._codegen.<locals>.<listcomp>c                 S  r  r   )is_constexprnumr  r   r   r   r     r   c                 3  s    | ]} | V  qd S r   r   r   )r  r   r   r     r  z3UserDefinedTritonKernel._codegen.<locals>.<genexpr>r  rI  NzUnsupported arg type: r4  T)	arg_typesraw_argsraw_keystriton_metainductor_metar=  r  original_fxnode_name)&r  r  r  !define_user_defined_triton_kernelr   gridr  r   r  rL  r  r  r  r   paramsr=   rv  r?  r  r   repeatr  r   r   r  r&  r   r  r   r   r   r   r  r  r  generate_kernel_callr'  r  )r  r  r  r  r  r  r  r  new_namer  r  extra_launch_args
named_argsmutable_arg_nameepilogue_computed_bufferr
  
constexprsconstexpr_namesr   r  raw_keys_filteredraw_args_filteredr   r  r   )r  r  r   r    s   	






	


z UserDefinedTritonKernel._codegenFr  r   c                   s   t  |t| j|B S r   )r  r>  r&   r   r=  r   r   r   r>    s   z,UserDefinedTritonKernel.get_free_symbol_usesc                 C  r  r   r<   r  r   r   r   r     r  z0UserDefinedTritonKernel.get_unbacked_symbol_defsr  r   r   r   tma_descriptor_metadatar  kernel_argsc                  s  g }i }g }   D ]0\}}	t|	tr1t|	}
||v r't|
|| }
||
 |
||< q
||	 |	||< q
t	|dksCJ |d 
 _t|tsUJ t|t d tjd|t|| |_|_ \}}}}t|dszJ  fdd|jD _ddlm}m} t	|dkr|d jni }dd l}|j_|j_ |j_! _"||i  ||_# fddj#j$j%D _&fddj&D _'t(j)* d S )	Nr   r  r  c                   s   g | ]}| v r|qS r   r   r  r.  r   r   r   L  r  z4UserDefinedTritonKernel.__init__.<locals>.<listcomp>)identify_accessed_tensorsidentify_triton_storesc                   s(   g | ]}t  |jtr |j qS r   )r   r  r   r   r   rC  r/  r   r   r   g  s    c                   s    g | ]}t t jd | qS r#  )r%  r  r  r&  r  r   r   r   m      )+r  r   r   r  r  rM  r  r  r  r   r'  r  r   r   r  r  r  r   r  r   r  r  r  r  r  r0  r1  r   astr  
kernel_srcparse
kernel_astr	  r.  r  rL  r  r  r.  rr   r  r*  )r  r  r   r-  r.  r~  r   r  r  r  r,  r  r  r
  r0  r1  autotuned_kwargsr4  r   )r.  r  r   r  #  s`   









z UserDefinedTritonKernel.__init__r5  c                   s   t jst  S fddjjjD  tjjj}dd t	|j
D tjt fddjjjD tfddjjjD t d}|S )Nc                   s    i | ]}|j  j|j   qS r   )r   r.  r  )r   formal_arg_depr  r   r   r   {  r3  z;UserDefinedTritonKernel.get_read_writes.<locals>.<dictcomp>c                 S  s   i | ]
\}}|j | qS r   )r   r  )r   r9  
mut_outputr   r   r   r     s    
c                      g | ]}|  qS r   renamer  )read_renamesr   r   r         z;UserDefinedTritonKernel.get_read_writes.<locals>.<listcomp>c                   r;  r   r<  r  )write_renamesr   r   r     r?  r  )rE   r  r  r6  r  rL  r:  r   r  r   r.  rF   r  r=   )r  formal_arg_writesrL  r   )r>  r  r@  r   r6  s  s2   



z'UserDefinedTritonKernel.get_read_writesr  c                 C  
   t | jS r   )r   r.  r  r   r   r   r    r  z#UserDefinedTritonKernel.get_outputsr  c                 C  r  r   r  r  r   r   r   r'    r  z"UserDefinedTritonKernel.get_device)r   r  rR  r  )r  rw   r  r  r   r   )r  rw   r  r  r   r   r_  rh  r  )
r  r   r   r   r-  r  r.  r  r   r   rd  r  rW  )r   r   r   rk  r  r  r   r  r  r  r]   r>  r  r  r6  r  r'  r  r   r   r   r   r  F  s"    

 8

g
	P
%r  c                      sJ   e Zd ZdZdddZdd	d
ZdddZdddZd fddZ  Z	S )InplaceBernoulliFallbackE
    This needs to be a custom class to handle mutation properly
    r  rw   r   r   c                 C  s   t dd | jD sJ dd | jD \}tjjr5||   d| ddtt	| j
 d|j  d S ||   d| ddtt	| j
 d|j  d S )Nc                 s  r  r   r  r  r   r   r   r     r  z3InplaceBernoulliFallback.codegen.<locals>.<genexpr>c                 s  s    | ]
}t t| V  qd S r   )r   r   r  r  r   r   r   r     r  r  rt  z, NULL)r5  )r   r~  rr   r  r!  r  r2  r  r  reprr  ending)r  r  r   r   r   r   r    s   ,,z InplaceBernoulliFallback.codegenr   c                 C  r  r  r   r  r   r   r   r    r  z(InplaceBernoulliFallback.should_allocater  c                 C     |  dgS r  r  r  r   r   r   rD    r   z+InplaceBernoulliFallback.get_mutation_namesr   c                 C  r  r   r<   r  r   r   r   r    r  z1InplaceBernoulliFallback.get_unbacked_symbol_defsr  r   r   r   r  r   c                   sV   t  jd t| d| |g||d tj|  tj	| | _
tj|  d S )Nr  r  )r  r  r  r'  r(  rr   r  r  r  r)  r   r*  )r  r  r   r  r   r   r   r    s   
z!InplaceBernoulliFallback.__init__r  rR  rj  r  )r  r   r   r   r  r   r   r   
r   r   r   rk  r  r  rD  r  r  r  r   r   r   r   rC    s    



rC  c                      s\   e Zd ZdZd ddZd!d	d
Zd"ddZd#ddZd$ fddZe		d%d&ddZ
  ZS )'InplaceCopyFallbackrD  r  rw   r   r   c                 C  s    |   \}}}|||| d S r   )r  codegen_device_copy)r  r  r  r  non_blockingr   r   r   r    s   zInplaceCopyFallback.codegenr   c                 C  r  r  r   r  r   r   r   r    r  z#InplaceCopyFallback.should_allocater  c                 C  rG  r  rH  r  r   r   r   rD    r   z&InplaceCopyFallback.get_mutation_namesr   c                 C  r  r   r<   r  r   r   r   r    r  z,InplaceCopyFallback.get_unbacked_symbol_defsrg  r6  r~  r  r  r6  c                   sJ   t  jd |||ddd tj|d   tj| | _tj|  d S )Nz
aten.copy_aoti_torch_copy_)r  r  r   )	r  r  rr   r  r  r  r)  r   r*  )r  rg  r~  r  r   r   r   r    s   zInplaceCopyFallback.__init__Fr  r   r  rM  c                   s6    fdd||fD }|f}t t| d||}|S )Nc                   r,  r   r@  r  rA  r   r   r     r   z.InplaceCopyFallback.create.<locals>.<listcomp>r  )rK  r  r'  )r  r  r  rM  r~  r  r  r   rA  r   r    s   zInplaceCopyFallback.creater  rR  rj  r  )rg  r6  r~  r  r  r6  r   r   r_  )r  r   r  r   rM  r   r   rK  )r   r   r   rk  r  r  rD  r  r  r  r  r  r   r   r   r   rK    s    



rK  c                   @  sB   e Zd ZdZdddZdd	d
ZdddZdddZdddZdS )MutatingFirstArgExternKernelrD  r  rw   r   r   c                 C  sX   t | jsJ g dd | jD tt| j}||   dd| d|j  d S )Nc                 s  s    | ]}|  V  qd S r   r  r  r   r   r   r     r  z7MutatingFirstArgExternKernel.codegen.<locals>.<genexpr>r  rt  r5  )	r  r~  r  rE  r  r  r2  r  rF  )r  r  argrefsr   r   r   r    s   
z$MutatingFirstArgExternKernel.codegenr   c                 C  r  r  r   r  r   r   r   r    r  z,MutatingFirstArgExternKernel.should_allocater  c                 C  rG  r  rH  r  r   r   r   rD    r   z/MutatingFirstArgExternKernel.get_mutation_namesr   c                 C  r  r   r<   r  r   r   r   r    r  z5MutatingFirstArgExternKernel.get_unbacked_symbol_defsc                 C  r  r  r   r  r   r   r   has_side_effects  r  z-MutatingFirstArgExternKernel.has_side_effectsNr  rR  rj  r  )	r   r   r   rk  r  r  rD  r  rQ  r   r   r   r   rO    s    




rO  c                      r  )
ResizeStorageBytesvariabler   rq  r   r   r   c                   s   t |ts	J dt jd t| d| |g|fd tj	|
  tj| | _tj|  d| _d| _t |tttfsFJ t|tjj|j
  d S )NzTODO: dynamic shapesr  )r  z"inductor_ops.resize_storage_bytes_z&torch::inductor::resize_storage_bytes_)r   r   r  r  r  r'  r(  rr   r  r  r  r)  r   r*  r  r  r  r  r   r   never_reuse_buffersr  rf  )r  rS  rq  r   r   r   r    s   
zResizeStorageBytes.__init__)rS  r   rq  r   r   r   r  r   r   r   r   rR    r  rR  c                      s(   e Zd Zd fddZdd	d
Z  ZS )SetSourceTensorKernelself_tensorr   storage_tensorr   r   c                   s   |   t j| ||gdtjjjjd t	|t
ttfs$J t|tjj|j  tjj|  tjj|   | }tt|d|| tt|d|| g| _d S )Nz!torch.ops.aten.set_.source_Tensor)r  r  r  )r)  r  r  r#  r)  rp   r  set_source_Tensorr   r  r  r   r   rr   r  rT  r  rf  r  r'  r%  r  r.  )r  rV  rW  r  r   r   r   r  .  s"   

zSetSourceTensorKernel.__init__r  c                 C  s   |  d|  dgS r4  rH  r  r   r   r   rF  B  r   z2SetSourceTensorKernel.get_inputs_that_alias_output)rV  r   rW  r   r   r   rj  )r   r   r   r  rF  r  r   r   r   r   rU  -  s    rU  c                      sR   e Zd ZdZd!ddZd"d	d
Zd#ddZd$ddZdddd% fdd Z  Z	S )&ScatterFallbackz
    This needs to be a custom class to handle mutation properly.
    This class handles both aten.scatter_ and aten.scatter_reduce_.
    It also handle the case `src` being a scalar properly.
    r  rw   r   r   c                 C  r  r   )generate_scatter_fallbackr  r   r   r   r  M  r   zScatterFallback.codegenr   c                 C  r  r  r   r  r   r   r   r  P  r  zScatterFallback.should_allocater   c                 C  s"   | j d }t|tsJ | gS r  r  )r  r  r   r   r   rD  S  s   

z"ScatterFallback.get_mutation_namesr   c                 C  r  r   r<   r  r   r   r   r  X  r  z(ScatterFallback.get_unbacked_symbol_defsNTru  include_selfr  r   r   r   rY  r   r   r  ru  r3  r]  c          
   
     s   t |t _ jr fdd|||fD }|f}	n fdd||fD }||f}	t jd t| d ||	||dt|ddg|d t	j
|  t	j
  _t	j
  d S )	Nc                   r,  r   r@  r  r  r   r   r   j  r   z,ScatterFallback.__init__.<locals>.<listcomp>c                   r,  r   r@  r  r  r   r   r   m  r   r  r\  ru  r]  )r  r  r  )r   r   src_is_tensorr  r  r  r'  r(  r   rr   r  r  r  r)  r   r*  )
r  r  r   rY  r   r  ru  r]  tensorsr  r   r  r   r  [  s&   
zScatterFallback.__init__r  rR  r  r  )r  r   r   r   rY  r   r   r   r  r   ru  r3  r]  r   r   r   rJ  r   r   r   r   rZ  F  s    



rZ  c                      sJ   e Zd ZdZdddZdd	d
ZdddZd ddZd! fddZ  Z	S )"IndexPutFallbackzQ
    This needs to be a custom class to handle mutation and indices properly
    r  rw   r   r   c                 C  r  r   )generate_index_put_fallbackr  r   r   r   r    r   zIndexPutFallback.codegenr   c                 C  r  r  r   r  r   r   r   r    r  z IndexPutFallback.should_allocater  c                 C  rG  r  rH  r  r   r   r   rD    r   z#IndexPutFallback.get_mutation_namesr   c                 C  r  r   r<   r  r   r   r   r    r  z)IndexPutFallback.get_unbacked_symbol_defsr  torch._ops.OpOverloadr   r   rM  r  r   r6  
accumulater   c           	   	     s   | _ dd |D } fdd||g|D }d}t jd t| d ||fd||d tj 	d tj
  _tj  d S )	Nc                 S  r  r   r   r   r   r   r   r     r   z-IndexPutFallback.__init__.<locals>.<listcomp>c                   r,  r   r@  rt  r  r   r   r     r   aoti_torch_index_put_outr  zaten.index_put_)r  r  r  r   )rM  r  r  r  r'  r(  rr   r  r  r  r)  r   r*  )	r  r  r   rM  r   rc  valid_indicesr_  r  r   r  r   r    s    	zIndexPutFallback.__init__r  rR  rj  r  )r  rb  r   r   rM  r  r   r6  rc  r   r   r   rJ  r   r   r   r   r`    s    



r`  c                   @  s$   e Zd Zeddd	ZdddZdS )
DeviceCopyr   r   r  r  rM  r   r   c           	      C  s(  |  }|d us
J | s:t|tjjvr:tdd | D r:tj	j
s:tjjr5tj| tj| ||S tj| tj| td |f}t|}d }| r\| }t|jog|jdkog|}|jdkost|jos|}|rt|rd| _tt|| | ||d| |g|S )Nc                 s  s    | ]	}|t jjv V  qd S r   )rr   r  rH  r?  r   r   r   r     rF  z$DeviceCopy.create.<locals>.<genexpr>zDeviceCopy in input programr9  Trt  )r'  rA  r  rr   r  r  r   r  rE   aot_inductoruse_runtime_constant_foldingr!  add_device_inforC  ra   r  r  r!  rK  ri   r   r"  r#  rk  rf  ri  r&  rM  )	r  r   r  rM  x_devicer  r  is_destination_pinnedis_source_pinnedr   r   r   r    sL   



zDeviceCopy.creater  rw   r   c                 C  s\   |   }t|dksJ | jr||d | j |d  d S ||d |  |d  d S )Nr   r   rD   )r  r   r  rL  r  )r  r  r   r   r   r   r    s   zDeviceCopy.codegenN)r   r   r  r  rM  r   r   r   r  )r   r   r   r  r  r  r   r   r   r   rf    s    /rf  c                      s`   e Zd ZdZdddZd ddZd! fddZd"ddZed 	d#d$ddZ	d%ddZ
  ZS )&DynamicSelectStorageOffseta  
    The result of computing a dynamic selection index is determined as follows: when the index in the
    select operation is unbacked, the actual index calculation is ambiguous for negative indices
    (index + size) versus non-negative indices (just index). To resolve this, we allocate an unbacked
    SymInt to represent the storage offset and decompose the select operation into a call to as_strided,
    computing the storage offset at runtime with this node.
    r   r7  c                 C  r  r   r<   r  r   r   r   r    r  z$DynamicSelectStorageOffset.get_readsr   c                 C  r  r  r   r  r   r   r   r    r  z*DynamicSelectStorageOffset.should_allocateunbacked_offset_symbolsympy.Symbolr   base_offsetsympy.Symbol | intbase_dim_strider  r  r   c                   sD   t  d ttddg  || _|| _|| _|| _|| _	|| _
d S Nr9  r  )r  r  r  r)  r  rn  r   rp  rr  r  r  )r  rn  r   rp  rr  r  r  r   r   r   r    s   	
z#DynamicSelectStorageOffset.__init__r   c                 C     t | jgS r   )r=   rn  r  r   r   r   r  	   r   z3DynamicSelectStorageOffset.get_unbacked_symbol_defsFr  c                 C  r  r   )r&   r   r=  r   r   r   r>     r  z/DynamicSelectStorageOffset.get_free_symbol_usesr  rw   c                 C  s   |j | | jd d S )Nr  )codegen_dynamic_select_indexr  r  r   r   r   r     r   z"DynamicSelectStorageOffset.codegenre  rR  )rn  ro  r   ro  rp  rq  rr  rq  r  rq  r  r   r   r   r  r_  rh  r  r   r   r   rk  r  r  r  r  r]   r>  r  r  r   r   r   r   rm    s    


rm  c                      s`   e Zd ZdZdddZdddZd  fddZd!ddZed 	d"d#ddZ	d$ddZ
  ZS )%DynamicSliceSizeac  
    Computes the output size of a slice call, handling the correct semantics in codegen.
    We do this for flexible handling for unbacked indices (to not data-dependent error).

    Slicing has 4 semantics for indices, i.e. x[start:] could be:
    1) start < -x.size(0)            -> x[0:]                    # negative out-of-bounds
    2) start in [-x.size(0), 0)      -> x[x.size(0) + start:]    # negative slicing
    3) start in [0, x.size(0))       -> x[start:]                # standard slicing
    4) start >= x.size(0)            -> empty slice              # positive out-of-bounds

    If the appropriate semantics are known beforehand, the output size is computed based on
    the start & end indices. If not (with unbacked indices), a new unbacked symbol is created
    to represent the output size, and codegen handles computing the correct case.
    r   r7  c                 C  r  r   r<   r  r   r   r   r  &   r  zDynamicSliceSize.get_readsr   c                 C  r  r  r   r  r   r   r   r  )   r  z DynamicSliceSize.should_allocateunbacked_size_symbolro  r  rq  r  r   r  c                   s>   t  d ttddg  || _|| _|| _|| _|| _	d S rs  )
r  r  r  r)  r  rx  r  r  r   r  )r  rx  r  r  r   r  r   r   r   r  ,   s   
zDynamicSliceSize.__init__r   c                 C  rt  r   )r=   rx  r  r   r   r   r  <   r   z)DynamicSliceSize.get_unbacked_symbol_defsFr  c                 C  s   t | j|t | j|S r   )r&   r  r  r  r=  r   r   r   r>  ?   s   
z%DynamicSliceSize.get_free_symbol_usesr  rw   r   c                 C  r  r   )codegen_dynamic_slice_sizer  r   r   r   r  G   r   zDynamicSliceSize.codegenre  rR  )
rx  ro  r  rq  r  rq  r   rq  r  rq  r  r_  rh  r  rv  r   r   r   r   rw     s    


rw  c                      sJ   e Zd ZdZdddZdddZd fddZdddZdddZ  Z	S )r   z;
    The result of a call to aten._local_scalar_dense.
    r   r7  c                 C  r  r   r<   r  r   r   r   r  P   r  zDynamicScalar.get_readsr   c                 C  r  r  r   r  r   r   r   r  S   r  zDynamicScalar.should_allocatesymro  keypathpytree.KeyPathrf  r   r   c                   s<   |   t d ttdd| |g || _|| _d S rs  )	r	  r  r  r  r)  r  r(  rz  r{  )r  rz  r{  rf  r   r   r   r  V   s   
zDynamicScalar.__init__r   c                 C  rt  r   )r=   rz  r  r   r   r   r  `   r   z&DynamicScalar.get_unbacked_symbol_defsr  rw   c                 C  r  r   )codegen_dynamic_scalarr  r   r   r   r  c   r   zDynamicScalar.codegenre  rR  )rz  ro  r{  r|  rf  r   r   r   r  r  )
r   r   r   rk  r  r  r  r  r  r  r   r   r   r   r   K   s    



r   c                      s`   e Zd ZdZdddZdddZd fddZdddZed 	ddddZ	d ddZ
  ZS )!r   z5
    The result of a call to aten._assert_scalar
    r   r7  c                 C  r  r   r<   r  r   r   r   r  l   r  zAssertScalar.get_readsr   c                 C  r  r  r   r  r   r   r   r  o   r  zAssertScalar.should_allocatescalarrs   r`  r   r   c                   s,   t  d ttddg  || _|| _d S rs  )r  r  r  r)  r  r~  r`  )r  r~  r`  r   r   r   r  r   s   
zAssertScalar.__init__c                 C  r  r  r   r  r   r   r   rQ  }   r  zAssertScalar.has_side_effectsFr  r   c                 C  r  r   )r&   r~  r=  r   r   r   r>     r  z!AssertScalar.get_free_symbol_usesr  rw   c              	   C  s   t jsd S tt| jdd}tjjrd S tjjr;d| d}tjj	j
| jdd}|d| d| j d| d	 d S tjj	j| jdd}|d
| d |dt| j d ||   d d S )NFr  zstd::to_string(r5  )r  zif (!(z()) { throw std::runtime_error("Expected z but received " + z); }zif not (z):z    raise RuntimeError(z = None)rE   scalar_assertsr  r  r>  rr   r  r/  r!  r  codegen_cpp_sizevarr~  r  r`  codegen_python_sizevarrE  r  )r  r  r  
symbol_strsizevarr   r   r   r     s&   zAssertScalar.codegenre  rR  )r~  rs   r`  r   r   r   r_  rh  r  )r   r   r   rk  r  r  r  rQ  r]   r>  r  r  r   r   r   r   r   g   s    


r   c                   @  s   e Zd ZU ded< ded< dS )ExternKernelNoder   r   zexport_schema.Noder   Nr   r   r   r   r   r     s   
 r  c                      s   e Zd ZdZ	d<ddd= fddZd> fddZd?ddZd@ddZdAd!d"Ze	dBd&d'Z
dCd)d*ZdDd,d-ZdDd.d/Zd0d1 Zed?d2d3Ze	dEd7d8ZedFd:d;Z  ZS )Gr\  z
    A class that represents a fallback kernel for handling operators that are not
    directly support by inductor. It currently supports functional ops, view ops,
    inplace aten ops, and mutating ops that are auto-functionalizable.
    Nr  rg  r6  r  r   rU  r  nontensor_argsr6  r?  r  r   r  r  )dict[sympy.Symbol, pytree.KeyPath] | Noner   r   c                  sx  t  j|t|t||d d _|pi  _t|tjjtjj	fs.J d| dt
| d| _| _|d u r:i n| _ jd usDJ tj j g  _g  _t jtjj	r[d S d j v rdd S  jj}tjj jr| j|d   d S ddd}	|jrt js|	 jstd j   j j\}
}d fdd}tjj ||
|D ]	\}}||| qd S )NrI  F#Fails to create FallbackKernel for r4   not supported_c10d_functionalr   rv  rb  r   r   c                 S  s2   t j|  t jjjpt| dot jjj| jv S )N
py_kernels)r)  _C%_dispatch_has_kernel_for_dispatch_keyr   DispatchKeyFunctionalizer  r  ru  r   r   r   has_functionalize_impl   s   
z7FallbackKernel.__init__.<locals>.has_functionalize_implz'NYI: Can't generate FallbackKernel for infotorch._C.Argumentr  r   r   c                   s   t  jtjrt |ttfsJ t|t jr#t |ttfr#J |d u r)d S  jd u r0d S d fdd}t	 jrO|d urK|D ]}|| qDd S d S t jsWJ || d S )Nr,  r   r   r   c                   sL   j |    jd usJ  jjr$jtt|  d|  d S d S r  )	alias_namesr  r  
alias_infois_writer.  r%  r  r'  r,  )r  r  r   r   	add_alias!  s   zPFallbackKernel.__init__.<locals>.handle_aliasing_and_mutation.<locals>.add_alias)r,  r   r   r   )
r   r   r)  ListTyper   r   library_utilsis_tensor_like_typer  is_tensorlist_like_type)r  r  r  optional_tensor_argr  )r  r   handle_aliasing_and_mutation!  s"   

z=FallbackKernel.__init__.<locals>.handle_aliasing_and_mutation)rv  rb  r   r   )r  r  r  r   r   r   )!r  r  r   use_runtime_dispatchr  r   r)  r
  r  r+  r   r  r?  r   r  rr   r  warn_fallbackr  r  r   r  _libraryrz  mutates_and_returns_first_argr  r  
is_mutabler#   r  r~  r  
zip_schema)r  rg  r  rU  r  r?  r   r  schemar  r   r  r  r  r   r  r   r     sV   

	
zFallbackKernel.__init__r5  c                   sH   t   }| jtjjju r"| jD ]}t|t	r!|
t| }q|S r   )r  r6  r  r)  _prims	rng_primsgraphsafe_run_with_rng_stater  r   rD  	with_readrF   r  r  )r  rL  r  r   r   r   r6  *!  s   


zFallbackKernel.get_read_writesr  rw   c                 C  s   | |  | jt| dd S Nr  )(codegen_unbacked_symbol_defs_for_outputsr  r  r   r  r   r   r   codegen_unbacked_symbol_defs6!  s   z+FallbackKernel.codegen_unbacked_symbol_defsr   c                 C  >   t | dd  }rttjjj|}|d usJ t| S t S r  r   r8   rr   r  r  r   r=   r`  r  r  resolvedr   r   r   r  ;!     
z'FallbackKernel.get_unbacked_symbol_defsr   c                   s   t jG dd d t| jsJ  fdd| jD }| || j\}}tjjrDt	| j
tjjrD| ||}dd t| j
jj|D }ndd |D }| j| |S )Nc                   @      e Zd ZU ded< dddZdS )	z)FallbackKernel.codegen_args.<locals>.Shimr   refr   r   c                 S  r  r   )r  r  r   r   r   r  J!  r  z2FallbackKernel.codegen_args.<locals>.Shim.__repr__Nr\  )r   r   r   r   r  r   r   r   r   ShimF!     
 r  c                   s   g | ]} |  qS r   r  rt  r  r   r   r   N!  r   z/FallbackKernel.codegen_args.<locals>.<listcomp>c                 S  s"   g | ]\}}t jj||jqS r   )rr   r  r  r  r  )r   paramr   r   r   r   r   R!  s    c                 S  r  r   r  rt  r   r   r   r   W!  r  )rl  	dataclassr  r~  r?  r  rr   r  r!  r   r  r)  r
  r  r  r   r  r  r   rl  )r  rU  r   r   r   r  r   r  E!  s   zFallbackKernel.codegen_argsSequence[torch.Tensor] | Noner\  r   c                 C  s   | r	dd | D nd }|r| sJ dd | D }|d S t |tjr%|jS t |tjjtfs2t|r7tdS t |tt	frvt
dd |D }dd |D }t|d	krX|d S |s\d S |D ]}t |tjshJ t|jrq|  S q^|d S d S )
Nc                 S  s   g | ]	}t |ts|qS r   )r   r  r  r   r   r   r   b!  r  z.FallbackKernel.find_device.<locals>.<listcomp>c                 S  s   g | ]
}|  r|  qS r   r  r  r   r   r   r   h!  r  r   r9  c                 s  s    | ]	}t d |V  qd S r   )r\  find_devicert  r   r   r   r   q!  s
    

z-FallbackKernel.find_device.<locals>.<genexpr>c                 S  s   g | ]}|r|qS r   r   )r   r  r   r   r   r   w!  r  rD   )r   r)  rZ  r  r  ScriptObjectr'   r)   r   r   r=   r   ri   r   )rU  r\  non_torch_bind_tensor_argsdevices
device_setr  r   r   r   r  ]!  s@   

zFallbackKernel.find_devicer   c                 C  s   ddl m} || jS )Nr   )	is_impure)torch._library.utilsr  r  )r  r  r   r   r   rQ  !  s   
zFallbackKernel.has_side_effectsr  c                 C  sr   t | jtjjtjjfsJ d| j dt| j dt | jtjjs6d| j vr6| jjj	r6t
| jr6g S | jS )Nr  r4  r  r  )r   r  r)  r
  r  r+  r   r   r  r  r#   r  r  r   r   r   rF  !  s    
z+FallbackKernel.get_inputs_that_alias_outputc                 C  s   t | jdks	J | jS r8  )r   r  r  r   r   r   rD  !  s   z!FallbackKernel.get_mutation_namesc                   sf  t d j ttsJ tjj	\}
|}fddjD }j}tjjs=g ||S tdg }|||}ddd t|tjjjrb||d |d j}n|jj}t|dkrjrrjnj}|d j} ||g}	n fddt|jD }	jdusJ t tjj  ||	i dd}
tj!"|
 g ||S )a  
        ProxyExecutor Design Note
        We export the ExternFallbackNodes (for custom ops) into a serialized file
        and run it with a host side proxy executor to address the ABI problem
        This is currently only implemented for fbcode. Eventually, we will also make this work for OSS.
        Detailed design doc can be found at
        https://docs.google.com/document/d/1wC4DOZFaYym2t1Esz0X5yxlLI3RDnSiyRbUus3bkJ64/edit?usp=sharing
        z4Extern kernel node added for node %s with target %s.c                   s   g | ]}j |fi  qS r   r  r2  r   r  r   r   r   !  r  z<FallbackKernel.export_extern_kernel_node.<locals>.<listcomp>Nreturn_type1torch.TensorType | torch.ListType | torch.JitTypert  IRNode | Sequence[IRNode]r   export_schema.Argumentc                 S  sn  t | tjtjfrE|}t |ttfrt|dksJ |d }t | tjr8t |ts+J tj	j
tj| ddS |d u s>J tj	j
ddS t | tjrjt |  tjrjt |ts^J t|tj	j
dd |D d	S t | tjrt |  tjr|d u rtj	j
tjj
ddd
S t |tsJ tj	j
tjj
tj| ddd
S t | tjrtj	j
|dS tdt|  )NrD   r   r   )	as_tensorT)as_nonec                 S  s   g | ]
}t j| d qS )r   )export_schemaTensorArgumentr  )r   r  r   r   r   r   !  s    zZFallbackKernel.export_extern_kernel_node.<locals>.handle_single_output.<locals>.<listcomp>)
as_tensors)as_optional_tensor)as_intzUnsupported return type )r   r)  
TensorTypeNoneTyper   r   r   r   r  rt   r  r  r  r  getElementTyper   r   OptionalTypeOptionalTensorArgumentIntTypeRuntimeError)r  rt  r  r   r   r   handle_single_output!  sR   

zFFallbackKernel.export_extern_kernel_node.<locals>.handle_single_outputr   rD   c                   s   g | ]
\}} |j |qS r   )r  )r   return_schemart  )r  r   r   r   "  s    )r  r~  r  r  )r   r   )r  r  rt  r  r   r  )#r\  r  r  r  r   r\  r   r?  r~  r  r  r  rr   r  aot_moder"   serialize_inputsr)  rM  	torchbindCallTorchBindr  returnsr  r   r  r.  r  r   r  r  r;   r   extern_kernel_nodesr  )r  r   ordered_kwargsr  
serializernamed_argumentsr  r  r  output_argumentsr   r   )r  r   r  r   export_extern_kernel_node!  sN   	

5



z(FallbackKernel.export_extern_kernel_nodec                   s  j }|dus	J |jdkr5t|tjjsJ t|tjj	r4ddl
m} t||vr4td| d_n|jdkrHt|tjjsGJ t|ntjj	rS|tjjv_tjj	rt|tjjrjsd fdd jj\}t|fddjD }t fddt||jjD _| jrƈ }jdusJ j dusJ | ! jfddj |j"rj"nj# n|$ tj%t&r'| (| )| *| dS )r  Nr  r   r  zG%s is missing a c-shim implementation, using proxy executor as fallbackT
_quantizedr,  torch.JitTyper   r   c                   s$   t | tjr |  S t | tjS r   )r   r)  r  r  
NumberTyper  	is_numberr   r   r  C"  s   z)FallbackKernel.codegen.<locals>.is_numberc                 3  s"    | ]}j |fi  V  qd S r   r  r  r  r   r   r   P"  r  z)FallbackKernel.codegen.<locals>.<genexpr>c                 3  s(    | ]\}}t |to |jV  qd S r   )r   complexr  )r   r  r  r  r   r   r   U"  s
    
c                     s   g      S r   )r  r  r   r  r   r   r@  c"  r  z(FallbackKernel.codegen.<locals>.<lambda>)r,  r  r   r   )+r  r"  r   r)  r
  r  r   rr   r  r!  r%  r  r   r\  r  r  rE   rg  custom_ops_to_c_shimsr?  r~  r  rv  r?  r  r  r   r  r  r  r  r  ,generate_fallback_kernel_with_runtime_lookupr  r  r.  generate_fallback_kernelrg  r  r  r  r  r  )r  r  r  r  r   	args_iterexported_argsr   )r  r   r  r   r  "  sl   










zFallbackKernel.codegenrt  r  ri  c                 C  sJ   d}z|   }W n	 ty   Y nw t| j| jt|  t|  |dS )NFrt  )rk  r  ri  r  r  r_   r  r  )rt  rk  r   r   r   r]  r"  s   

zFallbackKernel.tensor_to_layoutr   c                   s\  t jf}||vrttd tjj}nt }|  j|g|R i |\}}}}	}
W d   n1 s3w   Y  t	|t
jjrt	|t
jrddlm}m}m}m} d}||jr\||}|du rd||}|durt||dkrt|j|jg |jg | d}t|t|t||t||dS tdd |D  ||st	|t
jj j!s|t
j"j#j$u rt
d	t	|t
jjrtjj%s׈rt&j'|||||	||
d
	}|dur|S |du r t(d||||	||
dnsJ d t)d||||	||
dd fdd|g }t	|tt*fr|_+|S t	|t,r(t*|_+|S |g_+|S )z9Create an instance of FallbackKernel from an _OpOverloadsNr   )_is_functionalget_out_arg_nameslookup_manual_out_variantto_out_variantrD   r  r  r  r  )rg  r~  r  r   r  r  c                 s  r  r   )rU  r  r   r   r   r   "  r  z(FallbackKernel.create.<locals>.<genexpr>r9  r  has_unaligned_inputr  r   r  z"Not sure where to find device infort  r   rM  list[tuple[Any, int]]r   c                   s  t ttfrt fddttD S t tr, fdd D S t tj	rNt
 }tjsDsDtsLtjj|j |S t trUS t tjr_jjS t tjjtfsltrvttd S d u sJ dt dd S )Nc                 3  s,    | ]} | t |fg V  qd S r   r   r   generate_outputrM  rt  r   r   r   "  s
    
zAFallbackKernel.create.<locals>.generate_output.<locals>.<genexpr>c                   s*   i | ]\}}| |t |fg qS r   r  )r   rC  r  r  r   r   r   "  r  zBFallbackKernel.create.<locals>.generate_output.<locals>.<dictcomp>r  zFallbackKernel output type z is not supported)r   r   r   r   r   r   r   r  r)  rZ  rL  r]  rE    assume_unaligned_fallback_outputro   rr   r  rW  r  r   r   rv  r   r  r  r  r'   r)   r   r  )rt  rM  rO  r  r  r  r  packed)rM  rt  r   r  "  sR   



z.FallbackKernel.create.<locals>.generate_output)rt  r   rM  r  r   r   )-r  *_fused_moving_avg_obs_fq_helper_functionalr   r   rr   r  r#  r	   ra  r   r)  r
  r  rZ  torch._library._out_variantr  r  r  r  r  r   ri  r  r  rW  r  r  r   _make_out_variant_kernel_namer  r  rM  r  r  rp   higher_orderprintr!  ExternKernelMultiOut
try_creater  r2  r   r  r   )r  r  r   r   fake_incorrect_kernelscontextr\  rU  rV  r?  r  r  r  r  r  out_oprg  
out_resultr  r   r  r   r  "  s   





+
zFallbackKernel.creater   rg  r6  r  r   rU  r  r  r6  r?  r  r   r  r  r  r   r   rd  r  r  r  )rU  r  r\  r6  r   r   rR  rj  )rt  r  r   ri  )r  r   r   r   r   r   r   r\  )r   r   r   rk  r  r6  r  r  r  rn  r  rQ  rF  rD  r  r   r  r]  r  r  r  r   r   r   r   r\     s,    	w




%

yUr\  c                      s>   e Zd ZdZdddZdddZd	d	d
d fddZ  ZS )ComplexViewz9View a complex number as two dtyped numbers or vice versar   r   c                 C  r  r  r   r  r   r   r   r  '#  r  zComplexView.should_allocater  c                 C  rG  r  rH  r  r   r   r   rF  *#  r[  z(ComplexView.get_inputs_that_alias_outputNr  rg  r6  r  r   rU  r  r  r6  r?  r  r   r  r  r  r   c             	     s   t  j|||||||d d S )Nr  )r  r  r  rg  r  rU  r  r?  r   r  r   r   r   r  .#  s   
zComplexView.__init__rR  rj  r	  )r   r   r   rk  r  rF  r  r  r   r   r   r   r	  ##  s    

r	  c                   @  s   e Zd ZdZd	ddZdS )
MemoryCheckKernelz
    Custom kernel for memory checking that generates direct function calls

    TODO - the custom op was erroring with str inputs. should be able to custom op directly.
    r  rw   r   r   c                 C  sl   |   | j\}}}t|}t|}|r&|d d| d| d| d}n	d| d| d}|| dS )z.Override codegen to write direct function callzV# note: dont currently distinguish between buffers returned and dealloc'd in last stepzcheck_memory_step(allocated=z, freed=z, is_final_step=r5  N)r  r  rE  r  )r  r  
alive_list	dead_listis_final_step
alive_repr	dead_reprcallr   r   r   r  K#  s   zMemoryCheckKernel.codegenNr  )r   r   r   rk  r  r   r   r   r   r	  D#  s    r	  c                   @  r  )	r2  r  r  r   r  c                 C  r  r   r  r  r   r   r   r'  a#  r  zMultiOutputLayout.get_deviceNrW  )r   r   r   r   r'  r   r   r   r   r2  ]#  r  r2  c                      s`   e Zd ZdddZ	dd  fddZed 	dd!ddZd"ddZd#ddZd$ddZ	  Z
S )%rL  r  rw   r   r   c                 C  s,   | |  | js| | | | d S d S r   )codegen_multi_output!skip_size_stride_alignment_checksr  r  r  r   r   r   r  f#  s
   

zMultiOutput.codegenFrg  r6  r  r   rM  list[tuple[Any, ...]]r	  r   c                   s>   t  d ||gd tj| | _tj|  || _|| _d S r  )	r  r  rr   r  r)  r   r*  rM  r	  )r  rg  r  rM  r	  r   r   r   r  l#  s
   
zMultiOutput.__init__r  r   c                 C  s&   | j d }t|tsJ |||S r  )r~  r   r   r>  )r  r  r1  r   r   r   r>  y#  s   

z MultiOutput.get_free_symbol_usesc                 C  s   t | jdkot| jd tS )NrD   r   )r   r~  r   r  r  r   r   r   r  #  s   zMultiOutput.should_allocater  c                 C  s   dd | j D S )Nc                 S  s.   g | ]}t |trt| d kr| qS r  )r   r\  r   rF  r  r   r  r   r   r   r   #  s    z<MultiOutput.get_inputs_that_alias_output.<locals>.<listcomp>)r~  r  r   r   r   rF  #  s   z(MultiOutput.get_inputs_that_alias_outputr5  c                   s   t  }| jD ]}t|tr|t|  q|  |  	  d fdd}| 
 }tj p;|d u p;t|j }tj||  d|d	}tj||jt  d
S )Nr   r6  r*  r   r   c                   r>  r?  rA  r  rB  r   r   r9  #  rC  z*MultiOutput.get_read_writes.<locals>.dummyr   r3  r  r:  )r=   r~  r   r   r  rF   r  r  r#  r  r'  rE   r  ri   r   rO   r!  r  r  )r  r:  r  r9  r  should_normalizewrite_rwr   rB  r   r6  #  s,   


zMultiOutput.get_read_writesr  r_  )
rg  r6  r  r   rM  r	  r	  r   r   r   rh  rR  rj  rd  )r   r   r   r  r  r]   r>  r  rF  r6  r  r   r   r   r   rL  e#  s    


rL  c                      sD   e Zd ZdZd fddZedddZdddZdddZ  Z	S )r   zMultiOutput for opaque objects.rg  r6  r  r   rM  r	  opaque_valuer   r   r   c                   s   t  j|||dd || _d S )NTr	  )r  r  rK  )r  rg  r  rM  r	  r   r   r   r  #  s   
zOpaqueMultiOutput.__init__r   c                 C  r  )NzOpaqueMultiOutput has no dtype)r{  r  r   r   r   r  #  r  zOpaqueMultiOutput.dtypec                 C  r  r   r   r  r   r   r   r  #  r  z#OpaqueMultiOutput.wrap_for_loweringr5  c                 C  sV   t  }| jD ]}t|tr|t|  qt t|  g}tj||t  dS )Nr  )	r=   r~  r   r   r  rF   r  r  r  )r  r:  r  r  r   r   r   r6  #  s   

z!OpaqueMultiOutput.get_read_writes)
rg  r6  r  r   rM  r	  r	  r   r   r   )r   r   )r   r   rd  )
r   r   r   rk  r  rq  r  r  r6  r  r   r   r   r   r   #  s    

r   c                   @  s$   e Zd ZdZdddZdd	d
ZdS )AllocatingMultiOutputzMultiOutput with Inductor-controlled allocation for .out() variant ops.

    Overrides should_allocate()=True so Inductor allocates the output buffer,
    and skips tuple-indexing codegen since .out() writes directly into these buffers.
    r   r   c                 C  r  r  r   r  r   r   r   r  #  r  z%AllocatingMultiOutput.should_allocater  rw   r   c                 C  s"   | j s| | | | d S d S r   )r	  r  r  r  r   r   r   r  #  s   
zAllocatingMultiOutput.codegenNrR  r  )r   r   r   rk  r  r  r   r   r   r   r	  #  s    
r	  r	  rb  c                 C  s4   | j }| jjdd }| j}d| d| d| S )z8Build fully-qualified kernel name for an out-variant op.z::rD   z
torch.ops.r  )r"  r  r   r  r#  )r	  nsr  r   r   r   r   r  #  s   r  c                      sT   e Zd ZU dZded< ded< d( fddZd)ddZedddd*d&d'Z  Z	S )+r  zMulti-output .out() variant lowering.

    Subclass of FallbackKernel that emits .out() calls with pre-allocated
    output buffers. Uses AllocatingMultiOutput child nodes for each output.
    r   out_arg_nameszlist[AllocatingMultiOutput]out_variant_output_nodesr   r   r	  rb  r   r   r   c                  s2   t  j|i | || _g | _t|| _|| _d S r   )r  r  r	  r	  r  r  r  )r  r	  r	  r   r   r   r   r   r  #  s
   

zExternKernelMultiOut.__init__r  rw   c                 C  s   |  | ||  d S r   )r   generate_extern_kernel_multi_outr  r   r   r   r  $  r  zExternKernelMultiOut.codegenNFr  r  r\  r  r  rU  r  rV  r6  r?  r  r  r  r  r  r   &Sequence[AllocatingMultiOutput] | Nonec                C  s<  ddl m}
m}m} |
|jsdS t|ttfsdS ||}|du r$dS ||}tdd |D s3dS t	|t	|kr=dS | t
|d||||||||d	}g }t|D ]7\}}t|j|jg |jg | d}t||t||fgd	}tjs}|	s}t|stjj|j || qS||_||_t|trt|S t|S )
zGCreate an ExternKernelMultiOut if the op has a matching .out() variant.r   )r  r  r  Nc                 s  s    | ]	}t |tjV  qd S r   )r   r)  rZ  r  r   r   r   r   '$  rF  z2ExternKernelMultiOut.try_create.<locals>.<genexpr>r  )r   r  r	  r	  r  )rg  r  rM  )r  r  r  r  r  r   r   r   r   r   r2  r  ri  r  r  rW  r  r	  r   rE   r  ro   rr   r  rW  r  r   r  r	  r  )r  r  r\  r  rU  rV  r?  r   r  r  r  r  r  r	  r	  r  r  r   
tensor_outrg  	multi_outr   r   r   r 	  $  sd   


zExternKernelMultiOut.try_create)
r   r   r	  rb  r	  r   r   r   r   r   r  )r  rb  r\  r   r  r  rU  r  rV  r6  r?  r  r   r  r  r  r  r   r   r	  )
r   r   r   rk  r   r  r  r  r 	  r  r   r   r   r   r  #  s   
 
r  c                   @  s  e Zd ZU dZded< dvddZdwd	d
ZdxddZdyddZdzddZ	d{ddZ
d|d}ddZd~dd Zdd!d"Zdd#d$Zdvd%d&Zdd'd(Z	)ddd-d.Zdd/d0Zdd2d3Z	)ddd5d6Zdd8d9Zdd;d<Zdd=d>Zdd@dAZddCdDZddFdGZdvdHdIZdvdJdKZddNdOZddQdRZd{dSdTZddUdVZ ddWdXZ!e"d 	)ddd[d\Z#dd^d_Z$ddadbZ%d|ddedfZ&e'ddhdiZ(ddkdlZ)ddmdnZ*ddodpZ+e'ddrdsZ,d{dtduZ-e-Z.dS )r  zC
    TensorBox / StorageBox allow in-place mutation of Tensors
    r   rf  r   r   c                 C  rd  r   ri  r  r   r   r   r  _$  r  z!MutableBox.has_exceeded_max_readsr  c                 C  rd  r   rf  r  r   r   r   r'  b$  r  zMutableBox.get_devicer  c                 C  rd  r   r  r  r   r   r   r  e$  r  zMutableBox.make_loaderr  c                 C  rd  r   )rf  r  r  r   r   r   r  h$  r  zMutableBox.make_indexerrU  c                 C  rd  r   )rf  rK  r  r   r   r   rK  k$  r  zMutableBox.get_strider   c                 C  rd  r   rg  r  r   r   r   r  n$  r  zMutableBox.get_nameNr"  r  c                 C  rY  r   )rf  r$  r#  r   r   r   r$  q$  r   zMutableBox.has_large_inner_fnr%  r   r   c                 C  rY  r   rh  r&  r   r   r   r'  t$  r   zMutableBox.mark_reusec                 C  rd  r   rk  r  r   r   r   r(  w$  r  zMutableBox.realize_hintc                 C  rd  r   )rf  r  r  r   r   r   r  z$  r  zMutableBox.unwrap_viewc                 C  rd  r   )rf  r!  r  r   r   r   r!  }$  r  zMutableBox.is_input_bufferc                 C  rd  r   )rf  r)  r  r   r   r   r)  $  r  zMutableBox.freeze_layoutFr   r   r*  c                 C     | j ||S r   )rf  r,  r+  r   r   r   r,  $  r-  z*MutableBox.freeze_layout_with_stride_orderc                 C  rY  r   )rf  r/  r.  r   r   r   r/  $  r   z(MutableBox.freeze_layout_with_fill_orderr  c                 C  rY  r   )rf  r1  r0  r   r   r   r1  $  r   z(MutableBox.freeze_layout_with_same_orderr2  c                 C  r	  r   )rf  r4  r3  r   r   r   r4  $  r-  z+MutableBox.freeze_layout_with_exact_stridesr5  c                 C  rd  r   )rf  r6  r  r   r   r   r6  $  r  zMutableBox.get_read_writesr7  c                 C  rd  r   r  r  r   r   r   r  $  r  zMutableBox.get_readsc                 C  rd  r   r  r  r   r   r   r;  $  r  zMutableBox.num_readsr~   c                 C  rd  r   rl  r  r   r   r   r<  $  r  zMutableBox.get_storage_numelr3  c                 C  rd  r   r  r  r   r   r   r?  $  r  zMutableBox.get_reduction_typer  c                 C  rd  r   r  r  r   r   r   r@  $  r  zMutableBox.get_reduction_sizec                 C  rd  r   rm  r  r   r   r   rA  $  r  zMutableBox.is_externc                 C  rd  r   )rf  rB  r  r   r   r   rB  $  r  zMutableBox.is_no_opr  r  c                 C  rY  r   r  r  r   r   r   rC  $  r   zMutableBox.constant_to_devicer  c                 C  rd  r   )rf  rD  r  r   r   r   rD  $  r  zMutableBox.get_mutation_namesc                 C  rd  r   )rf  rE  r  r   r   r   rE  $  r  zMutableBox.get_operation_namec                 C  rd  r   )rf  rF  r  r   r   r   rF  $  r  z'MutableBox.get_inputs_that_alias_outputc                 C  rd  r   rj  r  r   r   r   r	  $  r  zMutableBox.realizer  r   c                 C  rY  r   rZ  r=  r   r   r   r>  $  r  zMutableBox.get_free_symbol_usesr  c                 C  rd  r   ro  r  r   r   r   r  $  r  zMutableBox.get_read_namesr  c                 C  rd  r   )rf  r  r  r   r   r   r  $  r  zMutableBox.get_defining_opr
  r  c                 C  rY  r   )rf  r  r  r   r   r   r  $  r   zMutableBox.codegen_referencer6  c                 C  rd  r   rf  r  r  r   r   r   rg  $  s   
zMutableBox.layoutr  c                 C  rd  r   re  r  r   r   r   r#  $  r  zMutableBox.get_layoutc                 C  rd  r   r 	  r  r   r   r   r  $  r  zMutableBox.get_output_specc                 C  rd  r   r  r  r   r   r   r!  $  r  zMutableBox.get_sizer  c                 C  r  r   )rf  r  r  r   r   r   r  $  r  zMutableBox.dtypec                 C  sn   t | jtrt| j dt| jj d}d}| jj}nt| j d}| j}d}|tt||g}d|S )Nr  z))r5  r  )r   rf  r  r   r   r  r   r  )r  line0endlr`  r  r   r   r   r  $  s   


zMutableBox.__str__rR  rW  rY  rZ  r[  r\  r   r]  r^  rI  rH  r_  r`  ra  rb  rc  rd  re  rf  rg  rU  rS  ri  rj  rh  rJ  rL  rV  rQ  rP  rO  )/r   r   r   rk  r   r  r'  r  r  rK  r  r$  r'  r(  r  r!  r)  r,  r/  r1  r4  r6  r  r;  r<  r?  r@  rA  rB  rC  rD  rE  rF  r	  r]   r>  r  r  r  rq  rg  r#  r  r!  r  r  r  r   r   r   r   r  W$  sb   
 































r  c                   @  s>   e Zd Zeed
ddZeedddZedddZd	S )r   rf  r   r   c                 C  r  r   r   rf  r   r   r   r  $  r  zTensorBox.creater   c                 C  r  r   r   r#	  r   r   r   r  $  r  c                 C  s   t | tr| S tt| S r   )r   r   r   r  r#	  r   r   r   r  $  s   
N)rf  r   r   r   )rf  r   r   r   )rf  r   )r   r   r   r   rn  r  r   r   r   r   r   $  s    c                   @  sj   e Zd ZdZdddZdddZdd	d
ZdddZdddZdddZ	d ddZ
d!ddZd"ddZdS )#r  z7
    StorageBox allow in-place mutation of Tensors
    r   r   c                 C  s&   t | jttfr| j tjjv S dS r  )r   rf  r  rl  r  rr   r  r   r  r   r   r   r!  %  s   zStorageBox.is_input_bufferc                 C  s   t | jto| j tjjv S r   )r   rf  r  r  rr   r  rH  r  r   r   r   rn  %  s   zStorageBox.is_module_bufferr3  c                 C  s   t | jr| j S t| jttttfsJ t	| j| j
 }| j }| j }|d us1J td t|| j | j dd| jd| _tj| j| j_tj| j | j| j_|| j_|| j_| jjj| j_| jjS )NF)r  r  r  rk  r=  )r   r  rf  r  r   r  r  r  r7  r   r  r  r'  r  r$  r&  r!  rr   r  r)  r   r*  r  r  r  r  )r  r  r  r  r   r   r   r	  %  s4   





zStorageBox.realizer   c                 C  s4   t | jttfr| j jdkr|   dS dS dS )zL
        Called on buffers we expect to be forced to realize later.
        rD   N)r   rf  r  r  r  nontrivial_read_countr	  r  r   r   r   r(  ,%  s   zStorageBox.realize_hintr"  r   c                   s^   ddl m   fdd|  D }|sdS t|}t|}t|}||ko.|| dko.||kS )Nr   is_nonfreeable_buffersc                   s    g | ]} |st j|qS r   )rr   r  get_dep_size_hintr  r%	  r   r   r   9%  s    
zCStorageBox.has_accumulated_enough_reads_by_size.<locals>.<listcomp>Fr   )r  r&	  r  r  r  r  )r  r"  size_of_reads
total_sizemax_sizemin_sizer   r%	  r   $has_accumulated_enough_reads_by_size6%  s   

z/StorageBox.has_accumulated_enough_reads_by_sizec                 C  s8   t | jto|  tjkp|  ptjd uo| tjS r   )	r   rf  r  r;  rE   realize_acc_reads_thresholdr$   realize_acc_reads_size_thresholdr,	  r  r   r   r   r  I%  s   
z!StorageBox.has_exceeded_max_readsr%  c                   sh   |dkr2t | jttfr2t| jr'| j  ddg}t fdd|D r'dS |  tj	kp1| 
 S dS )zj
        A heuristic to decide if we should realize a tensor
        that is used multiple times.
        rD   expsigmoidc                 3  s    | ]}| j v V  qd S r   )used_opsrt  opcountr   r   r   _%  r  z5StorageBox.should_realize_on_reuse.<locals>.<genexpr>TF)r   rf  r  r  rC  r  r  r;  rE   realize_reads_thresholdr$  )r  r%  	heavy_opsr   r2	  r   should_realize_on_reuseU%  s   

z"StorageBox.should_realize_on_reusec                 C  s   |  |r|   d S d S r   )r6	  r	  r&  r   r   r   r'  g%  s   
zStorageBox.mark_reusec                 C  rd  r   r  r  r   r   r   r;  k%  r  zStorageBox.num_readsNrR  rU  rI  )r"  r   r   r   )r%  r   r   r   r^  rf  )r   r   r   rk  r!  rn  r	  r(  r,	  r  r6	  r'  r;  r   r   r   r   r  $  s    








r  c                   @  s*   e Zd ZU ded< ded< dZded< dS )Subgraphr   r   rs  graph_moduleNzGraphLowering | Noner  )r   r   r   r   r  r   r   r   r   r7	  o%  s   
 r7	  buffersc                 C  s,   dd | D } t tdd | D t | k S )Nc                 S  "   g | ]}t |tr| n|qS r   r   rl  r  r   rR  r   r   r   r   w%      z(_has_aliased_buffers.<locals>.<listcomp>c                 s  r  r   )r[  r<	  r   r   r   r   |%  r  z'_has_aliased_buffers.<locals>.<genexpr>)r   r=   )r9	  r   r   r   _has_aliased_buffersv%  s   r>	  c                      sj   e Zd ZU dZdZded< dZded< dZded< d fddZdddZ	e
dddZdddZ  ZS )InvokeSubgraphz.
    Ir node for the invoke_subgraph HOP.
    NSubgraph | Noner  Sequence[IRNode] | Noneoperandsr  r7	  r  rg  r2  r   r   c                   s6   t  jd ||d || _tj| | _tj|  d S r   )r  r  r  rr   r  r)  r   r*  )r  r  rB	  rg  r   r   r   r  %  s   zInvokeSubgraph.__init__r  c                 C  s   | j r| j gS g S r   )r  r  r   r   r   r  %  r  zInvokeSubgraph.get_subgraphsr   @list[ShapeAsConstantBuffer | NoneAsConstantBuffer | MultiOutput]c                   s  ddl m} tjj}d}|jd }r5d}|jtj	j
ju r,|jd tj	j
ju s*J d}|d |d }n&d}|jtj	j
ju rM|jd tj	j
ju sKJ d}|j|d }d	d
 |D } fdd
|D }g }	t|D ]\}
}t|tttfr||	| qj|	||||
  qj|	}|jdu rtjj|j||jd|_t|j |jj|  W d   n1 sw   Y  |jj}d}|D ]}t|ts| } nq|dusJ t||t|dddfddfdd
t|D }|_|S )zFor each operand, get a realized input, force it to have the same
        strides as the subgraph inputs, then use an InvokeSubgraphrD   )constrain_to_fake_tensorNeager_input_valsr   r   r   r   c                 S     g | ]}|j d  qS r  rz  rt  r   r   r   r   %  r   z)InvokeSubgraph.create.<locals>.<listcomp>c                   r,  r   r@  rt  rA  r   r   r   %  r   rr  r  r  r  )r  rB	  rg  rt  r   indr   r   :ShapeAsConstantBuffer | NoneAsConstantBuffer | MultiOutputc              	     sd   t | ttfr	| S |  }|d usJ tt||  |  |  | 	 j
| 	 jd t|fgddS )Nr  Tr	  )r   r   r  r'  rL  ri  r&  r!  rK  r#  rj  rk  r   )rt  rI	  r  )invoke_subgraphr   r   create_output%  s"   z,InvokeSubgraph.create.<locals>.create_outputc                   s   g | ]	\}} ||qS r   r   )r   r   rt  )rL	  r   r   r   %  r  )rt  r   rI	  r   r   rJ	  )ro  rD	  rr   r  r  rz  r  r  r)  rp   r  rO  r   rK	  r  r   r   rD  rG  r  r  r8	  r   r  r  graph_outputsr'  r?	  r2  r  )r  r  rB	  rD	  r  fake_operandsrE	  rj  fx_operandsnew_operandsr   operandr  r  outsr   )r  rL	  rK	  r   r  %  sh   


zInvokeSubgraph.creater  rw   c                 C  r  r   )codegen_invoke_subgraphr  r   r   r   r  &  r   zInvokeSubgraph.codegen)r  r7	  rB	  r  rg  r2  r   r   rM  )r  r7	  rB	  r   r   rC	  r  )r   r   r   rk  r  r   rB	  r  r  r  r  r  r  r  r   r   r   r   r?	  %  s   
 
hr?	  c                      s   e Zd ZU dZdZded< dZded< dZded< dZded	< dZ	d
ed< d- fddZ
d.ddZed/ddZed0d$d%Zd1d(d)Zd2d+d,Z  ZS )3Conditionala  
    IR node representing torch.cond

    Attributes:
        predicate: A boolean scalar tensor determining which branch to execute.
        operands: Input tensors passed to both true and false subgraphs.
        true_subgraph: Subgraph executed when predicate is True.
        false_subgraph: Subgraph executed when predicate is False.
        outputs: MultiOutput nodes representing the conditional's outputs.
    Nr  	predicaterA	  rB	  r@	  true_subgraphfalse_subgraphSequence[MultiOutput] | Noner  r   r  r7	  rg  r2  r  r  r   r   c           	        sj   || _ || _|| _|| _t|g|\}}t jd |||d |d ur&|| _tj	
| | _tj	|  d S N)r   rg  r~  r  )rU	  rB	  rV	  rW	  _split_by_sym_typer  r  r  rr   r  r)  r   r*  )	r  rU	  rB	  rV	  rW	  rg  r  sym_argsrU  r   r   r   r  &  s   	zConditional.__init__r  c                 C  ,   g }| j r|| j  | jr|| j |S r   )rV	  r  rW	  r  	subgraphsr   r   r   r  4&     zConditional.get_subgraphsr   int | torch.SymIntint | sympy.Exprc                 C  s   t | tr| S | jjS r   )r   r   r   r  )r   r   r   r   _maybe_expr<&  s   
zConditional._maybe_exprr   true_fnfalse_fnrY  list[MultiOutput]c              	     s    |} fddD tjjjd }t|ts J t|g }|D ]}t|tr4|	|j
d  q$|	| q$tjjj
d }ddd}	||fD ]8}
|
jdu rtjj|
j||
jd|
_t|
j |
jj|  |	|
jj||
j_W d   n1 s}w   Y  qJ|jdusJ |jdusJ |jj}|jj}d|fd|ffD ]\}}t|rtd| d| qt|t|ksJ ||ftt||D ]5\}\}}| | ksJ |||f| | ksJ |||f| j| jksJ |||fqtdd |g D ttjjjtjjj
dd}dus#J dt|||t d|dfddtt|tjjj
d D }|_!ddl"m#} ||j\}}}}}||j\}}}}}t$|t$|B }fddt%|D _&|S )zNCreate a Sequence of IRNodes from a conditional statement (see .lowering.cond)c                   r,  r   r@  rt  rA  r   r   r   N&  r   z&Conditional.create.<locals>.<listcomp>rI  r  rM	  r  fake_tensorsSequence[torch.Tensor]r   r   c                 S  sP   g }t | |D ]\}}t|tr|| q|tjt|| dd q|S NFrQ  )r   r   r   r  r  rn  r   r  )rM	  rf	  retrt  r@  r   r   r   _require_exact_strides^&  s   
z2Conditional.create.<locals>._require_exact_stridesNrH	  rc	  rd	  zVOutput aliasing is currently not supported in compiled torch.cond. The outputs of the z% subgraph of torch.cond are aliased: c                 s  s"    | ]}t |ts| V  qd S r   )r   r   r'  )r   or   r   r   r   &  s    
z%Conditional.create.<locals>.<genexpr>r  zcannot determine devicer  )rU	  rB	  rV	  rW	  rg  r  c                   sv   g | ]7\}\}}t t| d ur| n| dd | D dd | D | j| jd t	|fgqS )Nc                 S  rz  r   rT	  rb	  r   r  r   r   r   r   &  r   z1Conditional.create.<locals>.<listcomp>.<listcomp>c                 S  rz  r   rl	  rm	  r   r   r   r   &  s    
r  )
rL  ri  r'  r&  r  r  r#  rj  rk  r   )r   r   rt  merged_output)conditionalr  r   r   r   &  s&    
r   )-check_input_alias_and_mutation_return_outputsc                   s"   g | ]}t | j|  qS r   )r%  rg  r   r   )ro	  rB	  r   r   r   &  r=	  )rM	  r  rf	  rg	  r   r   )'rM  rr   r  r  r   r   r   r   r;   r  rz  r  r8	  r   r  r  rM	  r>	  r  r   r  r   r'  r&  r#  rj  r  r8   r  r   r  rT	  r2  r  torch._higher_order_ops.utilsrp	  r=   rH  r.  )r  rU	  rc	  rd	  rB	  rO	  rN	  fx_opfake_outputsrj	  r  true_outputsfalse_outputsr   r  r   t_of_or  rp	  r
  true_mutated_inputsfalse_mutated_inputsmutated_operand_indicesr   )r  ro	  r  rB	  r   r  B&  s   





	$	zConditional.creater  rw   c                 C  s*   | |  ||  | jt| di  d S r  )codegen_conditionalr  r  r  r   r  r   r   r   r  &  s   
zConditional.codegenr   c                 C  r  r  r  r  r   r   r   r  &  r  z$Conditional.get_unbacked_symbol_defs)rU	  r   rB	  r  rV	  r7	  rW	  r7	  rg  r2  r  r  r   r   rM  )r   r`	  r   ra	  )
rU	  r   rc	  r7	  rd	  r7	  rB	  rY  r   re	  r  r  )r   r   r   rk  rU	  r   rB	  rV	  rW	  r  r  r  rn  rb	  r  r  r  r  r  r   r   r   r   rT	  &  s    
 
 
rT	  r   r  -tuple[list[ShapeAsConstantBuffer], list[Any]]c                 C  s<   g }g }| D ]}t |tr||j q|| q||fS r   )r   r   r  r  )r   non_sym_argsr[	  r  r   r   r   rZ	  &  s   
rZ	  c                      s   e Zd ZU dZdZded< dZded< dZded< dZded< dZ	d	ed
< d, fddZ
d-ddZed.ddZed/ddZed0d#d$Zd1d'd(Zd2d*d+Z  ZS )3	WhileLoopzSThe IR node for while_loop and while_loop_stack_output. It supports input mutation.NrA	  carried_inputsadditional_inputsr@	  cond_subgraphbody_subgraphrX	  r  r  r7	  rg  r2  r  r  stack_outputr   r   r   c           
        sr   || _ || _|| _|| _tg ||\}}	t jd ||	|d |d ur'|| _|| _t	j
| | _t	j
|  d S rY	  )r	  r	  r	  r	  rZ	  r  r  r  r	  rr   r  r)  r   r*  )
r  r	  r	  r	  r	  rg  r  r	  r[	  rU  r   r   r   r  '  s$   

zWhileLoop.__init__r  c                 C  r\	  r   )r	  r  r	  r]	  r   r   r   r  %'  r_	  zWhileLoop.get_subgraphsc                 C  sr   t | s| S dd | D }t }g }t| |D ]\}}t||v r*|t| q|t| || q|S )Nc                 S  r:	  r   r;	  r<	  r   r   r   r   8'  r=	  z3WhileLoop._clone_aliased_inputs.<locals>.<listcomp>)r>	  r=   r   r[  r  r  r3  r  )r	  unwrapped_buffersseen_buffersr  original_inputunwrapped_bufferr   r   r   _clone_aliased_inputs0'  s   zWhileLoop._clone_aliased_inputsr  r   c                 C  sJ   t | tr| S t | ttfrt| S t | trt| S tdt|  )NzNYI unsupported output type: )r   r   r  rl  rL  r  r  r   )r  r   r   r   _maybe_wrap_as_tensor_boxJ'  s   


z#WhileLoop._maybe_wrap_as_tensor_boxcond_fnbody_fnr  c           &   	     s  ddl m} d*d	d
}tjjjd }tjjjd }	||	 }
dd |
D }dd |D }dd |	D }fdd|D }t|}|||}fdd|D }|||}||  ||fD ]S}|jdu rt|
t	smJ t
|
tjj|j|
|jd|_t|j' |jj|  ||u rt|jjt|ksJ ||jj||j_W d   n1 sw   Y  q[|jr|jsJ |jj}|jj}t|rtd| t|dksJ ||d }t|ts| tjksJ |t| dksJ |t dksJ d d  }|dusJ t|t|ksJ ||ftt||D ]?\}\}}d+dd}|| |  || |  | | ksOJ ||||f| | ks_J |||fq!|dushJ ttjjjtjjj !dd}t||||t"|d||d }|jdurt|jj#tj$j%sJ ||jj#|d! }t&|} fd"d|D }t'|}g }g |_(g |_)|rt|dksJ d#ttjjj d$ D ]/\} }!t*t+|!j,|!j-d%d |!. D d&d |!/ D d'|t0| fg}"|j(1|" |1|" qnVt|D ]Q\} }!| |v r/| t|k sJ d(t2|}#|j)1t3|#j4|#| |1|# qt*t+|! |! |! |! |!5 j6d)|t0| fg}"|j(1|" |1|" qt||D ]\}$}%|$7 tjj8v rrtjj9:|%7  q\|S ),zcreate the while_loop IR node. stack_output controls whether it stack
        each iterations' output, which is necessary for training.
        r   )check_input_alias_and_mutationtensor_boxesr  rf	  'list[int | torch.SymInt | torch.Tensor]r   r   c                 S  sl   t | t |ks
J g }t| |D ]"\}}t|tjr.t|}|tj	||
 dd q|| q|S rh	  )r   r   r   r)  rZ  r	  r	  r  r  rn  r  )r	  rf	  ri	  r  fknew_tbr   r   r   rj	  c'  s   

z0WhileLoop.create.<locals>._require_exact_stridesrI  c                 S  rF	  r  rG	  rt  r   r   r   r   '  r   z$WhileLoop.create.<locals>.<listcomp>c                 S  rF	  r  rG	  rt  r   r   r   r   '  r   c                 S  rF	  r  rG	  rt  r   r   r   r   '  r   c                   r,  r   r@  rt  rA  r   r   r   '  r   c                   r,  r   r@  rt  rA  r   r   r   '  r   NrH	  zOutput aliasing is currently not supported in compiled torch.while_loop. The outputs of the body_fn subgraph of torch.while_loop are aliased: rD   z9torch.while_loop is assumed to have at least one operand.	lhs_exprsSequence[int | sympy.Expr]	rhs_exprsr   c                 S  s<   t | t |ks
J t| |D ]\}}tjj|| qd S r   )r   r   rr   r  r  r  )r	  r	  lhsrhsr   r   r   _guard_list_equals'  s   z,WhileLoop.create.<locals>._guard_list_equalsr  r  )r	  r	  r	  r	  rg  r  r	  r   c                   r   r   r   rq	  )
all_inputsr   r   r   '  r  z-NYI: while_loop_stack_output input mutations.r  c                 S  rz  r   rl	  rm	  r   r   r   r   (  r   c                 S  rz  r   rl	  )r   r  r   r   r   r   (  r   r  zonly carries can be mutated.)r  r  r  r  rj  )r	  r  rf	  r	  r   r   )r	  r	  r	  r	  r   r   );rr	  r	  rr   r  r  r   r	  r	  r   r   r   r  r8	  r   r  r  r   rM	  r>	  r  r   r&  r)  r   r!  r'  r  r   rK  r8   r  r   rz  r  r2  modulefxGraphModuler=   r  r  r.  rL  ri  r  r  r  r  r   r  r  r%  rg  r#  rj  r  r   rT  r  )&r  r	  r	  r	  r	  r	  r	  rj	  fx_carried_inputsfx_additional_inputsfx_all_inputsfake_all_inputsfake_carried_inputsfake_additional_inputscarried_inputs_additional_inputs_r  cond_outputsbody_outputsr  r  r   rv  bor	  r  
while_loopmutated_idxsmutated_idx_setr  mutated_inputs_iterall_outputsr   rt  r	  mutated_inputr  r  r   )r	  r  r   r  U'  s  






"$


zWhileLoop.creater  rw   c                 C  s.   | | | j ||  | jt| di  d S r  )codegen_while_loopr	  r  r  r  r   r  r   r   r   r  /(  s   zWhileLoop.codegenr   c                 C  r  r  r  r  r   r   r   r  5(  r  z"WhileLoop.get_unbacked_symbol_defs)r	  r  r	  r  r	  r7	  r	  r7	  rg  r2  r  r  r	  r   r   r   rM  )r	  r  r   r  )r  r   r   r   )r	  r7	  r	  r7	  r	  r  r	  r  r	  r   r   r  r  r  )r   r   r   rk  r	  r   r	  r	  r	  r  r  r  rn  r	  r	  r  r  r  r  r  r   r   r   r   r	  &  s$   
 

 
Zr	  c                      s@   e Zd Z	dddd fddZd fddZdddZ  ZS )r   Nr  rg  r6  r  r   rU  r  r  r6  r?  r  r   r  r  r  r   r   c          
   	     sb   t  j|||||d |d ddlm} ||}	|	d usJ |	| _tjj|	d | _	| tjj|	< d S )Nr  r   )_get_effect)
r  r  torch._higher_order_ops.effectsr	  effect_typerr   r  effectful_opsr  prev_effect_buffer)
r  rg  r  rU  r  r?  r   r  r	  r	  r   r   r   r  A(  s   
zEffectfulKernel.__init__r5  c                   s0   t   }| jd ur|jt| j  |S r   )r  r6  r	  r:  r  rF   r  r  )r  rL  r   r   r   r6  ^(  s   

zEffectfulKernel.get_read_writesr   c                 C  r  r  r   r  r   r   r   rQ  h(  r  z EffectfulKernel.has_side_effectsr   r	  rd  rR  )r   r   r   r  r6  rQ  r  r   r   r   r   r   @(  s    	
r   c                   @  s"   e Zd Zed 	d	d
ddZdS )rj  Fr  r   r   r   c                 C  r  r   r<   r=  r   r   r   r>  m(  r  z!NonTensorObj.get_free_symbol_usesNr_  rh  )r   r   r   r]   r>  r   r   r   r   rj  l(  s    rj  c                   @  sR   e Zd ZU ded< ded< dddZddddZdddZdddZdddZdS )r  r   r   %FakeScriptObject | torch.ScriptObjectr-  r   c                 C  r  r   r   r  r   r   r   r  y(  r  zTorchBindObject.get_nameNr
  r  c                 C  r  r   r   r  r   r   r   r  |(  r  z!TorchBindObject.codegen_referencec                 C  r  r   r0  r  r   r   r   rJ  (  r  zTorchBindObject.get_valuetorch.ScriptObjectc                 C  s   t | jtjr
| jS | jjS r   )r   r-  r)  r  real_objr  r   r   r   get_real_obj(  s   zTorchBindObject.get_real_objr   c                 C  sZ   |   }t|r
dS t|dsJ t| }t|d }dd |D }tt	j
|dS )Nr   __obj_flatten__c                 S  s(   g | ]}t |tjr| |  qS r   )r   r)  rZ  r  numelrt  r   r   r   r   (  s    
z1TorchBindObject.get_buf_bytes.<locals>.<listcomp>)r	  r)   r  r   r	  r8  rB  rW  ru  operatorr  )r  real_script_obj	flat_dict
flat_elems
flat_sizesr   r   r   get_buf_bytes(  s   zTorchBindObject.get_buf_bytesr\  r   rV  )r   r	  )r   r	  rf  )	r   r   r   r   r  r  rJ  r	  r	  r   r   r   r   r  t(  s   
 


r  c                   @  s0   e Zd ZU dZded< dddZddddZdS )OpaqueValueTypeConstanta  IR node for opaque value type constants that appear directly in graph outputs.

    Unlike TorchBindObject (which references named constants loaded at runtime),
    this inlines the value's repr into the generated code since value types are
    reconstructed from their repr.
    r   r-  r   r   c                 C  rB  r   )rE  r-  r  r   r   r   r  (  r  z OpaqueValueTypeConstant.get_nameNr
  r  c                 C  s0   t | j\}}| D ]
\}}|tjj|< q|S r   )r(   r-  r  rr   r  opaque_value_type_classes)r  r
  obj_repropaque_typesr  r,  r   r   r   r  (  s   z)OpaqueValueTypeConstant.codegen_referencer\  r   rV  r   r   r   rk  r   r  r  r   r   r   r   r	  (  s
   
 
r	  c                   @  s4   e Zd ZU ded< ded< dddZddddZdS )rD  r   r   r  r  r   c                 C  r  r   r   r  r   r   r   r  (  r  zGeneratorState.get_nameNr
  r  c                 C  r  r   r   r  r   r   r   r  (  r  z GeneratorState.codegen_referencer\  r   rV  )r   r   r   r   r  r  r   r   r   r   rD  (  s
   
 
rD  c                   @  s8   e Zd ZU dZded< ded< dddZddddZd	S )rG  z
    Represents an opaque object (e.g., ProcessGroup) that is passed through
    as a graph input. Similar to GeneratorState, this wraps the object with
    its placeholder name so codegen can reference it properly.
    r   r   r   r-  r   c                 C  r  r   r   r  r   r   r   r  (  r  zOpaqueObjectState.get_nameNr
  r  c                 C  r  r   r   r  r   r   r   r  (  r  z#OpaqueObjectState.codegen_referencer\  r   rV  r	  r   r   r   r   rG  (  s   
 
rG  c                   @  sH   e Zd ZdddZdddZddddZedddZedddZdS )_CollectiveKernelr   r   c                 C  r  r  r   r  r   r   r   r  (  r  z!_CollectiveKernel.should_allocatec                 C  r  r  r   r  r   r   r   rQ  (  r  z"_CollectiveKernel.has_side_effectsNr  r3  r   c                 C  sR   t | jtjju sJ d| j}|d ur|| _n|jj| _dd |jjD | _	d S )Nz,Setting cpp kernel needs a valid op_overloadc                 S  r  r   r  rt  r   r   r   r   (  r	  z9_CollectiveKernel.set_cpp_kernel_name.<locals>.<listcomp>)
r   r  r)  r
  r  r  r  r   r  r  )r  r  r  r   r   r   r  (  s   
z%_CollectiveKernel.set_cpp_kernel_namer  r   r~  IRNode | list[IRNode]r   r   r   c                   s$  t jj | j||g|R i |\}}}}}	W d    n1 s!w   Y  |	r1J | d|	 |D ]}
|
  t j|
  q3|d   | t d||||t	
|}j fdd|D  jdd |D  d|v rjtt d|d  j|d   d S d S )Nr  r   r  c                   r"  r#  r$  r&  r  r  r   r   r   
)  r  z4_CollectiveKernel.create_inplace.<locals>.<listcomp>c                 S  r  r   r  r	  r   r   r   r   )  r  r  )rr   r  r#  ra  r	  r  r  r'  r  r8  tree_leavesr.  r   r  r  r%  )r  r  r~  r   r   _example_outputrU  rV  r?  r  
tensor_arginpsr   r	  r   create_inplace(  s@   

z _CollectiveKernel.create_inplaceTensorBox | list[TensorBox]%list[MultiOutput] | _CollectiveKernelc                   sJ  t jj  j||g|R i |\}}}}}	W d    n1 s!w   Y  |	r1J | d|	 |D ]}
t|
ts>|
  q3t|tr ||}|d usPJ  t	|d|||| fddt
|D _tj|D ]\}}tjsyt|st jj|j qnjS   |||||tjst|st jjj g_S )Nrt  r  c                   s(   g | ]\}}t  |t|fgqS r   )rL  r]  r   )r   r   r]  r  r  r   r   r   K)  s    z9_CollectiveKernel.create_out_of_place.<locals>.<listcomp>)rr   r  r#  ra  r   r  r	  r   r  r2  r  r  r   rE   r  ro   rW  r  r   r]  )r  r  r~  r   r   r\  rU  rV  r?  r  r	  r  rO  r]  r   r	  r   create_out_of_place,)  s^   


z%_CollectiveKernel.create_out_of_placerR  r   r  )
r  r   r~  r	  r   r   r   r   r   r   )
r  r   r~  r	  r   r   r   r   r   r	  )	r   r   r   r  rQ  r  r  r	  r	  r   r   r   r   r	  (  s    

Ar	  c                      2   e Zd Z	dddd fddZdddZ  ZS )_AllReduce_KernelNr  rg  r6  r  r   rU  r  r  r6  r?  r  r   r  r  r  r   r   c             	     (   t  j|||||d |d | d d S )Nr  +aoti_torch_cpu__c10d_functional_all_reduce_r  r  r  r	  r   r   r   r  j)     	z_AllReduce_Kernel.__init__r  rw   c                 C  2   | d ||  t| jtr| | d S d S Nz+torch/csrc/inductor/aoti_torch/c/shim_cpu.hinclude_extra_headerr  r   rg  r  r  r  r   r   r   r  )  
   

z_AllReduce_Kernel.codegenr   r	  r  r  r   r   r   r   r	  i)      	r	  c                      r	  )_AllReduceKernelNr  rg  r6  r  r   rU  r  r  r6  r?  r  r   r  r  r  r   r   c             	     r	  )Nr  *aoti_torch_cpu__c10d_functional_all_reducer	  r	  r   r   r   r  )  r	  z_AllReduceKernel.__init__r  rw   c                 C  r	  r	  r	  r  r   r   r   r  )  r	  z_AllReduceKernel.codegenr   r	  r  r  r   r   r   r   r	  )  r	  r	  c                      sX   e Zd Z	d"ddd# fddZd$ddZd%ddZed&ddZd' fd d!Z  Z	S )(_WaitKernelNr  rg  r6  r  r   rU  r  r  r6  r?  r  r   r  r  r  r   r   c             	     r	  )Nr  +aoti_torch_cpu__c10d_functional_wait_tensorr	  r	  r   r   r   r  )  r	  z_WaitKernel.__init__r  rw   c                 C  r	  r	  r	  r  r   r   r   r  )  r	  z_WaitKernel.codegenc                 C  s   | j d }t|tsJ t|tr$|j d }t|ts!J t||gS t|trB|j d }t|tr@|jd \}}|j | gS g S g S r  )r~  r   r   r	  r   rL  rM  )r  r  r   collr
  r   r   r   r   get_volatile_reads)  s   





z_WaitKernel.get_volatile_readsr  r   c           	      C  s   t jj | ||\}}}}}W d    n1 sw   Y  |r*J | d| | t| d||||}|jtt| d|| d S )Nr  r  )	rr   r  r#  ra  r  r'  r.  r  r%  )	r  r  r  r	  rU  rV  r?  r  r  r   r   r   create_wait)  s(   

z_WaitKernel.create_waitr5  c                   s6   t   }|  }|D ]}|jt|  q|S r   )r  r6  r	  r:  r  rF   r  r  )r  rL  volatile_readsvrr   r   r   r6  )  s
   
z_WaitKernel.get_read_writesr   r	  r  r  )r  r   r  r   r   r   rd  )
r   r   r   r  r  r	  r  r	  r6  r  r   r   r   r   r	  )  s    	

r	  r   r  c                 C  d   t | ttfrt| S t | ttfr%ttj  }| D ]}|t	|O }q|S t | t
jr/t| S t S r   )r   r:   r   r3   r   r   r=   r   r    r  r)  rZ  r   r  r,  r   r   r   r   *     r  c                 C  r	  r   )r   r:   r   r2   r   r   r=   r   r    r  r)  rZ  r	  r   r   r   r  *  r	  r  r  r   r  torch.fx.Nodec                 C  s   t | trht | jtrjt | jjtr| jjd| d S t | jjtrl| jjd| t | jjtrEt | jjjtrE| jjjd| d S t | jjtrn| jjj	spt | jjj
d trr| jjj
d d| d S d S d S d S d S d S d S )Nr  r   )r   r   rf  r  r  r  r  r  rL  rM  r~  )r  r  r   r   r   assign_origin_node *  s&   	
r	  )r   r   r   r   )r   r   r   r   )r   r   r   r   )r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r_  )r   r   r  r   r   r   )r   r   r  r   r   r  )r   r  r  r   r   r  )r-  r.  r   r/  )r   r2  r   r3  )r   r8  r   r   )r   rD  rE  r   r   r   )rT  rU  rV  rU  rW  rU  r   r   )r]  r   r^  r_  r   r   )rr  rs  r   r   )r~  r  r   r  )r   r  r  r  r   rq   rN  )r  r   r  r  r  r   r   r  )r  r   r  r  rj  r   r   r  r~  )TFNFN)r   r   rE  r   rL  r   rM  rN  r*  r   r2  rN  r   rO  )r   r   rM  r   r   r   rG  )r  rU  rW  rU  r   r   )r  r  r   r   )r   r  r   r  )r	  rb  r   r   )r9	  r  r   r   )r   r  r   r}	  )r   r   r   r  )r  r   r  r	  r   r   (\  
__future__r   ro  r  rl  rW  rv  loggingr	  textwrapr  collections.abcr   r   r   r   r   r   r	   enumr
   r   typingr   r   r   r   r   r   r   r   r   r   r   typing_extensionsr   r   r   r   r   r   unittest.mockr   r   r   r   r    torch._export.serde.schema_exportserder  r  r  r  rz  r  torch._loggingr)  torch.fxtorch.utils._pytree_pytreer8  torch._dynamo.utilsr!   torch._export.serde.serializer"   *torch._higher_order_ops.auto_functionalizer#   torch._inductorr$   r  r&   "torch._library.fake_class_registryr'   torch._library.opaque_objectr(   r)   torch._prims_commonr*   r+   r,   r-   r.   r/   %torch.fx.experimental.symbolic_shapesr0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   torch.fx.noder;   torch.utils._ordered_setr=   torch.utils._python_dispatchr>   torch.utils._sympy.functionsr?   r@   rA   rB   torch.utils._sympy.symbolrC   r  rE   rF   codegen.commonrG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   	loop_bodyrR   ops_handlerrS   rT   rU   rV   runtime.benchmarkingrW   runtime.hintsrX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   virtualizedrp   rq   rr   rs   rt   torch.typesru   codegen.cutlass.templaterv   codegen.wrapperrw   r  rx   ry   r   r   r=  __version__r3  r2  ImportErrorrz   r{   r|   r}   r   r~   r  r   r
  r  r+  r   	getLoggerr   r\  r  r  r   r   r   r   r  r   r   r   r   r   r   r]  r_  r   r  r  r  r1  r7  rB  rC  rS  r\  rq  r}  r  r  r   rr  r  r  r  r  r  r  r  r  r  r  r  r  r  r4  r7  r"  rK  rh  rT  rU  r  r   r  r  r  r  rl  r  r  r  r  r	  r  r  r6  r  ri  r$  ru  r  r  r  r  r  r  r  r  r  r  r   r  r  r  rh  r   r   PrimitiveInfoTyperp  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r%  r  r  r  r  r  rC  rK  rO  rR  rU  rZ  r`  rf  rm  rw  r   r   r  r\  r	  r	  r2  rL  r   r	  r  r  r  r   r  r7	  r>	  r?	  rT	  rZ	  r	  r   rj  r  r	  rD  rG  r	  r	  r	  r	  r  r  r	  r   r   r   r   <module>   s   4  4 `
*

#	

"
'
	
  XX 
(#?      / :& d K <
a^0D$   V)o  % _$"Y 	
   
:	 a,?O_]U O       O%+!@(?  \-29,</5?    { L#k q	  i  E,% Y