o
    j9:ja                    @  s8  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlmZ d dlmZmZmZmZ d dlmZmZmZmZmZ d dlmZ d dlmZ d dlZd dlZd dlZd dl Zd dl!m"  m#Z$ d dl%m&Z& d d	l'm(Z( d d
l)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z< d dl=m>Z>m?Z? d dl@mAZAmBZBmCZC d dlDmEZE d dlFmGZGmHZHmIZImJZJmKZK ddlLmMZM ddlNmOZOmPZPmQZQmRZR ddlSmTZTmUZU ddlQmVZVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZe ddl"mfZfmgZgmhZhmiZimjZjmkZkmlZlmmZmmnZnmoZompZpmqZq ddlrmsZsmtZt erCddlumvZv edZwedZxeEddgZyeze{Z|i Z}d e~d!< i Zd"e~d#< i Zd$e~d%< eEejj  ZejsjZejsjZejsjZeEejj  ZeEejj ejjgZeEejj  Zi Zd&e~d'< ejsjZd,d*d+Zd-d/d0Zd.d4d5Zd/d9d:Zd0d?d@Zd1dCdDZd2dHdIZeejejejejejejejejejejejejejejg ejejejejejejejejejejejejejdJZd3dNdOZd4dSdTZd5dVdWZdXdYd6d^d_Zd`da Zd7dedfZd8djdkZd9dsdtZd:dxdyZd;d{d|ZdXe7jdXe}fd<d~dZdd Zd=ddZdd Z				X	X	d>ddZd?ddZdd Z	d@dAddZeejjdddd ZeejdddBddZdXddBddZeejjdddBddZdXdXddCddZeejdddDdCddZdde7jdXdddXdXdf	ddZende7jdd eejddddEddZdd ZeЃ  	X	d?ddZeejdXdddd ZeejdXdddd Zeejejejejejgdd Zeedr/eejۃeك eejdddFddZeejdddFddZeejgdFddZeej߃dd Zeejdd Zeejdd Zeejdd ZeejjddÄ Zeejddń ZeejddddǄ ZeejddddɄ Zeejdddd˄ Zeejdd̈́ ZeejddeejddeejdddGddфZeejddddӄ Zeejddd d e
jddfddՄZeejdddFddׄZeejdddFddلZeejdddFddۄZdHdd݄ZeejdddIddZdd Zeejjdd Zeejjdd ZeejdddddJddZeejjdddKddZeejjdddddLddZeejjdddMddZ eejjdddddNddZeejdHddZeejdddOdPddZeejdddOdPddZeejdddOdPddZeejdddd	 ZeejdddHd
dZeejdddHddZeej	dddHddZ	eej
dddd Z
eejdddd Zeejdddd ZdHddZeejdQddZdRddZejdd ZdFdSd!d"ZdFdSd#d$ZdRdTd'd(Z			X	dUd)d*Zd+d, Zeejsjjddd-d. Zeejddd/d0 Zeejddd1d2 Zeejjddd3d4 Zeejd5d6 Zedd7d8 Z d9d: Z!eej"jZ#eej"j$Z%eej&jZ'eej&j$Z(eej) eej*dd; eej+dd; eej,dd; eejsj-j.j eejsj-j/j eejsj-j0j eejsj-j1j eej"d<d= Z"eej&d>d? Z&eePj2ddd@dA Z3eePj4dddVdBdCZ5eePj6dddDdE Z7eePj8dddFdG Z9dVdHdIZ:eePj;ddd ejdJdWdPdQZ<eePj= eePj> eePj)ddd dRdXdUdVZ?dYdYdZZ@dZd\d]ZAeejBjCdddXdXddd^d[dgdhZBeejDjCe7jEddXdXdid\dkdlZDdmdn ZFdodp ZGdqdr ZHdsdt ZIdudv ZJdwdx ZKdydz ZLd{d| ZMeejjeM d}d~ ZNeejO eejP eejQ eejRdXd eejSjdXd eejTeH eejUdXd eejVdXd ejWX reejYdXd ejZX reej[dXd eej\ eej] eej^ eej_j eej`j eeja eejbjc eejdj eejej eejf eejgdXd eejheG eejeM eejieG eejjeH eejkeG eejleH eejmeH eejneH eejo eejp eejp eejq eejr eejs eejteG eeju eejv eejw eejx eejy eejz eej{ eej| eej}eH eej~ eej eejeH eej eej eej eej eej eej eej eej eej eej eej eej eej eej eej eej eej eej eej eej eej eejj eej eej eej eej eej eej eej eejj eejjdXd eejeG eejjj eejjj eejjj eejjj eej eej eejeH eej eej eej eej eej eejj eejjeNdXd eejjeNdXd eejjeNdXd eejjdXd eejjeNdXd eejjeNdXd eejjeNdXd eejjeNdXd eejjeNdXd eejjeNdXd eejjeNdXd eejjeN eejj eejjeN eejjeN eejjeN eej eejjCdd; eejjeH eejdddDddZeejŃddddZŐdd ZeedreejǃeŃ eejȃdd Zeejɐjdd	d]dddddXdddZeejddd^ddZeejddd_ddZ͐dd ZeejejejgddddXdddZeejЃd=ddZeejуdd ZeejӃdd Zeejԃdd ZeejՃ	d`dddddZՐdd Z֐dd Zאdd ZeejejgdddddddddZِdd Zڐdd Zeej܃eڐeكZܐeڐedZݐeڐed Zސdd ZeejdddddddZeejdddddddZeejdddddddZeejjdd Zeejejgdd ZeejdddDddZeejdddaddZdd Z	dRddÄZdĐdń ZdRdƐdǄZeejdddȐd Zeejdddɐdʄ ZeejdddDdːd̄ZeejdDd͐d΄ZdϐdЄ Zdѐd҄ ZeejdddDdӐdԄZeePjdddDdՐdքZdDdאd؄ZeejjdXdٍZeejjdXdٍZeejdddڐdۄ Zeejdddܐd݄ Zedސd߄ ZeejdddFddZeejdddbddZddddcddZeejddddddddZeejdddbddZeejdddbddZeejdddbddZeej dddddeddZ 		XdfdgddZeejjdFdhddZeejjdFdhddZeejj	d=diddZeejj	d=diddZeejj			d`djddZeejj			d`djd	d
Zdd Zeejjdd ZdkddZdkddZ	eej
dddHddZ
dlddZdmddZdndd Z	dod"d#Zdd$d%d&Zd'd( Zdd)d*d+Zd,d- Zeejdd	XdDd.d/Zdpd6d7Zeejddd8d9 Zd:d; Zeejdd		 		Xdqd<d=Zeejdd		 		Xdqd>d?ZeejjdXdٍZeejddd@dA ZdrdCdDZdEdF ZdGdH ZdIdJ ZeejjdXdٍZeejdKdL ZeejjdXdٍZeejdMdN ZdOdP Z eej!dQdR Z!eej"dSdT Z"dUdV Z#eej$j	dsdWdXZ$eejdd	Y	 	X		dtdZd[Zeej%dd	Y	 	X		dtd\d]Z%eej&jdXdٍeejjdXdٍeej%jdXdٍgZ'd^d_ Z(eejjdXdٍZ)eejdd	dFd`daZeej*jdXdٍZ+eej*dd	dFdbdcZ*ddde Z,ddfdgdhZ-dFdudkdlZ.dmdn Z/eej0dvddodpdqZ0drds Z1dtdu Z2dvdw Z3dxdy Z4eej5ej5gdFddXdzd{d|Z6eej7dFddXdzd}d~Z7dd Z8edd Z9eej:j;dXdٍZ<eej:j=dXdٍZ>eej:j?dXdٍZ@eej:dddd Z:dDddZAeejBdd ZBeejCdddDddZCedd ZDedd ZEedd ZFeejGdddFddZHeejIgdddd ZIdwddZJeejGgdddd ZKeejLejGjCgde7jddd ZGeejMejMgdddd ZMeejNejNgdvddoddZOeejPjZQeejRjZSeejTjZUeejVjZWeejXjZYeejPd=ddZPeejRd=ddZReejTdd ZTeejVdddFddZVeejXdddFddZXeejZdvddoddZZeej[dvddZ\eej]dddvddZ^eej_dddvddZ`eejae.d eejbe.dZceejde.dZeeejfe.dejdZgeejhe.dejdZieejjddddZjeejkjldXdٍZmeejkjldddddXdddZneejkjdddxddZkeejojdXdٍZpeejqjdXdٍZreejsjdXdٍZteejsjudXdٍZveejwjdXdٍZxeejsjdddd ZyeejsjudddDdÐdĄZzeejwjdddxdŐdƄZ{eejojdddydǐdȄZoeejqjdddxdɐdʄZqd=dːd̄Z|dzdΐdτZ}e|ej~Z~e}ejZe|ejZe|ejZeejZe}ejZe}ejZeejZeejddЍZeejddddќdҐdӄZeejddddќdԐdՄZeejj=edd֐d׍Zeejj=edd֐d׍Ze}ej e}ej eejZeejZeejZeejdؐdٍZeejZeejZeejZe|ej e|ejZeeje7jde e|ej e|ej e|ej e}ej| eejddejdڍZeejddejdڍZeejddejdڍZeejddejdڍZeejZeejZeeje eeje eejZeejZe|ejZeej eejdېdٍZeej eejejd eeje eejejd eejejd eejejd eejejdZeejejd eejejd e|ej e|ej e|ej e|ej e|ej e|ej e|ej e|ej e|ej e|ej e|ej e|ej e|ej e|ej e|ej e|ej ddܐlmZmZ dݐdބ ZÐeD ]@ZĐeeeăD ]\ZŐZƐZe̐eŐeĐeƐeǐdߍ qeeeăD ]\ZŐZƐZe̐eŐeĐeƐeǐdߍ q%qeejȐjɐejddЍZeejȐj=ejddЍZeejȐjCejddЍ eej̐jɐeIZeej̐jCeI eej̐j=eIZeejϐjɐe eejϐj=e eejje eejje eejҐj=e: eejҐjɐe: eejҐjӐe: eejԐjɐeGZeejԐjCeG eejԐj=eGZeejאe eejؐe~ eejِjɐe eejِj=e eejڐjɐe eejڐj=e eejېjɐe eejېj=e eejܐjɐe eejܐj=e eejݐe eejސe eejߐeŃ eejeăZdd ZeejjejȐjɐeʃ eejj=ejȐj=e˃ eejjej̐jɐe̓ eejj=ej̐j=e΃ eejjejԐjɐeՃ eejj=ejԐj=eփ eejjejje eejj=ejj=e eejj=ejj=e dd Zeejej eeje eeje eeje eeje eeje eeje eejeI eejjCeG eejjeH eeje eeje eeje eeje eeje eeje eeje eeje eeje eeje eeje eeje eej ej eejej eejej eejej eejej eejd=ddZeejjdd Zeejjdd Zeej	dd Z	e>
 D ]\ZZee?ee qCeejdd Zeejdd Zeejsjjdd Zeejsjjdd Zeejsjjjdd ZeejsjdreejsjjCjdd ZeejsjjddddZd dlmZ ee ee*dd Zeejsjjddd{dd;ZeejsjjdddDdd Zeejsjj ddej!edd eejsjj"ddd|ddZ"e# Z$d}d	d
Z%d dl&m'Z' ee'dddd Z(eejj)ddddd~ddZ*ee(dddddZ+eejsjj,jdd Z,eejsjj-jdd Z-eejsjj.dddd Z.ddl/m0Z0m1Z1 e0  e1  eePj2dddd Z2dd  Z3eePj4ddd!d" Z5eejj6ddddd#d$d%Z7dd&lNm8Z8 eMe8 dd'lNm9Z9 e9:  e9;  dd(lNm<Z< e<=  dd)lNm>Z> e>?  ej@dd*d+ZAdS (      )annotationsN)defaultdict)Callable
CollectionIterableSequence)AnycastTYPE_CHECKING	TypeGuardTypeVar)	ParamSpec)patch)counters)associative_scan_op)triton_kernel_wrapper_mutation)FakeScriptObject)is_opaque_value)get_layout_constraint_tag)canonicalize_dimcanonicalize_dimscheckdtype_to_typeelementwise_dtypesELEMENTWISE_TYPE_PROMOTION_KINDget_computation_dtypeis_boolean_dtypeis_float_dtypeis_integer_dtypeNumber)magic_methodsmethod_to_operator)free_unbacked_symbolshas_free_unbacked_symbolsresolve_unbacked_bindings)
OrderedSet)CeilDivFloorDivIdentityModModularIndexing   )import_submodule   )configinductor_primsirtest_operators)decompositionsget_decompositions)BaseView	DtypeView
ExpandViewIndexingConstantIRNode	is_triton
MutableBoxOnlineSoftmaxReductionops_wrapperPermuteView	Pointwise	ReductionSqueezeView	TensorBoxvalidate_irView)ceildivdecode_device
is_dynamicis_gpuis_pointwise_useis_view,needs_fallback_due_to_atomic_add_limitationspad_listlike#register_op_dtype_propagation_rules#register_op_requires_libdevice_fp64sympy_productuse_scatter_fallback)opsV)ReductionType_T_Pztorchvision::roi_alignzaten::index_add2dict[Callable[..., Any] | str, Callable[..., Any]]	loweringsz/dict[torch._ops.OpOverload, Callable[..., Any]]user_loweringsz6dict[torch._ops.OpOverload, Callable[..., Any] | None]_maybe_layout_constraintsz2dict[torch._ops.OpOverload, torch._ops.OpOverload]inplaceable_foreach_opsreturnboolc                  C  s<   t jjjD ]} | jD ]}|jdkr|jtv s  dS q
qdS )Ncall_functionTF)rQ   graphcurrent_nodeusersoptargetforeach_ops)nodeuser re   _/home/nk/hobo-godmode/plappi-mvp/.venv/lib/python3.10/site-packages/torch/_inductor/lowering.pycur_node_has_non_foreach_users   s   
rg   	arg_pairsIterable[Any]4defaultdict[tuple[Any, bool], list[tuple[int, Any]]]c                 C  s   t t}d}t| D ]A\}}t|tsd}|f}t|  ptj}d }|D ]}t|tr2|j	
 } nq$|d us;J d|r@|\}|||f ||f q
|S )NFTz.foreach op should have at least one tensor arg)r   list	enumerate
isinstancer   rF   r.   #combo_kernel_foreach_dynamic_shapesrA   data
get_deviceappend)rh   outunpack_argsiargsuse_foreachdevicetre   re   rf   group_foreach_args   s&   


ry   fnCallable[..., Any]Callable[..., Any] | Nonec                 C  s>   t | tjjs	dS t| dd }rt|S | tv rt|  S dS )zHGet layout constraints. Returns None if there are no layout constraints.NF)with_default)rm   torch_ops
OpOverloadr   tag_to_layout_constraintrX   )rz   maybe_layout_tagre   re   rf   maybe_layout_constraints   s   r   tagtorch._C.Tag%Callable[..., tuple[Any, Any]] | Nonec                 C  sV   | t jjjkr	tS | t jjjkrtS | t jjjkrtS | t jjj	kr$d S t
d|  )NzUnknown layout constraint tag: )r~   _CTagneeds_exact_stridesconstrain_to_fake_tensorsneeds_contiguous_stridesrequire_contiguous_stridesneeds_fixed_stride_orderconstrain_to_fx_stridesflexible_layoutAssertionError)r   re   re   rf   r      s   r   condmsgstrNonec                 C  s   | s	t d| d S )Nzinductor does not support NotImplementedErrorr   r   re   re   rf   
assert_nyi   s   r   uCollection[torch._ops.OpOverload | torch._ops.OpOverloadPacket] | torch._ops.OpOverload | torch._ops.OpOverloadPacketlist[Any] | Nonec                   sj   t  ttttfrdd  D S t  tjjrt	  d S t  tjj
r3t fdd  D  d S )Nc                 S  s   g | ]}t |qS re   )add_needs_realized_inputs.0xre   re   rf   
<listcomp>       z-add_needs_realized_inputs.<locals>.<listcomp>c                 3  s    | ]}t  |V  qd S N)getattr)r   overloadrz   re   rf   	<genexpr>   s    

z,add_needs_realized_inputs.<locals>.<genexpr>)rm   rk   settupler%   r~   r   r   needs_realized_inputsaddOpOverloadPacketupdate	overloadsr   re   r   rf   r      s   
r   3torch._ops.OpOverloadPacket | torch._ops.OpOverload
constraintCallable[..., tuple[Any, Any]]c                 C  s:   t | tjjr|  D ]	}|tt| |< qd S |t| < d S r   )rm   r~   r   r   r   rX   r   )rz   r   r   re   re   rf   add_layout_constraint   s
   r   )r   r-   r+                     	   
         dtypeint | torch.dtypetorch.dtypec                 C  s2   t | ts| S | tv sJ d|  dt|  } | S )Nzid z missing from DTYPE_ID_LOOKUP)rm   intDTYPE_ID_LOOKUPr   re   re   rf   decode_dtype  s
   
r   r   r   'TypeGuard[TensorBox | sympy.Expr | int]c                 C  sB   t | trt|  pt|  S t | tjr| jdu S t | tS NT)	rm   rA   r   	get_dtyper   sympyExpr
is_integerr   r   re   re   rf   is_integer_type  s
   


r   TypeGuard[TensorBox | bool]c                 C  s    t | trt|  S t | tS r   )rm   rA   r   r   r[   r   re   re   rf   is_boolean_type&  s   

r   F)return_compute_dtyperu   type_promotion_kindr   r   c                   s:   d	dd  fdd|D }t |d| i\}}|r|S |S )
Ninpr   rZ   c                 S  s8   t | ttjfr
| S t|  }tjdg| |  dS )Nr-   r   )	rm   r   r   Basiclenget_sizer~   zerosr   )r   dimre   re   rf   construct_input2  s   z+get_promoted_dtype.<locals>.construct_inputc                      g | ]} |qS re   re   )r   argr   re   rf   r   :  r   z&get_promoted_dtype.<locals>.<listcomp>r   )r   r   rZ   r   )r   )r   r   ru   inpscompute_dtyperesult_dtypere   r   rf   get_promoted_dtype-  s   

r   c                 C  sh   t | ttfs| g} nt| } t| D ]}t |tjjr1| D ]}t||}|tvr0| 	| q q| S r   )
rm   rk   r   r~   r   r   r   r   rV   rq   )aten_fnrz   r   other_fnre   re   rf   get_overloadsA  s   

r   r`   9Any | torch._ops.OpOverloadPacket | torch._ops.OpOverload	namespacec                 C  s6   t | tjjr|| jv S t | tjjr||  v S dS NF)rm   r~   r   r   _qualified_op_namer   name)r`   r   re   re   rf   in_namespaceQ  s
   
r   rA   rw   torch.devicec                 C  s   t | jtjrt|  r| S tjj	|  }| 
 }|durH|jdkrH||krHt|dks;t|dkrH|d dkrHtttj| |dS | S )zB
    Copy cpu scalar if doesn't not match with given `device`
    Ncpur   r-   F)rm   ro   r0   ReinterpretViewr#   r   rQ   r]   sizevarsguarding_hints_or_throwrp   typer   rA   
StorageBox
DeviceCopycreate)r   rw   size
cur_devicere   re   rf   maybe_copy_cpu_scalar[  s   
$r   	list[Any]kwargsdict[str, Any]	broadcast&ELEMENTWISE_TYPE_PROMOTION_KIND | Noneconvert_input_to_bool tuple[list[Any], dict[str, Any]]c                   s  dd t  D }dd  D }|s|s fS |s|r|r$tjndd  D }|dd  D  t|d|i|rF |d  n|d   |D ]}t |  |< qP|D ]}	t|	 |	< q^dfddfdd D  fdd D |rt	t
t fdd|D fdd|D  }
t
|
d  }t||
dt| D ]\}}| |< qt||
t|d D ]\}	}||	< qtt D ]}t | tjrt | | |< q׈D ]}	t|	 tjrt|	 ||	< q fS )zB
    Transforms arguments for broadcasting and type promotion
    c                 S     g | ]\}}t |tr|qS re   rm   rA   r   rt   r   re   re   rf   r   z      z"transform_args.<locals>.<listcomp>c                 S  r   re   r   r   kvre   re   rf   r   {  r   c                 S  s*   g | ]}t |ttjfst|d r|qS r   )rm   r   r   r   hasattrr   are   re   rf   r     s    c                 s  s    | ]
}t |d r|V  qdS )r   N)r   r   re   re   rf   r         z!transform_args.<locals>.<genexpr>r   r   r   r   rZ   c                   s6   t | tr
t| S t | tjrtj| j dS | S )Nvaluer   rw   )rm   rA   to_dtyper0   Constantr  )r   )rw   r   re   rf   promote  s
   

ztransform_args.<locals>.promotec                   r   re   re   r   r  re   rf   r     r   c                   s   i | ]	\}}| |qS re   re   r   r  re   rf   
<dictcomp>      z"transform_args.<locals>.<dictcomp>c                 3      | ]} | V  qd S r   re   r   rt   ru   re   rf   r         c                 3  r  r   re   r   r   r   re   rf   r     r  N)r   r   rZ   r   )rl   itemsr~   r[   extendvaluesr   rp   r   broadcast_tensorsrk   	itertoolschainr   zipr   rangerm   r0   r  r6   r   )ru   r   r   r   r   args_indiceskwargs_indicespromoting_argsrt   r   broadcastedr   r   re   )ru   rw   r   r   r  rf   transform_argso  sb   

r  r   torch._ops.OpOverload	decomp_fnc                   s@   t  d fdd}t| }t| tt|| |S )	a  
    Add a foreach lowering to lowerings dict.

    Arguments:
        aten_fn: torch.ops.aten.* fn we are lowering
        decomp_fn: alternate implementation on our IR
        broadcast: True to apply broadcasting to tensor inputs
        type_promotion_kind: kind of type promotion applied to tensor inputs, `None` means no type promotion
        convert_input_to_bool: some logical ops require inputs are converted to bool
    ru   r   r   rZ   c                    s    | i |}t | |S r   )rB   )ru   r   rr   r  re   rf   wrapped  s   z+_register_foreach_lowering.<locals>.wrappedN)ru   r   r   r   rZ   r   )	functoolswrapsr   rb   r   rV   dictfromkeys)r   r  r!  aten_fnsre   r   rf   _register_foreach_lowering  s   
r'  lowering_dictc                   s<   t  fdd}t  |t | |S )a  
    Add a lowering to lowerings dict

    Arguments:
        aten_fn: torch.ops.aten.* fn we are lowering
        decomp_fn: alternate implementation on our IR
        broadcast: True to apply broadcasting to tensor inputs
        type_promotion_kind: kind of type promotion applied to tensor inputs, `None` means no type promotion
        convert_input_to_bool: some logical ops require inputs are converted to bool
    c                    s   t | } t|}d}t| dkr!t| d t tfr!d}t | d } tdd  D s7tdd |D r7J dt| |\} }|rF| g} | i |}t| |S )	NFr-   r   Tc                 s  s"    | ]}|t v pt|d V  qdS )_c10d_functionalN)	fallbacksr   )r   rz   re   re   rf   r         
z6_register_lowering.<locals>.wrapped.<locals>.<genexpr>c                 s      | ]}|d kV  qdS )rr   Nre   r   re   re   rf   r     r  zout= ops aren't yet supported)	rk   r$  r   rm   r   allanyr  rB   )ru   r   unpackedrr   r   r   r   r  r   re   rf   r!    s$   
z#_register_lowering.<locals>.wrapped)r"  r#  r   r   r$  r%  )r   r  r   r   r   r(  r!  re   r0  rf   _register_lowering  s
   r1  .Callable[[Callable[_P, _T]], Callable[_P, _T]]c                 C  s   t jt| ||||dS )z+
    Shim to support decorator syntax.
    )r   r   r   r(  )r"  partialr1  )r   r   r   r   r(  re   re   rf   register_lowering  s   r4  c                 C  s   t |}| r
| |kr|S g }tjt| t|tjjdD ]A\}}tjj	
|r-|| qtjj	
|r:|| qtjj	|| tt|jtt|jk rX|| q|| qt t|S )z
    Broadcasting logic based on symbolic shapes.

    We give the shapes 0 and 1 concrete values, while all other shapes
    are symbolic sympy formulas.
    )	fillvalue)r   r  zip_longestreversedr   SOnerQ   r]   r   is_size_one_or_falserq   check_equalsr   expandfree_symbols)r  boutputr   yre   re   rf   broadcast_symbolic_shapes%  s   $ rA  c              
     s`  |d u s|d u sJ d|d u r|d u rt j}tdd | D s"| S tdd | D rC|p3t| d|ifdd  fdd	| D S td
d | D }| |tjkrdtj	tj
fv rdfdd}ndd }g }| D ]A}t|ttfr|ttj||| dt|  qlt|tjr|tt|| dt|  ql|| ql|S )NzEonly one of override_return_dtype or type_promotion_kind may be givenc                 s  s"    | ]}t |tjttfV  qd S r   )rm   r   r   r   floatr   re   re   rf   r   G       z$promote_constants.<locals>.<genexpr>c                 s  s"    | ]}t |tttjfV  qd S r   )rm   r   rB  r   r   r   re   re   rf   r   I  rC  r   c                   s4   t | tjrtj|  td dS tj|  td dS )Nindexr   rw   r  )rm   r   r   r0   r7   rE   r  r   r   re   rf   
const_funcP  s
   
z%promote_constants.<locals>.const_funcc                   r   re   re   r   )rF  re   rf   r   X  r   z%promote_constants.<locals>.<listcomp>c                 s  s&    | ]}t |tttjfr|V  qd S r   )rm   rA   r6   r0   r  r   re   re   rf   r   Y  s   $ c                   s   t j|  d S Nr   )r~   tensoritemr   )tensor_dtypere   rf   <lambda>a      z#promote_constants.<locals>.<lambda>c                 S     | S r   re   rJ  re   re   rf   rL  c      r  rD  )r   DEFAULTr.  r-  r   nextr   r~   r[   bfloat16float16rm   r   rB  rq   r6   r   r0   r  get_device_or_errorrk   r   r   r   r7   )inputsoverride_return_dtyper   ex_round_scalarrr   r   re   )rF  r   rK  rf   promote_constants?  s^   



	rY  c                   sL   t | |tjd|   |  fdd}tj|  ||  dS )zBCompute a + alpha * b using FMA for CUDA floating-point precision.r   c                   sD    | }| }t tjrt}nt}t|||S r   )rm   r   r   rP   
index_exprconstantfma)idxa_valb_val
alpha_expra_loaderalphab_loaderr   re   rf   inner_fn  s   z%_add_with_alpha_fma.<locals>.inner_fnrw   r   rf  ranges)r   r   rP  make_loaderr>   r   rp   r   )r  r>  rd  rf  re   rb  rf   _add_with_alpha_fma  s   	rj  c              	     s$   ddd fdd}|S )zcWraps a pointwise fn and returns a function representing the pointwise in
    the define-by-run IR.N)rd  rU  rA   c              	     s
  d urt dd D rrJ  S tr_| d ur^| dkr^rQtd trQd  }d  jrQtjj	sQ|d urQ|j
dkrQtd d | S ttd | d< n| d u seJ dd D d  pyd   dd  D ]!}t|tjstt| ksJ d	 d
 d
|  qtjtjftjd uottjdd d uotjjjd uotjjjddoˈ v  
f	dd}	sd }D ]}t| j
r| } nq|sd  }	p|}tj| |dS )Nc                 s  "    | ]}t |tot|V  qd S r   rm   r8   r9   r   r   re   re   rf   r     r+  z0make_pointwise.<locals>.inner.<locals>.<genexpr>r-   r   cudac                 S     g | ]}|  qS re   ri  r   re   re   rf   r     r   z1make_pointwise.<locals>.inner.<locals>.<listcomp>zndim mismatch  r^   low_precision_pointwise_barrierFc                   s   t  t 	ksJ d  d	 tjkr&d ur& fddD  S g }tD ]'\}}| }|  }rN|v rNtj||dd}t||}|| q,| }rhtj|dd}t|S |S )Nzwrong ndim rr  c                   s   g | ]}| qS re   re   )r   loadrE  re   rf   r     r   zCmake_pointwise.<locals>.inner.<locals>.inner_fn.<locals>.<listcomp>F)use_compute_types)r   r~   r[   rl   r   rP   r  rq   )rE  inputs_loaded	inp_indexrt  rr   	inp_dtypedowncast)	r   emulate_output_castemulate_precision_castsrz   rU  loaders	low_pr_fpoverride_fn_when_input_boolrh  ru  rf   rf    s    $z/make_pointwise.<locals>.inner.<locals>.inner_fnrg  )r.  rY  rm   r8   rp   r   is_floating_pointr~   versionhipr   rj  rk   mulr   r0   BaseConstantr   rR  rS  rQ   r]   r   r^   metagetrG   r>   r   )rd  rU  
inp_deviceotherrf  rw   rt   allow_alpharz   override_devicer  rV  triton_fallbackuse_fma_for_alpha)r   r{  r|  rU  r}  r~  rh  rf   inner  sr   


zmake_pointwise.<locals>.inner)rU  rA   re   )rz   rV  r  r  r  r  r  r  re   r  rf   make_pointwise  s    Zr  rd  c                   s   dddd fdd}|S )Nr-   )rd  r  rU  list[list[TensorBox]]c           	        s  t |}dkr|rt|d t tfs|  n	dkr| n|  ttjjjdkp3tjjj	t
v p3t }d }|D ]}t|t tfrE|} nq8|d usNJ dg }|D ]}t|t tfsf||gt|  qR|| qRtt| } fdd}t|t|||S )Nr  ro  r   z1at least one input must be a list to a foreach opc                   s    r| i iS |  S r   re   r  )r  pw_fnscalar_kwarg
scalar_valre   rf   apply_fn2  s   z7make_foreach_pointwise.<locals>.inner.<locals>.apply_fn)rk   rm   r   popr   rQ   r]   r^   r_   ra   inplace_foreach_opsrg   rq   ry   r  foreach_group_loop)	rd  r  rU  realize_outputsa_list_inputinputbroadcast_inputsgroupsr  r  r  r  )r  rf   r    s@   

z%make_foreach_pointwise.<locals>.inner)rU  r  re   )r  r  r  r  re   r  rf   make_foreach_pointwise  s   4r  c                 C  s   dg| }|   D ]6\\}}}g }|D ]#\}	}
||
}|||	< tj|tjr6|r6|r6|  ||  q|r?tj	| q	t
dd |D sKJ |S )aa  
    Common loop over grouped foreach arguments.

    Args:
        groups: Result of group_foreach_args - dict mapping (device, use_foreach) to groups
        num_outputs: Number of outputs to produce
        apply_fn: Function to apply to each set of args, returns the output
        realize_outputs: Whether to realize outputs for foreach fusion
    Nc                 s      | ]}|d uV  qd S r   re   r   re   re   rf   r   Y  r  z%foreach_group_loop.<locals>.<genexpr>)r  rQ   r]   has_featureBackendFeatureFOREACHrealizerq   get_operation_nameregister_operation_listr-  )r  num_outputsr  r  outputsrw   rv   groupoperation_list
output_indru   r?  re   re   rf   r  =  s(   

r  Tcopyrv  c                   s@   |    kr|rt| S | S  fdd}t| d| S )Nc                   s:   t j|  d}tjtjf}s |v rt | }|S )N)	src_dtyperv  )rP   r  r~   rR  rS  )r   resultr~  r   r  rv  re   rf   	_to_dtyped  s   zto_dtype.<locals>._to_dtyperV  )r   cloner  )r   r   r  rv  r  re   r  rf   r  ]  s
   r  rZ  c                 O  s   ddl m} |}| j}||tjd}t| |j|  W d   n1 s'w   Y  |j}|s3J t|}dgt	| }	|
 D ]0\\}
}}g }|D ]\}}||	|< tj|
tjri|ri|  ||  qL|rrtj| qBtdd |	D s~J |	S )aI  
    This lowers an invocation of foreach_map
    The way this works is that an arbitrary N-arg func is provided by the user, looped over by the
    polyfill with the same semantics as a foreach op (a loop applying an n-ary function to n args)
    and then traced into a subgraph by dynamo.
    This code allows us to inline the subgraph into the main graph lowering using the PontwiseSubgraphLowering.
    The graph outputs represent the vertically fused sequence of ops, and then register_operation_list
    below registers the buffers as horizontally fuseable in the scheduler.
    r-   )PointwiseSubgraphLowering)root_graph_loweringNc                 s  r  r   re   r   re   re   rf   r     r  z_foreach_map.<locals>.<genexpr>)subgraph_loweringr  graph_modulerQ   r]   set_graph_handlerrungraph_outputsry   r   r  r  r  r  r  rq   r  r  r-  )subgraphru   r   r  rU  gmpw_subgraphsub_outputsr  r  rw   rv   r  r  r  r?  re   re   rf   _foreach_mapv  s6   r  c                 C  s   |j s|  j r&|  rt| |d}tj||  |S ttj	j
dd| |S |  }tjtjf}tjo:||v p:||v  }t| |d|dS )Nr   Fadd_to_fallback_setT)r  rv  )
is_complexr   r   
empty_liker0   InplaceCopyFallbackr   fallback_handlerprimsconvert_element_typedefaultr~   rR  rS  r.   r|  r  )r   r   dstr  r~  rv  re   re   rf   _convert_element_type  s    r  r  c                C  sb   |   }||kr|rt| S | S dd }||}||}||kr)ttjj| |S tt| |S )Nc                 S  s,   | j r	t| jS | tjkrdS t| jS )Nr   )r  r~   finfobitsr[   iinfor   re   re   rf   _get_primitive_bitwidth  s
   
z1to_dtype_bitcast.<locals>._get_primitive_bitwidth)	r   r  r  atenviewr   rA   r5   r   )r   r   r  x_dtyper  src_bitsdst_bitsre   re   rf   to_dtype_bitcast  s   	r  c                 C  s8   |j s|  j rttjtjjj	j
| |S t| |S r   )r  r   rA   r   r0   ComplexViewr~   rP   r  r  r   r  r   r   re   re   rf   _view_dtype  s
   
r  r  non_blockingc                C  s:   t |}|  |kr|rt| S | S ttj| ||S r   )rE   rp   r  rA   r   r0   r   )r   rw   r  r  re   re   rf   	to_device  s   r  c                 C  s   t | |d|dS )NTr  )r  )r   rw   r  re   re   rf   _device_put     r  c
                 C  s|   |p| j }t|}
t||| |durt|}t|
|||||	d}
t| |||d|
}
tt|r<ttt|d|d|
 |
S )z3A pointwise function that maps ops.{name} to inputsN)rV  r  r  r  r  )r   r   r   )r   r   )__name__r<   rL   r  r4  r   r  r   )r   r   r   r   r   rV  r  r  r  r  rz   re   re   rf   register_pointwise  s@   

r  ldexp)r   rV  )r   r   nc                   s   t d |  }| }|j}|j o|tjk}|r)|r) fdd}t|| |S t| r0tjn|fdd}t|d| |S )Nr  c                   s
    | |S r   re   )r   r  )ldexp_fnre   rf   compute_ldexp'     
z%ldexp_lowering.<locals>.compute_ldexpc                   s0   t | }t d }t ||}t | |S )Ng       @)rP   r  r\  powr  )r   r  
n_out_typetwo
pow_result	out_dtypere   rf   compute_fallback/  s   z(ldexp_lowering.<locals>.compute_fallbackr  )r<   r   r  r~   r[   r  r   float32)r   r  r  n_dtype
x_is_floatn_is_intr  r  re   )r  r  rf   ldexp_lowering  s    r  c                    sx   d} t d  fdd} fdd}t|t|tjdgfdd}ttj|}tt| r:tt	t| d	d
| |S )z2A pointwise function that maps ops.frexp to inputsfrexpc                        | i |d S Nr   re   ru   r   r  re   rf   frexp0@     zregister_frexp.<locals>.frexp0c                    r  Nr-   re   r  r  re   rf   frexp1C  r  zregister_frexp.<locals>.frexp1r  c                    s$    d | i | d | i |fS Nr   r-   re   r  )pw_fnsre   rf   rz   K  s   $zregister_frexp.<locals>.fnNrZ  )
r<   r  r~   int32r4  r  r  r   r  r   )r   r  r  rz   re   )r  r  rf   register_frexp;  s*   
r  c                 C  s   t |||d}t| |}|S )Nr  r  )r  r'  )r   pointwise_lowering_fnr  r  rz   re   re   rf   register_foreach_pointwise]  s
   
r  c                   s  dd }t |ttfrt||}t |ttfrt||}| ||g t d  d tjd}dd t D }t|t	 fdd|D  D ]\}}| |< qFt
t D ]}t  | tjrqt | t |d	    |< qUt||d
 d	 t d |t d |S )Nc                  W  
   t j|  S r   )rP   wherer  re   re   rf   rz   l  r  zwhere.<locals>.fnr-   r+   rZ  c                 S  r   re   r   r   re   re   rf   r   x  r   zwhere.<locals>.<listcomp>c                      g | ]} | qS re   re   r  r  re   rf   r   y  r   r   r  )rm   rB  r   constant_liker   r   rP  rl   r  r  r  r   r0   r  r6   r   rk   r   r  r  )r   r  r>  rz   r   indicesrt   r   re   r  rf   r   j  s&   
$
$
r   c                  G  s   t | dkrt| d ttfrt| d  S | S ttdd | D d}g }| D ]+}t|  }|kr4nt |t |ksHt	dd t
||D rMt||}|| q'|S )Nr-   r   c                 s      | ]}|  V  qd S r   )r   r   re   re   rf   r     r  z$broadcast_tensors.<locals>.<genexpr>re   c                 s  s0    | ]\}}t jj|t jj|kV  qd S r   )rQ   r]   r   r:  r   r  r>  re   re   rf   r     s    
)r   rm   rk   r   r  r"  reducerA  r   r.  r  r<  rq   )rU  ra   r  r   sizesre   re   rf   r    s"   
r  c                 C  rN  r   re   r   re   re   rf   nop     r  
lift_freshc                 C  s   t | tsJ |d u rtt| jS t |ttjfr"tj	j
|ntdd |D }tt|  |}tt |ts=|fn|}g }t|  D ]\}}||v r[tj	j
t|ds`|| qH||  krlt| |S | S )Nc                 s  s    | ]
}t jj|V  qd S r   rQ   r]   r   	guard_intr   dre   re   rf   r     r  zsqueeze.<locals>.<genexpr>r-   )rm   rA   r@   r   ro   r   r   r   rQ   r]   r   r  r   r   r   r   r%   rl   guard_or_falseEqrq   r  )r   r   dims	new_shaper  sre   re   rf   squeeze  s   
r  c                 C  s   t t| |S r   )r  r  )r   r   re   re   rf   squeeze_copy     r  c                 C  2   t | |}t| tsJ t|tsJ |j| _| S r   )r  rm   rA   ro   r   r   valre   re   rf   squeeze_  
   
r  c                 C  2   t | rt| dtjdS td}t|tjd| S )NFr   isinfr  r   	full_liker~   r[   r<   r  r   rz   re   re   rf   r       r  c                 C  r  )NFr   isnanr  r  r   re   re   rf   r"    r!  r"  c                 C  $   t | rt| S td}t|| S )Nceilr   r  r<   r  r   re   re   rf   r$       r$  c                 C  r#  )Nfloorr%  r   re   re   rf   r'    r&  r'  c                 C  r#  )Nroundr%  r   re   re   rf   r(    s   r(  c                 C  r#  )Ntruncr%  r   re   re   rf   r)    r&  r)  c                 C  s   t | g\} t| tjrt| t|S t| tsJ t|ttfs$J t| 	 t|kr0| S t
| 	 sWtjjt| 	 }|dkrWt
|sW| tjjt||  tt| jt|S r  )rY  rm   r0   r  r6   r   r   rA   rk   r   r"   rQ   r]   r   guarding_hint_or_throwrN   
mark_reusero   )r   r  x_size_productre   re   rf   r<    s$   
r<  c                 C  sL   t |}|D ]}d||< q| }t|D ]\}}|dkr t||}qt||S Nro  )rk   rl   	unsqueezer<  )r  shapebroadcast_dimensionsr  broadcast_dimensionr   r^  r   re   re   rf   broadcast_in_dim  s   


r2  c                 C  s   t | | S r   )r<  r   )r   r@  re   re   rf   	expand_as#  r  r3  c                   sb  t |   tt kr$tjjgtt      t| t  } tt|  ks0J t |  }d}ttD ]}| dkrHd}|| |  ||< q>|r`t|| 	 | 
 dS tdd t D rstt| |S  fdd}t st|stjjt }|dkr| tjjt||  |  tj| 
 | 	 |t |d	S )
NFr   Tr   rw   c                 s  s$    | ]\}}|d kp|d kV  qdS r-   Nre   r  re   re   rf   r   :     " zrepeat.<locals>.<genexpr>c                   st   t | t ks
J t| } tt D ]!}| dkr5 | dkr)tjj| |< qt| | d | | |< q| S r  )r   rk   r  r   r8  Zeror*   )rE  rt   old_sizerepeatsx_loaderre   rf   rf  ?  s   zrepeat.<locals>.inner_fnrg  )rk   r   r   r   r8  r9  r  r  emptyr   rp   r-  r  r  r<  r"   rQ   r]   r   r*  rN   r+  ri  r>   r   )r   r:  new_sizezero_tensorrt   rf  old_size_productre   r8  rf   repeat(  sB   r@  r  Sequence[sympy.Expr]c                 C  s   t t| j|S r   )rA   rC   r   ro   )r   r  re   re   rf   r  `  s   r  c                 C  s6   t | tsJ t |ttfsJ tt| jt|S r   )rm   rA   rk   r   r=   r   ro   )r   r  re   re   rf   permuteg  s   rB  c              	   C  s  ddl m}m} t| tsJ t| |d}|  | }t|}t|tj	s.|dks.J |z|dkrBt
jj||rB|dkrB| W S W n	 tyL   Y nw ddd}	d\}
}|}|r|	||d}
|durpt
jj|tjrp|}n|	|||}|
dur|dur|
|}}d}|st
jj}|t
jjj|dur|jd	i ni }|r| D ]1\}}||d
t|fkrt|||||}t
j||_t
j| q||dfkrtdqttjj | j!|||||dS |sJ |t
jjjt
jjjd	 }|dusJ t"|dksJ |d\}}| D ]\}}||d
t|fkr#|}q||dfkr-|}q|
du s;|du s;J t|||||  | }t
j||_t
j| |}| # du rb| $  |
dur||du snJ | % j&|
| ' |   }n%tj(||| % j&| ' | |  | dd}t
j||_t
j| |}t)|  }t)| ' }|||< ||  |9  < t*| |||S )a	  
    Lowers a slice call, creating ExternKernels for the output size & storage offset symbols,
    if the indices are unbacked and appropriate semantics aren't known.
    If they are known (indices are static/backed/unbacked with info), a SliceView is created.
    r   )CallMethodKeyr$   r-   Nc                 S  s   | d u r|S dd }t | } t |}|t t | dt | |r&| S |t t | dt | | r;| | S |t | |rE|S |t | | rPdS |t | dr^t | |S |t | drnt | | dS d S )Nc                 S  s   t jj| S r   )rQ   r]   r   r  r   re   re   rf   rL    s    z5slice_.<locals>.compute_slice_index.<locals>.<lambda>r   )	r   r<  AndGeLeLtGtMinMax)rE  r   r  rz   re   re   rf   compute_slice_index  s$   

 "z#slice_.<locals>.compute_slice_indexNNFunbacked_bindingsr   storage_offsetzWUnexpected storage_offset unbacked binding when both start and end indices are resolvedclampr+   Tr   )+%torch.fx.experimental.symbolic_shapesrC  r$   rm   rA   _validate_dimr   r   r<  r   rQ   r]   r   statically_known_leq	TypeErrorstatically_known_equalssysmaxsizer^   	shape_envr  r  r  pytreeSequenceKeyr0   DynamicSliceSizeregister_bufferr   register_operationr   	SliceViewr   ro   r   maybe_get_layoutr  
get_layoutoffset
get_strideDynamicSelectStorageOffsetrk   
as_strided)r   r   startendsteprP  rC  r$   r   rK  start_index	end_indexambiguous_slicer^   node_unbacked_bindingssymkeypathb_sizerM  sym_sizesym_storager=  new_storage_offset	b_storage	new_sizesnew_stridesre   re   rf   slice_p  s   


	



ru  c           	   	   C  s   d }d }t | trt | jtjr|  }| j}| j } |   t	| s-t
d|  dt| \}}t|r:|n|j|r@|n|jdd |D dd |D t|pSd}ttj||dS )Nzunrealized as_strided(z, ...)c                 S     g | ]}t |qS re   r   r<  r   r  re   re   rf   r   /      zas_strided.<locals>.<listcomp>c                 S  rv  re   rw  rx  re   re   rf   r   0  ry  r   ro   layout)rm   rA   ro   r0   r4   rp   r   unwrap_viewr  is_storage_and_layoutr   as_storage_and_layoutFixedLayoutrw   r   r<  r   )	r   r   striderN  
new_device	new_dtypestorage
old_layout
new_layoutre   re   rf   rd    s$   

rd  c                 C  s$   t | tsJ t| |||j| _| S r   )rm   rA   rd  ro   )r   r   r  rN  re   re   rf   as_strided_6  s   r  c                 C  s   t | |||}t|S r   )rd  r  )r   r   r  rN  r  re   re   rf   as_strided_copy=  s   r  c                   s   g d}D ]} |||    f d d }qdd D  fdd}td  }d d | < tjd  d  ||dS )Nr   ro  c                 S  rp  re   rq  rm  re   re   rf   r   K  r   z!pointwise_cat.<locals>.<listcomp>c           
   	     s@  t |  tj}g }g }ttD ]n  dkr t dtjn
t   d tj}t   d tj}t ||}t ||} dkrI|}n td krT|}nt 	||}|
| t| t   d  < |
t | fddd q|d }	ttd ddD ] t |  |  |	}	q|	S )Nr   r-   c                     s     S r   re   re   )rt   idx_loadinputs_loadersre   rf   rL  o      z1pointwise_cat.<locals>.inner_fn.<locals>.<lambda>        ro  r+   )rP   r[  r~   int64r  r   r\  geltand_rq   rk   r(   maskedr   )
r^  idx_dimmasksmasked_loadsre  rf  
start_condend_condmasknext_valr   rU  r  inputs_ranges)rt   r  rf   rf  M  sD   
zpointwise_cat.<locals>.inner_fnrg  )rq   r   rk   r>   r   rp   r   )rU  r   prev_endr   rf  r=  re   r  rf   pointwise_catC  s   0

r  r  scaleszero_pointsaxisr   	quant_min	quant_maxc              	     s   t  dksJ dt  dksJ d|  tjkr%t| tj} |  tjks5J d|    t |  k sHJ dt |   |     f	dd}tj	| 
 ||  dS )	Nr-   expect scales 1 dimexpect zero_points 1 dim<Expecting input to have dtype torch.float32, but got dtype: Expecting axis to be < c           
        s   |   f}| }|}|}t tjd\}}jtjkr(t|tj}jtjkr5t|tj}t|}t|| | }t	|t
||}	t|	S rG  )_create_constantsr~   r  r   rP   r  r  
reciprocalr(  maximumminimum)
r^  channel_idxr  scale
zero_pointqminqmax	inv_scaler  clamped	r  r   input_loaderr  r  r  scales_loaderr  zero_points_loaderre   rf   rf    s   

z;quantized_decomposed_quantize_per_channel.<locals>.inner_fnrg  )r   r   r   r~   rR  r  r  ri  r>   r   rp   )r  r  r  r  r  r  r   rf  re   r  rf   )quantized_decomposed_quantize_per_channel  s(   
r  c                   sP       t tj  fdd}tj    |t 	 d}|   |S )Nc                   sB   t j  t  | W  d    S 1 sw   Y  d S r   )r0   ComputedBufferforce_realizerP   device_assert_asyncri  ru  r   re   rf   rf    s   $z_assert_async.<locals>.inner_fnrg  )
r  r  r~   r[   r>   r   rp   r   rk   r   )r   r   rf  assertion_opre   r   rf   _assert_async  s   
r  c                 C  
   t | |S r   r  r   re   re   rf   lower_assert_async     
r  c                 C  r  r   r  r   re   re   rf   lower_assert_functional_async  r  r  r  r  torch.dtype | Nonec          	        s   t  dksJ dt  dksJ d|  |ks*J d| d|    t |  k s=J dt |   d u rDtj|     fdd}tj|  ||  d	S )
Nr-   r  r  Expecting input to have dtype , but got dtype: r  c                   s   |   f}| }|}|}j tjkrt|tj}j tjkr+t|tj}tt|tj|| }t|}|S r   )r   r~   r  rP   r  sub)r^  r  r  r  r  r  r  r  r  r  r  r  r  re   rf   rf    s   
z=quantized_decomposed_dequantize_per_channel.<locals>.inner_fnrg  	r   r   r   r~   r  ri  r>   r   rp   )	r  r  r  r  r  r  r   r  rf  re   r  rf   +quantized_decomposed_dequantize_per_channel  s(   r  r  rB  r  c                   s   |   tjkrt| tj} |   tjksJ d|    |   fdd}tj|   t	j
|t|t|d|  dS )Nr  c           	        sf   | }t d| |tjd\}}t|| | }t tjd\}}tt|||}t| S )N      ?r   )r  r~   r  rP   r(  r  r  r  )	r^  r  r  r  r  r  r  r  r  r   r  r  r  re   rf   rf    s   
zBquantized_decomposed_quantize_per_tensor_default.<locals>.inner_fnr  r  rg  )r   r~   rR  r  r  ri  r>   r   rp   r"  r3  rB  r   r   r  r  r  r  r  r   rf  re   r  rf   0quantized_decomposed_quantize_per_tensor_default  s   
r  c                  sv   |   |ksJ d| d|    d u rtj|    fdd}tj|  tj|t	|t
|d|  dS )Nr  r  c                   sF    | }t ||tjd\}}tt|tj|| }t|}|S rG  )r  r~   r  rP   r  r  )r^  r  r  r  r  r  r  re   rf   rf  I  s
   zDquantized_decomposed_dequantize_per_tensor_default.<locals>.inner_fnr  rg  )r   r~   r  ri  r>   r   rp   r"  r3  rB  r   r   r  r  r  r  r  r   r  rf  re   r  rf   2quantized_decomposed_dequantize_per_tensor_default3  s   r  c              	     s   |   tjkrt| tj} |   tjksJ d|    t dks9t dkr5 d dks9J dt dksUt dkrQ d dksUJ d|    |    f	dd}t	j
|  ||  dS )	Nr  r   r-   expect scale as scalar tensor"expect zero_point as scalar tensorc                   s   | }t  dkrdnd}t  dkrdnd}jtjkr-t|tj}jtjkr:t|tj} rQ jdkrQt|t	||}t
|S t|t	| | }ttjd\}}tt|||}t|S )Nr-   r   re   r   r   )r   r   r   r~   r  rP   r  r   r]  r  round_to_intr(  r  r  r  )r^  r  _scale_zero_pointr  r  r  r  	rw   r   r  r  r  r  scale_loaderr  zero_point_loaderre   rf   rf  w  s   zAquantized_decomposed_quantize_per_tensor_tensor.<locals>.inner_fnrg  )r   r~   rR  r  r  r   r   ri  rp   r>   r   r  re   r  rf   /quantized_decomposed_quantize_per_tensor_tensorZ  s0   ""r  c                  s   t  dkst  dkr d dksJ dt  dks8t  dkr4 d dks8J d|  |ksJJ d| d|   d u rQtj|      fdd}tj|  ||  d	S )
Nr   r-   r  r  r  r  c                   s    | }t  dkrdnd}t  dkrdnd}jtjkr-t|tj}jtjkr:t|tj}tt|tj|| }t|}|S )Nr-   r  re   )r   r   r   r~   r  rP   r  r  )r^  r  r  r  r  r  r  r  r  r  r  re   rf   rf    s   zCquantized_decomposed_dequantize_per_tensor_tensor.<locals>.inner_fnrg  r  r  re   r  rf   1quantized_decomposed_dequantize_per_tensor_tensor  s.   ""r  c                   s.  d   jdk}|r:tdd D r:D ]}|  qtdd D r1ttjgR  \}ttjj|S t	dkrFt
d S td |d}tdtjifdd	D d(dddd fddtfddD }d)fddtjrt|S |rttj|S fddd}d d*ddt	|kst	tjkrt fd dD rtfd!dtjjD }tfd"dD o|}d)fd#d$}	|	 }
tfd%dD otfd&dD  }|
s|s	|r|st|S ttj|S )+z@Lower aten.cat, choosing between pointwise_cat and ConcatKernel.r   r   c                 s  s$    | ]}|  tjtjfv V  qd S r   )r   r~   int8uint8r   r  re   re   rf   r     s    
zcat.<locals>.<genexpr>c                 s  s     | ]}t | d kV  qdS )r   N)r   r   r  re   re   rf   r         r-   r   c                   s   g | ]}t | qS re   r  rm  r   re   rf   r     ry  zcat.<locals>.<listcomp>r   TensorBox | ir.StorageBoxrZ   	ir.IRNodec                 S  s>   t | trt | jtjr| j S | jS t | tjr| jS | S r   )rm   rA   ro   r0   r4   r|  r   r   re   re   rf   unwrap_tensor  s   

zcat.<locals>.unwrap_tensorc                 S  s   t | tjot | jtjS r   )rm   r0   r  ro   r?   rx   re   re   rf   is_reduction     zcat.<locals>.is_reductionc                   sJ   t | ttjfr | S | p$t | tjo$t fdd|  D S )Nc                 3  s     | ]} t j|V  qd S r   )rQ   r]   
get_buffer)r   readcan_fuse_reductionre   rf   r     s
    
z2cat.<locals>.can_fuse_reduction.<locals>.<genexpr>)rm   rA   r0   r   r>   r.  get_read_namesr  )r  r  r  re   rf   r    s   zcat.<locals>.can_fuse_reductionc                 3      | ]} |V  qd S r   re   r   rx   r  re   rf   r     r  r[   c                   sZ   t | rt j| dd\}}t j| S t| tt jfr# | S t| t jr+dS dS )NF)freezeT)	r0   r}  r~  ConcatKernelcan_realize_into_without_copyrm   rA   r   r>   )r   r  _)should_lower_cat_inputr  re   rf   r    s   
z#cat.<locals>.should_lower_cat_inputc                   s\   t | ttjfr | S t | tjsdS |  j}|  D ]}| tj	
|7 }q|S r  )rm   rA   r0   r   r>   inner_fn_opcountnum_opsr  rQ   r]   r  )r   countr  )op_countr  re   rf   r    s   
zcat.<locals>.op_countr   r+   r`   r  c                 S  s   | t jjt jjfv S r   )r  catr  constant_pad_ndr`   re   re   rf   additional_pointwise_ops#     z%cat.<locals>.additional_pointwise_opsc                 3  s    | ]	}| kV  qd S r   re   r  )MAX_SIMPLE_OP_COUNTr  re   rf   r   (      c                 3  s    | ]}t | V  qd S r   rH   )r   use)r  re   rf   r   *  s
    
c                 3  r  r   re   rm  r  re   rf   r   1  r  c                    s   t j  d u r	dS  jd } t| ttfr| }nt| tjjr#| g}ndS fddt	|D ]'\}}t
|drAt|jdkrBq1t fdd|jD rQ d	S |rX d	S q1dS )
NFr   c                   s(   t | ttjfr | S t | tjS r   )rm   rA   r0   r   r>   r   )is_unrealized_pointwiser  re   rf   r  G  s   zKcat.<locals>.any_input_has_multi_consumers.<locals>.is_unrealized_pointwiser_   r-   c                 3  s     | ]}| urt |V  qd S r   r  )r   u)r^   re   rf   r   Q  r  z=cat.<locals>.any_input_has_multi_consumers.<locals>.<genexpr>T)rQ   r^   ru   rm   rk   r   r~   fxNoder  r   r   r_   r.  )fx_argsinput_nodesr   ir_input)rU  r  )r^   r  rf   any_input_has_multi_consumers;  s&   
z*cat.<locals>.any_input_has_multi_consumersc                 3  r  r   re   rm  r  re   rf   r   \      
c                 3  r  r   re   r  r  re   rf   r   ^  r  N)r   r  rZ   r  rZ   r[   r`   r  )rp   r   r-  r  require_channels_lastr  r  r  r  r   r  rR  r   r   rP  r.  r.   force_pointwise_catr  rA   r0   r  r   max_pointwise_cat_inputsrQ   r^   r_   )rU  r   
cpu_devicer  r  fusable_reductionMAX_COMPLEX_POINTWISE_CATpointwise_usesfuse_pointwise_user  has_multi_consumershorizontal_fuse_catre   )	r  r  r  r   rU  r  r  r  r  rf   r    sp   




r  ra  dim1dim2c                   s  |   ttdtdtkfdd tjjt	|d}|rBtjj
tjj |  d}ntjj
tjj  | d}d |r`| df nd|f fddtD }||  fdd	}ttj| ||S )
N)r^  rankc                     s   d  d S )Nz(diagonal dimensions cannot be identical z, re   re   r  r  re   rf   rL  o      zdiagonal.<locals>.<lambda>r   )r   r   c                   s    g | ]\}}| fvr|qS re   re   )r   rt   r  r  re   rf   r          zdiagonal.<locals>.<listcomp>c                   s   | d }dgt  }d}tD ]&}|kr | d  ||< q|kr-| d  ||< q| | ||< |d7 }q|t d ksBJ |S )Nro  r   r-   r+   )r   r  )r^  diag_idxoriginal_idxcur_dimr  base_idxr  r  num_dimsoriginal_shapere   rf   	reindexer  s   
zdiagonal.<locals>.reindexer)r   r   r   r   rQ   r]   r   evaluate_exprr   rG  evaluate_maxevaluate_minrl   rq   rA   r0   GenericViewr   )r  ra  r  r  offset_negative	diag_sizer  r!  re   r  rf   diagonalg  s>   


r(  c                 C  s   t t| |||S r   )r  r(  )r  ra  r  r  re   re   rf   diagonal_copy     r)  c                 C  $   t | }t||||}t|| |S r   )r  r(  	mutate_to)r  srcra  r  r  r?  ra   re   re   rf   diagonal_scatter     
r.  c                 C  s  t |}t |  | }d }tjjt |dr || }ntjjt |dr-|}|d urit	|rY| 
  |  }|  }|  j|| |  }||= ||= t| |||S t| |||d dd}t||S ttjjjtjjjd }	|	d us|J t|	dksJ |	tt|	 \}
}| 
  |  }|  }|
}tj|
||  j|| |  | dd}tj||_tj| ||= ||= t| |||S )Nr   r-   FrO  rM  )r   r<  r   rQ   r]   r   r  rG  rE  r#   r  rb  r`  ra  rd  ru  r  r$   rX  r^   r  r   rQ  iterr  r0   rc  r\  r   r]  )r   r   r^  r   actual_indexr=  
new_striderq  slice_resultrM  unbacked_offset_symr  bufferre   re   rf   select  sR   



r6  c           
   
   C  s   t | |d}|}t|ttfs2|  | }tjjt	|| d |}|g| }||d |  |d< g }d}|D ]}|| }	|
t| |||	dd |	}q8|S )Nr   r-   ro  FrO  )rR  rm   rk   r   r   rQ   r]   r   r  r'   rq   ru  )
r   r  r   sizes_x_sizechunksr  re  r   rf  re   re   rf   split  s   
r:  c                 C  s   t | ||S r   )r:  )r   r  r   re   re   rf   split_with_sizes		     r;  c                   s>   t  d tjj   } fddt|D }|S )Nr   c                   s   g | ]}t  |qS re   )r6  r  r   r   re   rf   r   	      zunbind.<locals>.<listcomp>)rR  rQ   r]   r   r  r   r  )r   r   r8  r  re   r=  rf   unbind	  s   r?  c                   s   |   }t|}t|| |dkrtt| d|ddS |  }tjj}||| |	d t
|| d }||dkrL| |t|| | g |d   || d d  |}	 fdd}
ttj| |	|
S )Nr   F)rf  rP  r-   c                   s:   | d |     }g | d   ||  d d R S )Nro  r-   re   )r^  dim_idxr   rg  re   rf   r!  ,	  s   &zunfold.<locals>.reindexer)r   r   r   ru  r.  rQ   r]   r   	check_leqcheck_ltr'   r*  r+  r&   rA   r0   r%  r   )r   	dimensionr   rg  r  ndimdim_sizer   new_dim_sizeout_sizer!  re   rA  rf   unfold	  s"   
(rI  c                 C  s2   t | |d}t|  }||tjj t| |S r  )rR  rk   r   insertr   r8  r9  r  )r   r   r  re   re   rf   r.  3	  s   
r.  c                 C  r  r   )r.  rm   rA   ro   r  re   re   rf   
unsqueeze_;	  r  rK  c                 C  sZ   t jjjt|}t|  }|dk r||| 7 }d|  kr(|| k s+J  J |S r  )	rQ   r]   r   rX  r"  r   sympifyr   r   )r   r   ra  rE  re   re   rf   rR  D	  s    rR  ro  c                 C  s\   t | |d}tjj|  | d }t| |d|dd}t| |||d dd}t|t|S )Nr   r+   FrO  )	rR  rQ   r]   r   r  r   ru  r  sigmoid)r   r   new_lenr  r>  re   re   rf   gluM	  s
   rO  c                   s$   |rt    fdd}d|_|S )Nc                    s*   dd }t |tjj g| R i |S )Nc                 S     t | tjr
|  S | S r   rm   r0   r8   wrap_for_loweringr   re   re   rf   wrap_tensors]	     z7fallback_handler.<locals>.handler.<locals>.wrap_tensors)rY  tree_mapr0   FallbackKernelr   )ru   r   rS  kernelre   rf   handler\	  s   z!fallback_handler.<locals>.handlerT)r*  r   _is_fallback_handler)rX  r  rY  re   rW  rf   r  X	  s
   
	r  c                   C     t d d S )NzjTorchinductor does not support code generation for complex operators. Performance may be worse than eager.)warningswarnre   re   re   rf   _warn_complex_not_supportedj	  s   r^  rx   torch.Tensorc                 C  s   |   r	t  dS | jrdS | jrdS | jtjkrD|sdS t|jtj	j
r6|jtjjtjjtjjtjjfv pBt|jtj	j
oBt|j S dS )z0Do not support reading or writing to this tensorTF)r  r^  is_meta	is_sparser   r~   float8_e8m0fnurm   ra   r   r   r  r  r  r  r  
_scaled_mmrI   )rx   rc   re   re   rf   unsupported_input_tensors	  s,   rd  c                 C  sL   t jjtjjjjf}|dur|j|v r| 	 rdS t
| |r dS | jo%tjS )z2Do not support writing tensor but can read from itNFT)r  r  r   r~   rP   r  r  r  ra   r  rd  is_cpur.   disable_cpp_codegen)rx   rc   supported_complex_viewsre   re   rf   unsupported_output_tensor	  s   

rh  rc   torch.fx.Nodec                   sv    j tjju r	dS  jdkrdS  j tjju rdS  fdd}tj ji  j	D ]}||ddr4 dS q)| ddS )NFplaceholderc                   sp   t | tjjs	dS d| jvrdS t| jd D ]}t |tjjs"q|r-t	| r, dS qt
| r5 dS qdS )NFr  T)rm   r~   r  r  r  rY  tree_leaves_subclasses
FakeTensorrh  rd  )inp_out_node	is_outputr  rc   re   rf   check_skip_condition	  s   


zCfallback_node_due_to_unsupported_type.<locals>.check_skip_condition)ro  T)
ra   r  view_as_complexr  r`   lift_fresh_copyrY  arg_tree_leavesru   r   )rc   allow_cpu_inputsrq  r   re   rp  rf   %fallback_node_due_to_unsupported_type	  s   
rv  c           	        s  |d ur| nt }| |vs|sJ d|  |rGttdrGt| grGtjr.| tjj	j
v sG|sGtjjjr?dtjj_td td|  d fdd}t| tjjrf|  D ]}t| |}|| qXd S t| tjjtjjfrw||  d S td	|  d
t|  )Nz*both a fallback and a decomp for same op: CIFznA make_fallback error occurred in suppress_errors config, and suppress_errors is being disabled to surface it.zmake_fallback(a.  ): a decomposition exists, we should switch to it. To fix this error, either add a decomposition to core_aten_decompositions (preferred) or inductor_decompositions, and delete the corresponding `make_fallback` line. Get help from the inductor team if unsure, don't pick arbitrarily to unblock yourself.c                   s.   t |   d urt|   t| d dt| S NrZ  )r   r   r4  r  )op_overloadlayout_constraintre   rf   register_fallback	  s   

z(make_fallback.<locals>.register_fallbackzUnsupported fallback z with type )r2   r[   osgetenvr3   r.   fallback_randomr~   _decompdecompositions_for_rngextra_random_decomps_dynamosuppress_errorslogwarningr   rm   r   r   r   r   r   HigherOrderOperatorRuntimeErrorr   )	r`   r{  r]  override_decompget_decomp_fncheck_decompsr|  olry  re   rz  rf   make_fallback	  s@   




r  c                 C  s$   d}| D ]}|| }qt |tjdS )z
    TorchInductor offset calculation differs from PyTorch eager offset
    calculation for random ops (tl.rand vs torch.rand). In future, we should
    strive for same impl for tl.rand and torch.rand.
    r-   r   rH  r~   r  )r/  numelr  re   re   rf   philox_rand_offset
  s   
r  c           	        sd   t | | t j|  | |  fdd}tj| |t| d}t	| }||fS )Nc                   sV   t g tj}t g tj}t t | tj|}t ||}t | S r   )rP   r  r~   r  r   r[  rand)rE  seed_index_exproffset_index_exprrand_index_exprr  r   offset_loader
random_posseed_loaderre   rf   rf  
  s   zphilox_rand.<locals>.inner_fnrg  )
r0   r  FlexibleLayoutcontiguous_stridesmake_indexerri  r>   r   rk   r  )	r   seedra  r  rw   r   rf  random_values_nodeoffset_nodere   r  rf   philox_rand
  s&   
r  c              	   C  s.   t jrttjtjtj	j
| ||S td)Nz&should be handled in replace_random.py)r.   r  rY  rU  rA   r   r0   rV  r  native_dropoutr  r   )r   ptrainre   re   rf   r  6
  s   r  c                 G  sj   t js|  tdksJ d|   t|dks!t|d tr%t	j
jnt	j
j}tj|| g|R   | S )Nr   Tthis should be handled in decomps unless config.fallback_random or the device is CPUr   )r.   r  rp   r~   rw   r  r   rm   rB  r  
bernoulli_Tensorr0   InplaceBernoulliFallback)r   ru   ry  re   re   rf   r  A
  s   r  c                 G  s4   t js|  tdksJ dtt| g|R  S )Nr   r  )r.   r  rp   r~   rw   r  r  )r   ru   re   re   rf   bernoulli_pP
  s   r  c                 C  s   t r   r   r  re   re   rf   _foobarY
  r	  r  c                 C  r[  )Nz1using triton random, expect difference from eager)r  info)saltre   re   rf   _warn_triton_random^
  r  r  c                   C  s   t tjj d S r   )r  rQ   r]   creation_timere   re   re   rf   warn_triton_randomc
  r  r  )r  c                  O  D   | dd urt| i |S tjr|dd  t| i |S tdN	generatorz-should have been handled in replace_random.py)r  fallback_rand_generatorr.   r  r  fallback_rand_defaultr   r  re   re   rf   r  x
     r  c                  O  r  r  )r  fallback_randn_generatorr.   r  r  fallback_randn_defaultr   r  re   re   rf   randn
  r  r  c                 C  s   t |}t j| |S r   )r0   get_stride_orderExternKernelrequire_stride_order)input_tensorr  stride_orderre   re   rf   inductor_force_stride_order
  s   
r  c                 C     t d)Nz.should be handled in fuse_seed_creation_pass()r  )rw   re   re   rf   inductor_seed
     r  c                 C  s   t   tt| t|S r   )r  rA   r   r0   RandomSeedsrE   )r  rw   re   re   rf   inductor_seeds
  s   r  c                   s(    fdd}t j  |g dS )Nc                   s   t   S r   )rP   	load_seedget_namer  rE  seedsre   rf   rf  
     z&inductor_lookup_seed.<locals>.inner_fnrg  )r>   r   rp   r   )r  rE  rf  re   r  rf   inductor_lookup_seed
  s   r  c                 C  s`   t | tjst| } | jdkr*| j}|d u rtj }tj|}|j|j	 }|S d}|}|S )Nrn  i   )
rm   r~   rw   r   rE  rn  current_deviceget_device_propertiesmulti_processor_countmax_threads_per_multi_processor)rw   r^  propthreads_per_round_CPU_GRAIN_SIZEre   re   rf   get_threads_per_round
  s   



r  )ra  align_dtyper   	list[int]r  moder  c          
        s   t jrJ  dv sJ g | } tj}| }tj||| tj| |d	 |
 t jrI|jdkrIt|ddd	}||fd
d}n fdd}tj|||g | d}	|	  |	S )N)r  r  ra  rn  dtr   rZ   r   c                 S  s   | t jt jfv r
dS dS )Nr   r   )r~   rS  rR  )r  re   re   rf   _vec_from_dtype
  s   z(inductor_random.<locals>._vec_from_dtypec                   s:   dg}dg}t j||t  | tjtdS )Nr   r-   )vec)rP   
rand_eagerr[  r~   r  r   )rE  rng_seedbase_offset)r  r  r  r  re   rf   rf  
  s   

z!inductor_random.<locals>.inner_fnc                   s"   t t g t| tjS r   )r   rP   r[  r~   r  ru  )r  r  r  re   rf   rf  
  s   rg  )r  r   rZ   r   )r.   r  r~   r  rT  r0   r  r  r  r  ri  align_random_eagerr   r  r>   r   r  )
r   r  r  ra  r  r   rw   r  rf  r  re   )r  r  r  r  r  rf   inductor_random
  s2   
	
r  r  lowhighc                  sp   t jrJ g |}tj}| }tj|||tj||d	 |
  fdd}tj|||g |dS )Nr  c              	     s6   t g t | tjt tjt  tjS r   )rP   	randint64r[  r~   r  r  ru  r  r  r  r  re   rf   rf    s   z"inductor_randint.<locals>.inner_fnrg  )r.   r  r~   r  rT  r0   r  r  r  r  ri  r>   r   )r  r  r   r  ra  r   rw   rf  re   r  rf   inductor_randint
  s"   
r  tb.tuple[str, sympy.Expr, sympy.Expr, sympy.Expr]c                 C  sH   |   }|  }tdd t||D |d  }|  |d ||d fS )Nc                 s  s     | ]\}}|d  | V  qdS r5  re   )r   r  stre   re   rf   r     r  z%_boundaries_helper.<locals>.<genexpr>ro  )r   rb  sumr  r  )r  r   r  
max_offsetre   re   rf   _boundaries_helper  s    r  tuple[str, sympy.Expr]c                 C  s   |   |  d fS r-  )r  rb  r  re   re   rf   _sorter_helper(  r  r  	out_int32rightsidesortersorted_sequenceselfr  r  r  
str | Noner  TensorBox | Nonec          
        s   dd }|r||rd ur$|s$t tjjdd|||dS |d ur.|dkr.d|r3tjntj |   d urF  t	
 dkrY fd	d
}n
 fdd
}| }tj| ||jd}	|	  |	S )Nc                 S  s   t j| tjS r   )rQ   r]   r  r  	BUCKETIZEr  re   re   rf   rL  6  s    zsearchsorted.<locals>.<lambda>Fr  r  r  Tr-   c              	     sD   | }t j|td d u rd ntd u rd dS ddS )Nr   r  sorter_indicesrP   	bucketizer  r  )r^  r  index_dtyper  r  r  values_loaderre   rf   rf  [  s   
zsearchsorted.<locals>.inner_fnc              	     s\    }d fdd}t j|t|d u rd ntd u r(d dS |dS )Nr  rA   c                   s>   |   }tttjdd t|d d  d d D S )Nc                 s  s    | ]	\}}|| V  qd S r   re   )r   r  rt   re   re   rf   r   r  r  zNsearchsorted.<locals>.inner_fn.<locals>.get_flattened_index.<locals>.<genexpr>ro  )rb  rP   r[  r"  r  operatorr   r  )r  strides)r^  r  re   rf   get_flattened_indexn  s   &z;searchsorted.<locals>.inner_fn.<locals>.get_flattened_indexr  )r  rA   r  )r^  r  r  r  r^  rf   rf  i  s   	
rg  )r  r  searchsortedr  r~   r  r  ri  r  r   r   rp   r>   r   r/  )
r  r  r  r  r  r  validate_bucketizerf  rw   r  re   r  rf   r  ,  sB   
r  r  r  
boundariesc                  s   t   dks
J tj| tjrtj tjs(ttj	j
dd|  |dS    |  }|  |r9tjntj fdd}tj|||  d}|  |S )Nr-   Fr  r  c                   s"   | }t |t d}|S r  )rP   r  r  )rE  r  r  r  r  r  r  re   rf   rf    s   zbucketize.<locals>.inner_fnrg  )r   r   rQ   r]   r  r  r  r  r  r  r  r  rp   ri  r~   r  r  r>   r   )r  r  r  r  rw   rf  r  re   r  rf   r    s*   
r  c                 C  s    t | tjot | tjtjf S r   )rm   r0   r8   NonTensorObjOpaqueMultiOutputr   re   re   rf   _is_tensor_irnode  s   r  c                 O  "   t ttjj||f\}}||fS r   )rY  tree_map_onlyr  r0   r  require_stride1r  ru   r   re   re   rf   require_dense  s   r  c                 O  r	  r   )rY  r
  r  r0   r  require_contiguousr  re   re   rf   r       r  c                 O  r	  r   )rY  r
  r  r0   r  r   r  re   re   rf   r     s   r   c                 O  r	  r   )rY  r
  r  r0   r  r
  r  re   re   rf   r
    r  r
  c                   s   d u r S t tstr S t  tjr tj  S t  tr/ fdd D S t  t	t
frDt dd t D S  S )Nc                   s    i | ]}|t  | | qS re   constrain_to_fake_tensorr   keyr   fake_argre   rf   r	    r  z,constrain_to_fake_tensor.<locals>.<dictcomp>c                 s      | ]
\}}t ||V  qd S r   r  )r   r  f_are   re   rf   r         
z+constrain_to_fake_tensor.<locals>.<genexpr>)rm   r   r   r0   r8   r  require_exact_stridesr  r$  r   rk   r   r  r  re   r  rf   r    s   
r  c                   s6   t dd t| |D }  fdd| D }| |fS )Nc                 s  r  r   r  )r   r   r  re   re   rf   r     s
    
z,constrain_to_fake_tensors.<locals>.<genexpr>c                   s    i | ]\}}|t | | qS re   r  r   fake_kwargsre   rf   r	  
  r  z-constrain_to_fake_tensors.<locals>.<dictcomp>)r   r  r  )ru   r   	fake_argsr  re   r  rf   r     s
   r   c                   sJ    fdd t  fddt|jD } fdd| D }||fS )Nc                   sl   t  r$jd}t|tjs S t| t	j
jj}tj |S t tr4 fdd D S  S )Nr  c                   s    i | ]}| | | qS re   re   r  )apply_constraintr   fx_argre   rf   r	    r  zEconstrain_to_fx_strides.<locals>.apply_constraint.<locals>.<dictcomp>)r  r  r  rm   r~   r  r0   r  r  rQ   r]   r   rX  r  r  r$  )r   r  fake_valr  r  )r   r  rf   r    s   
z1constrain_to_fx_strides.<locals>.apply_constraintc                 3  s    | ]
\}} ||V  qd S r   re   )r   r   r  r   re   rf   r     r  z*constrain_to_fx_strides.<locals>.<genexpr>c                   s"   i | ]\}}| |j | qS re   r  r   r  fx_nodere   rf   r	    s   " z+constrain_to_fx_strides.<locals>.<dictcomp>)r   r  ru   r  r"  ru   r   re   r!  rf   r     s   
r   c                   s\    fddfdd t fddtt|jD }fdd| D }||fS )	zGApply stride constraints to SDPA inputs, ensuring dense last dimension.c           
        s   t |s|S |jd }dd | D }tjjj}t| |}|r5|d dkr5t	t
tt| }jtjjkrJ| dv rJt|dksHJ d}d }tjrl| }t|||r]t|nd f}|tjjv rltjj| S  | ||||}	|d ur~|	tjj|< |	S )	Nr  c                 S  s$   g | ]}t |tjr|jjn|qS re   )rm   r~   SymIntrc   exprrx  re   re   rf   r   1  s    z=sdpa_constraint.<locals>.apply_constraint.<locals>.<listcomp>ro  r   )r   r   r   )r   r-   r+   r   )r  r  r  rQ   r]   r   rX  r0   r  rk   r7  r  r   r   ra   r  0_scaled_dot_product_efficient_attention_backwardr  r.   cache_sdpa_constraintmaybe_get_nameidr   sdpa_constraint_cache)
r^  r   r  meta_valmeta_stride_exprrX  r  	cache_keyarg_namer  )_apply_constraint_innerr"  re   rf   r  ,  s>   


z)sdpa_constraint.<locals>.apply_constraintc                   s|  |j s|jstj||S d jtjjj	j
ko| dk}t|ts#J t| dvr-|S t| }|r?ttj||S t|trV| d urV|rVttj||S |rt| }g }| }	tt| d D ]}
tjj||
 ds|	d urtjj|	|
 dr||
 qndgt| }d|d< d}tt|d ddD ]@}
||
d  dkr|||
d   }|
|v rtjjt||
d   drd||
< qtjjt| dst|   }|||
< qtj||S |rttj||S t|tr| d ur|rttj||S  fdd	}t|jtjr7||s7||  r7ttj||S tj||S )
Nr   r   r   r   r-   r   ro  r+   c                   s$   t jjtt|  d  dS )Nro  r   )rQ   r]   r   r  r   r  r)   r   r   	ALIGNMENTre   rf   
is_aligned  s   zDsdpa_constraint.<locals>._apply_constraint_inner.<locals>.is_aligned)!is_cudais_xpur0   r  r  ra   r~   rP   r  '_scaled_dot_product_efficient_attentionr  rm   rA   r   r   is_aligned_realized_tensortry_match_insignificant_stridesrealize_inputr8   maybe_get_striderk   r  rQ   r]   r   rU  rq   r)   rD   r  ro   r4   r|  )r^  r   r+  r,  r  effn_attn_fwd_biasis_aligned_tensorrH  expanded_dimsmaybe_stridert   out_stridesr  r3  )r"  r1  rf   r/  _  s   




z0sdpa_constraint.<locals>._apply_constraint_innerc                 3  s$    | ]\}\}} |||V  qd S r   re   )r   r^  r   r  r   re   rf   r     s
    


z"sdpa_constraint.<locals>.<genexpr>c                   s$   i | ]\}}| d |j | qS ro  r  r   r!  re   rf   r	    s   $ z#sdpa_constraint.<locals>.<dictcomp>)r   rl   r  ru   r  r#  re   )r/  r  r"  rf   sdpa_constraint)  s   3frA  )r]  c                 C  s   t |tjst||  |  d}|}|  | kr"t||  }|  | kr1t||  }|  | krDt	||  }t
|S t
|S Nr4  )rm   r0   r8   rH  r   rp   r  r  r   r<  r  )r  r-  r  r   rr   re   re   rf   r    s   )memory_formatc                C  s&   t j|  |  |  t|  dS Nrg  )r>   r   rp   r   ri  rk   r   )r   rC  re   re   rf   r    s   
r  c                 C  s   g }t | tr+t | jtjr+| j} t | tjr'||   | j} t | tjst| } t| } |rI| j} |d d d D ]	}tj| |d} q;t| } | S )Nro  rz  )rm   rA   ro   r0   r   rq   r`  r  )r   reinterpret_view_layoutsr{  re   re   rf   clone_preserve_reinterpret_view  s   rF  rs  c                  s(    fdd}t jt| || gdS )Nc                   s   t j| d    dS )Nr   r   rP   r[  ru  r   re  rg  re   rf   rz     r  ziota.<locals>.fnrg  )r>   r   rE   )lengthre  rg  r   rw   requires_gradrz   re   rH  rf   iota  s   
rK  )r   rw   r{  
pin_memoryrJ  c          	      C  s@   |d usJ t ||  |}t|| |||d ur||dS d|dS )Nr   re  rg  r   rw   rJ  )rD   rK  )	re  rf  rg  r   rw   r{  rL  rJ  rI  re   re   rf   arange_start_step  s   
rN  r   rE  c                   s   t ||  }|  t|  d tjjt	dr%| 
    ntjjtdr1nttjj| | S tjjd tjj| 
    tt| | 
 }|  fdd}tj|  |  |t| 
 dS )Nr   c              	     s6   t t t |   tjt tj| | S r   )rP   r   eqr[  r~   r  r   r   rE  
src_loaderr;  re   rf   rf    s   z select_scatter.<locals>.inner_fnrg  )r  r   ri  rR  rQ   r]   r   r  r   rG  r   rE  r  r  select_scatterr  rB  rC  r<  r.  r>   r   rp   rk   )r   r-  r   rE  rf  re   rP  rf   rR    s&   

rR  c                   s   t | } t d    tj \t }t	 d  | < t
||}|  fdd}tj  |t dS )Nr   r-   c              
     s2  dkrkrdkr| S t |  tj}t|  t|    < g }dkr?|t |t t	tj krT|t 
|t t	tj dkrs|t t t|   dtjt dtj |swJ tt j|}t | fddtrdnd}t ||| S )Nr   r-   c                         S r   re   re   )src_idxrQ  re   rf   rL  c      z1slice_scatter.<locals>.inner_fn.<locals>.<lambda>r  )rP   r[  r~   r  rk   r'   rq   r  r   r<  r  rO  r*   r\  r"  r  r  r  r   r   )r^  r  r  src_valr   rF  rf  rQ  re  rg  r   r;  )rT  rf   rf  >  sR   zslice_scatter.<locals>.inner_fnrg  )r  r   ri  rR  r   r0   r^  normalize_start_endrk   r'   r<  r>   r   rp   )r   r-  r   re  rf  rg  src_sizerf  re   rW  rf   slice_scatter/  s    
.
rZ  c                 C  s*   t | ttfrt| dkrt| d S | S r  )rm   rk   r   r   _unwrapr   re   re   rf   r[  t  s   r[  r   rw   r{  rL  c                  s  t |d tjfv d|  t | d tt tr ptjnp%t g }t tj	r6 fdd}nBt t
tfrE fdd}n3t dksZt d t
tfrlt dkrl|tt   fdd}ntjtj |d	S tjt|||d
S )Nlayout=rL  c                      t  S r   rG  ru  ro   r   re   rf   rf       ztensor.<locals>.inner_fnc                   r^  r   rP   r\  ru  r_  re   rf   rf    r`  r   r   c                   s8    fdd t dkrtdS  dt S )Nc              	     sr   | |k sJ ||  dkrt |  S ||  d |  }t t t d tjt |tj | | ||S )Nr-   r+   r   )rP   r\  r   r  r[  r~   r  )re  rf  mid)binary_searchro   r   rE  re   rf   rc    s   z/tensor.<locals>.inner_fn.<locals>.binary_searchr   )r   rP   r\  ru  r_  )rc  rE  rf   rf    s   r4  rg  )r   r~   stridedrm   r[  r   r  get_default_dtyper   r   rB  r   rq   IntegerrQ   r]   add_tensor_constantrH  r>   r   rE   )ro   r   rw   r{  rL  rh  rf  re   r_  rf   rH  z  s,   *rH  c                 C  s@   t | tr|d urt| |} |d urt| |} | S t| ||dS rB  )rm   rA   r  r  rH  )ro   r   rw   re   re   rf   	as_tensor  s   


rh  c                 C  s   t | tjdS rG  r  ro   re   re   rf   long_tensor  r  rj  c                 C  s   t tjjjtjjjd }|d usJ t|dksJ |tt	|
 \}}t||| }tj||_tj| tjjjd }t|tjtjtjfrQ|jjS t|S )NrM  r-   r  )r$   rQ   r]   r   rX  r^   r  r   rQ  r0  r  r0   DynamicScalarr\  r   r]  rm   r~   r$  SymFloatSymBoolrc   r%  r   rL  )ro   rM  binding_symrm  r5  r  re   re   rf   _local_scalar_dense  s   
ro  c                 C     d S r   re   )ro   r   re   re   rf   _assert_scalar  s   rq  )rw   r{  c                C  rp  r   re   )r  r   r  r   rw   r{  re   re   rf   _assert_tensor_metadata  s   rr  c                   s   | t | ttfstdrjt ttfr  fdd}n"t tjr. fdd}nt dks8J 	 fdd}t
j| |t|dS )Nr  c                      t  S r   ra  ru  r   r  re   rf   rf    r`  z_full.<locals>.inner_fnc                   rs  r   rG  ru  rt  re   rf   rf    r`  r   c                   s    g S r   re   ru  )value_loaderre   rf   rf    s   rg  )rm   r   rB  r   r  r   r   r   r   ri  r>   r   rk   )
fill_valuerw   r   r   rf  re   )r   r  ru  rf   _full  s    rw  c                 K  s   t t|| fi |S r   create_tensor_liketensor_constructor)r   rv  r   re   re   rf   r    rT  r  c                   s    d d d d dd d fdd
}|S )NF)namesr   rw   r{  rL  rC  c                   s   t | d u d t |d tjfv d|  t | d t|}|p#t }t|dkr;t|d tttj	fr;t|d }|D ]
}t|tj
rGJ q=dd |D }t |||S )Nnamed tensorsr]  rL  r-   r   c                 S  rv  re   rw  rx  re   re   rf   r   9  ry  z5tensor_constructor.<locals>.inner.<locals>.<listcomp>)r   r~   rd  rE   re  r   rm   rk   r   Sizer$  rw  )r{  r   rw   r{  rL  rC  r   r  rv  re   rf   r  %  s   	"z!tensor_constructor.<locals>.innerre   )rv  r  re   r~  rf   rz  #  s   rz  )r{  r   r{  rw   rL  rC  c                 G  sX   t | d u d t|}t|dkr"t|d tttjfr"t|d }t|d ||||dS )Nr|  r-   r   r   r{  rw   rL  )	r   rE   r   rm   rk   r   r~   r}  empty_strided)r{  r   r{  rw   rL  rC  r   re   re   rf   r<  ?  s   
"r<  c                   s   dddddd fdd
}|S )zZ
    Shim to convert X_like(...) into X(...).  For example zeros_like() into zeros().
    NF)r   rw   r{  rL  rC  c                  sj   t | d t |d tjfv d|  |d u r|  }nt|}|p%|  }t|  } |||||dS )NrL  r]  r\  )r   r~   rd  r   r   rp   rk   r   )r   r   rw   r{  rL  rC  r   creation_fnre   rf   _constant_likeW  s   

z*create_tensor_like.<locals>._constant_likere   )r  r  re   r  rf   ry  R  s   
ry  c                 C  s   t t| S r   rx  r~  re   re   rf   r  i  r`  r  c                   s   d d d d d fdd
}|S )Nr  c                  st   t |ttfs	J t| d t|d tjfv d|  t|p#|  }|p)|  }dd |D }t	 t
|||S )NrL  r]  c                 S  rv  re   )r   rf  rx  re   re   rf   r   |  ry  z7new_constant.<locals>._new_constant.<locals>.<listcomp>)rm   rk   r   r   r~   rd  r   r   rp   rw  rE   r   r   r   r{  rw   rL  r~  re   rf   _new_constants  s   z#new_constant.<locals>._new_constantre   )rv  r  re   r~  rf   new_constantr  s   r  r  c                C  s8   |d u r|   }|d u r|  }t|d ||t||dS Nr  r   rp   r  rE   r  re   re   rf   	new_empty  s   r  c                C  s  t | ttfs	J t |tttd fsJ t|d tjfv d|  t|p)t }|p1t	dj
}t|}td||| d}|  |jj}tj|jdgt|  d|_t |tjs\J dd | D } |rldd |D ntj| }tj||| ||p{d	d
|_|S )Nr]  r  r   )rv  rw   r   r   )rh  c                 S  rv  re   rw  rx  re   re   rf   r     ry  z!empty_strided.<locals>.<listcomp>c                 S  rv  re   rw  rx  re   re   rf   r     ry  F)rw   r   r   r  	is_pinned)rm   rk   r   r   r   r~   rd  r   re  rH  rw   rE   rw  r  ro   dataclassesreplacer   r0   r  r  r  r  r{  )r   r  r   r{  rw   rL  	pointwiser5  re   re   rf   r    s0   
r  c                C  s8   |d u r|   }|d u r|  }t||||t||dS r  r  )r   r   r  r   r{  rw   rL  re   re   rf   new_empty_strided  s   r  c                 C  s2   t jj|}ttt||jd}tj	
| |S )N)r  )rQ   r]   r   r   sortedr  r   __getitem__r0   r  r  )r   r  r  re   re   rf   copy_strided  s   r  c                 K  s*   | dd usJ dt|| fi |S )Nr   z(dtype should be handled by decomposition)r  rz  )r   rv  r   re   re   rf   full  s   r  c                   s   t | tsJ | dkrt| | S |  tdk}t|  | |r/t| dg} dg|  |  fdd}t	j
|  |  || dS )Nr   r-   c                   sF   t | } t|   }t| dkr|g} | S ||  < | S r  )rk   rP   indirect_indexingr   )r^  
gather_idxr   index_loaderr   r;  re   rf   rz     s   zgather.<locals>.fnrg  )rm   rA   	get_numelr  r   r   rR  r<  ri  r>   r   rp   r   )r   r   rE  sparse_gradra  rz   re   r  rf   gather  s$   	r  c                   s   |rt tjj| ||||S |rJ t| tsJ t|ts J dt| v s*J |  |  t	|
 | 
 g |
 dd   fdd}tj|  |  |dS )Nr   r-   c                   s\   t | t ksJ |  d  | d  }t|d gg | d   }|S )Nz != r   )r   rP   r  )r^  	var_index
weight_idxindices_loaderindices_ndimr=  weight_loaderweight_sizere   rf   rz   	  s   "
zembedding.<locals>.fnrg  )r  r  	embeddingr  rm   rA   r   r   ri  r   r   r>   r   rp   )weightr  padding_idxscale_grad_by_freqsparserz   re   r  rf   r    s(   

r  c                   s   t dd  D sJ ddd  D  tdd  D r"tddd t D }t|d	ks5J d
d gt  }t|t fdd|D  D ]\}}| |krXtd|||< qJ||fS )Nc                 s  s4    | ]}|d ur|  tjtjtjtjfv V  qd S r   )r   r~   r  r  r[   r  r  re   re   rf   r     s    z.check_and_broadcast_indices.<locals>.<genexpr>z)indices must be int64, byte or bool. Got c                 S  s   g | ]
}|d ur|  qS r   r   r  re   re   rf   r         z/check_and_broadcast_indices.<locals>.<listcomp>c                 s  s,    | ]}|d ur|  tjtjfv V  qd S r   )r   r~   r[   r  r  re   re   rf   r   !  s    "zFallback for bool indicesc                 S  r   re   r   r   re   re   rf   r   &  r   r   z"requires at least 1 non-None indexc                   r  re   re   r  r  re   rf   r   )  r   z.Fallback when indices is on a different device)r-  r.  r   rl   r   r  r  rp   )r  rw   
valid_idxsnew_indicesrt   r   re   r  rf   check_and_broadcast_indices  s"   
$
r  c	              
     s   dt D ]\}	}
|
|	 dkrdq
fddtD g 
tt d  d }r: nd |  |d    	f
dd}|fS )	NFr-   Tc                   s    g | ]\}}|d u r | qS r   re   )r   rt   r  r8  re   rf   r   J  r  z2index_output_size_and_inner_fn.<locals>.<listcomp>r   c           	   	     s  t | t ks
J t t ksJ t }g }d }r"dn|}d}td d D ]F}||kr8||7 }| d u rR|t | k sFJ || |  |d7 }q.| }|d us\J | }|tj|| |||  | d q.g || |d  }	d u r|S 	|S )Nr   ro  r-   r   wrap_neg)r   r  rq   rP   r  )	r^  r  	new_indexfirst_tensor_indexstart_offsetnext_idxrt   loaderr   )
r   indexed_sizer  indices_loadersnon_consecutive_tensorsoutput_sizetensor_indicestensor_sizer  r;  re   rf   rz   W  s>   

z*index_output_size_and_inner_fn.<locals>.fn)r  pairwiserl   r   )r8  r  r  r  r  r  r;  r   r  previouscurrentr  rz   re   )r   r  r  r  r  r  r  r  r  r;  r8  rf   index_output_size_and_inner_fn3  s$    


"r  c                 C  s,   t | ||\}}}tj|  |  ||dS rD  )index_impl_helperr>   r   rp   r   )r   r  r   r  rf  r  re   re   rf   
index_impl|  s   r  c           
        s   t ttfs	J |  t|  \}t|dks J ddd D }t|d   }|  fddttD }|rQd|v rQd|vrQt	dfddttD }t
||||d ||d	\}  fd	d
}	||	 fS )Nr   z Must have at least one valid idxc                 S      g | ]}|d ur|  nd qS r   rq  r  re   re   rf   r     r  z%index_impl_helper.<locals>.<listcomp>c                   s    g | ]} | d ur| qS r   re   r  )r  r8  re   rf   r     r  z0index is out of bounds for dimension with size 0c                   r  re   re   r  r  re   rf   r     r   r  c                   s    | S r   re   r   )index_inner_fnr;  re   rf   rf    r`  z#index_impl_helper.<locals>.inner_fn)rm   rk   r   ri  r  rp   r   r   r  
IndexErrorr  )
r   r  r   r  r  r  r  r  r  rf  re   )r  r  r;  r8  rf   r    s0   

r  c                 C  sB   zt | |ddW S  ty    |   ttjjdd| | Y S w )NTr   Fr  )r  r   r  r  r  rE  r  r   r  re   re   rf   rE    s   c                 C  s   t | |ddS )NFr  )r  r  re   re   rf   _unsafe_index  r  r  c                 C  s   t t| |||dddS )NTFr   may_realizeindex_put_impl_r  r   r  r  
accumulatere   re   rf   	index_put     r  c                 C  s   t t| |||dddS )NFr  r  r  re   re   rf   _unsafe_index_put  r  r  c                 C  sB   |  |   krt||   }|rt| |}t| t|d || S r  )rp   r  r   r,  r   )r  r  r  r  re   re   rf   index_put_as_masked_fill  s
   
r  c           	      C  s   ddl m} ttjtjjjj	}tjj}t
js5|d ur5||r5d}|jdd  }r1| d| }|tj_t|| ||| | S )Nr-   ),_fx_node_is_input_dependent_cudagraph_unsafezLindex_put_ fallback with boolean indexing is not compatible with CUDA graphsstack_trace Found from : 
 )utilsr  r   r  
index_put_rQ   r]   r^   ra   _overloadnamer.   graph_partitionr  r  disable_cudagraphs_reasonr0   IndexPutFallback)	r  r  r  r  r  ry  r"  r   r  re   re   rf   index_put_fallback  s   r  c                 C  s   t | |||dddS )NTr  r  r  r  r  r  re   re   rf   r       r  c                 C  s   t | |||dddS )NFTr  r  r  re   re   rf   _unsafe_index_put_  r  r  c              
     s  |rdd  t | | v rt fdd|D s|  | dkrWt|dkrW|d  tj	tj
fv rW|d }tt| t|  D ]}t|d}qGt| |g||S t rbt| |||S |D ]}|d ur}| tj	tj
fv r}t| |||  S qd|  t}	|rt|  r|	dkrt| dg} t| |||} |	dkrt| g } | S t||  }zt||  \}}
W n ty   t| ||| Y S w dd	 |D }t| tsJ |   |	dkrt| dg} t||
d   }fd
d	tt|D }t||
|||d |d\}}t||}|  }|d usJ t j||  | |||r.dnd d}t jd t | |d}t j!"||_#t j!$| |	dkrTt| g } | S )Nc                 S  sd   t | tr0t | jtjr0| j } t | tjo/t | jtjo/t| jdd o/| jj	j
tjjjju S dS )Nr"  F)rm   rA   ro   r0   r4   r|  r   r  r   r"  ra   r~   rP   r  randpermr  )indicere   re   rf   indice_slice_from_randperm  s   
z3index_put_impl_.<locals>.indice_slice_from_randpermc                 3  r  r   re   )r   r  )r  re   rf   r     r  z"index_put_impl_.<locals>.<genexpr>r-   r   ro  c                 S  r  r   rq  r  re   re   rf   r   E  r  z#index_put_impl_.<locals>.<listcomp>c                   r  re   re   r  r  re   rf   r   P  r   r  
atomic_addrw   r   rf  rh  output_indexerscatter_moder   r{  ro   )%r0   try_get_namer  r-  r  r  r   r   r~   r[   r  r  r   r.  r  $are_deterministic_algorithms_enabledr  rJ   r  r  r  rp   r   rm   rA   rk   r  r<  Scatterri  r  MutationLayoutSHOULDREMOVErQ   r]   r\  r   r]  )r  r  r  r  r   r  r  r  rE  x_ndimr  r  r  r  expected_vals_sizerf  rw   scatterr5  re   )r  r8  rf   r     s   





r  r  c                   sT   t | |ddd\}}  |   fdd}tj|  |  ||dS )NFr  c                   sB   j tjkrt tj}n }t| fddS )Nc                     s    S r   re   re   )_unsafe_index_fnr^  self_loaderre   rf   rL    r  z8_unsafe_masked_index.<locals>.inner_fn.<locals>.<lambda>)r   r~   r[   rP   r  r  )r^  mask_valr  fillr  mask_loaderr  r   rf   rf    s   z&_unsafe_masked_index.<locals>.inner_fnrg  )r  ri  r>   r   rp   r   )r  r  r  r  rh  r  rf  re   r  rf   _unsafe_masked_index  s   r  c                   s@   t ||d}|   fddtt D }t| ||ddS )Nr   c                   s6   g | ]} | rt  | |  | d  ndqS r5  rO  r  r  r/  re   rf   r     s    (z7_unsafe_masked_index_put_accumulate.<locals>.<listcomp>T)r  )r   r   r  r   r  )r   r  r  r  masked_valueclamped_indicesre   r  rf   #_unsafe_masked_index_put_accumulate  s   
r  c                 C  s   t |t || S r   )rP   r  r  r  minmaxre   re   rf   rP       rP  c                 C  r+  r   )r  rd  copy_)r  r-  r   r  rN  r?  output_viewre   re   rf   as_strided_scatter  r/  r  c                 K  s   t t| |||fi |S r   )scatter_r  )r   r   rE  r-  r   re   re   rf   r    s   r  r  include_selfry  r  r  c             	   C  sf   t |t}t| || ttj|r| nt||r| jnd|r1t	j
| ||||||d |S d S )Nznot implr   )rm   rA   rO   r   r	   r~   r   r   rp   r0   ScatterFallback)ry  r  r   rE  r-  r  r  src_is_tensorre   re   rf   scatter_fallback  s(   

		r  r  c                C  sr   |dv sJ |d u r$t tjtjjjj}t|| ||||d}|d ur$|S |dkr+d}n|dkr1d}t	| ||||S )N)Nr   multiplyr  r   r  r  prod)
r   r  r  rQ   r]   r^   ra   r  r  scatter_reduce_)r  r   rE  r-  r  ry  fallback_resultre   re   rf   r    s   r  c                 C  s   t t| |||S r   )scatter_add_r  r   r   rE  r-  re   re   rf   scatter_add  r*  r  c                 C  s   t | |||dS )Nr  )r  r  re   re   rf   r
    r  r
  c                 K  s   t t| ||||fi |S r   )r  r  )r   r   rE  r-  reduction_typer   re   re   rf   scatter_reduce  s   r  )r  c             	     s8  |dv sJ t tj dkrdtj v sJ dttr$tttjj |||d}|r5|S tt	s<J dt
| v sFJ t  }|dkrVtdgtt	rit  dkritdgt|t	r|t | dkr|t|dg}| dkrS t    | tt	r nd  fdd	}fd
d}	dd }
 }|d usJ |stj| fdd| |d d}tjd t|d}tj||_tj| tj| |	| ||
|d}tjd t|d}tj||_tj| |dkrtg S )N)Nr  r  meanamaxaminr-   r  zKaten.scatter_reduce_.two is not the unique overload of aten.scatter_reduce_r   r   r   c                   sD     }t|}t| }tj| |dkrdn|  dd| < |S )Nr   r-   F)r  )r   r   rk   rP   r  )r^  r/  rE  indirect_idx)r   r  r  re   rf   r  &  s   
z'scatter_reduce_.<locals>.output_indexerc                   s   r| S t   S r   rP   r\  r   r   )r  r-  rQ  re   rf   rz   0  s   zscatter_reduce_.<locals>.fnc                 S  s   | dkrdS | d u sJ d S )Nr  r  re   r  re   re   rf   backend_reduce_str8  s   z+scatter_reduce_.<locals>.backend_reduce_strc                   s   t d  S r  r  ru  )r  re   rf   rL  H  r  z!scatter_reduce_.<locals>.<lambda>r  r  )r   r  r  r   rm   r   r  r  r  rA   r   r   r   r  r  rR  r  ri  rp   r0   r  r  r  rQ   r]   r\  r   r]  )r  r   rE  r-  r  r  r	  rE  r  rz   r  rw   zero_outr5  r  re   )r   r  r  r-  rQ  rf   r    s   







r  scales_xtuple[float | None, ...]exactc           
        s   |    |  |   d  |  d   }dd D t|ks)J |}dd t|D t|D ]\}}|d urGd| |< q9 fddfdd}	tj|  | 	 |	g ||d	S )
Nc                 S  s   g | ]	}t jj|qS re   r  r  re   re   rf   r   x  r
  z&upsample_nearestnd.<locals>.<listcomp>c                 S  s   g | ]\}}|| qS re   re   )r   rt   ore   re   rf   r   }  r>  r  c                   s\   t | tj}  rt | t dtj} t | t |tj} t | tj} t j	| |ddS )N      ?Fr  )
rP   r[  r~   r  r   r\  r  r  r  r  )r   r  r   )r  re   rf   scale_fn  s   z$upsample_nearestnd.<locals>.scale_fnc                   sB   |  d  }| d   }g |fddt | D S )Nc                   s   g | ]\}}} |||qS re   re   )r   rt   r  r   )r  re   rf   r     r   z2upsample_nearestnd.<locals>.fn.<locals>.<listcomp>)r  )r^  r   r>  )i_sizes
inv_scalesr  r  r;  re   rf   rz     s
    zupsample_nearestnd.<locals>.fnrg  )
realize_hintri  r   r   r  rl   r>   r   rp   r   )
r   r  r  r  r  batcho_sizesrt   r  rz   re   )r  r  r  r  r  r;  rf   upsample_nearestndm  s(   
r!  float | Nonec                 C  s   t | ||fddS )Nr-   r  r!  r   r  r  re   re   rf   upsample_nearest1d  r*  r&  c                 C  s   t | ||fdddS )Nr-   Tr  r  r$  r%  re   re   rf   _upsample_nearest_exact1d  r  r(  scales_hscales_wc                 C  s   t | |||fddS )Nr+   r#  r$  r   r  r)  r*  re   re   rf   upsample_nearest2d  s   r,  c                 C  s   t | |||fdddS )Nr+   Tr'  r$  r+  re   re   rf   _upsample_nearest_exact2d  s   r-  scales_dc                 C  s   t | ||||fddS )Nr   r#  r$  r   r  r.  r)  r*  re   re   rf   upsample_nearest3d  s   r0  c                 C  s   t | ||||fdddS )Nr   Tr'  r$  r/  re   re   rf   _upsample_nearest_exact3d  s   r1  c                   s   t  fdd|D S )Nc                 3  s    | ]	}t | V  qd S r   ra  r   r   re   rf   r     r  z$_create_constants.<locals>.<genexpr>)r   )r   ru   re   r   rf   r    s   r  c                   s:   |   |   fdd}tj|  |  |dS )Nc                   sF   t | } t| tksJ  D ]}| d | |  | |< q| S r  )rk   r   )r^  r   r  r  r;  re   rf   r    s
   zrev.<locals>.loaderrg  )ri  r   r>   r   rp   r   )r   r  r  re   r2  rf   rev  s   r3  paddingSequence[int]rv  c                 C  s  dd }| r	dS t |dkst |  dkrdS |   t| tjrBt| jtjrBt| jjtjs=t	j
rBt| jjtjrB| jjjsDdS |   t| \}}|j}|d dkrZdS |d dksl|d dksl|d dkrndS |d }|dkrxdS |d }|jd }	||	| k rdS | jjj}
|jd |jd | g}|tjj|
< t| ||j|j}t|d|	|	| d	d
}t|| td d  d7  < |S )z
    This optimization changes the semantics of padding from 'clone'
    style to 'view' style.

    Thanks to functionalization, this change can still maintain numerical
    correctness.
    c                  S  sL   t jj} | du r
dS t| j}t|dkr$|d jtjj	tj
j	fv r$dS dS )a  
        Conservatively check if padding can be fused with downstream op.
        1. if the downstream op is a sum, then there is little benefit to
           do inplace padding
        2. if the downstream op is a matmul, doing inplace padding can
           save membw.
        NTr-   r   F)rQ   r]   r^   r   r_   r   ra   r  mmr  addmm)r^   r_   re   re   rf   _padding_can_be_fused  s   
z6inplace_constant_pad_nd.<locals>._padding_can_be_fusedNr   r+   r-   r   r   F)r   re  rf  rP  inductorinplace_padding)r   r   r  rm   r0   rA   ro   r   r  r.   can_inplace_pad_graph_inputInputBufferr   freeze_layoutr~  r  r   rQ   r]   buffer_to_padded_sizerd  ra  ru  fill_r   )r   r4  rv  r8  r  r{  r  npadstride0rowsizebufnamepadded_size	resized_xsliced_xre   re   rf   inplace_constant_pad_nd  s\   

$


rG  c                 C  s,  t dd |D sdS |  }t|}tt|ddd |ddd }d}d}t|D ](\}\}	}
|	dkr9 dS |
dkrM|durD dS |d | }|
}q,|
dk rT dS q,|du r[dS |  }|durj|jdkrjdS t|}|||< |  }t	||}t
||||d}td	 d
  d7  < t| |g|S )a7  Decompose right-pad into cat([x, fill], dim) and delegate to cat lowering.

    The cat lowering already has heuristics for choosing between pointwise_cat
    (fusion) and ConcatKernel (memory planning / zero-copy).  By routing through
    cat() we reuse those heuristics rather than duplicating them here.
    c                 s  s    | ]}t |tV  qd S r   )rm   r   r   r  re   re   rf   r   O  s    z_pad_as_cat.<locals>.<genexpr>Nr+   r-   r   r   r4  r9  pad_rewritten_as_cat)r-  r   r   rk   r  rl   rp   r   r   r   rz  r   r  )r   r4  rv  r  rE  	pad_pairspad_dim
pad_amountrt   leftr  rw   	pad_shaper   fill_value_typed
pad_tensorre   re   rf   _pad_as_catE  s@   
"rQ  c              	     s  t |d dks
J tdd |D rt| S tjr$t| |}|r$|S t| |}|d ur0|S |  }tt	tt
|d d d |dd d  t |t   g  D ]\}}tjj||f qUt|d  }g t
 |d  D ]\\}}	}
|
 |t|
| |	  qyt |t |ksJ t|   fddfdd	}|  tj|  |  ||d
S )Nr+   r   c                 s  r,  r   Nre   rH  re   re   rf   r   }  r  z"constant_pad_nd.<locals>.<genexpr>r-   c                   s~   g }t  d  D ]\}\}}}|dkr|t|d |dkr+|t|| qttj|}t| fddS )Nr   c                     rS  r   re   re   )rE  r;  re   rf   rL    rU  z/constant_pad_nd.<locals>.mask.<locals>.<lambda>)	r  rq   range_mask_lowrange_mask_highr"  r  rP   r  r  )rE  r  r^  r  r  rI  )boundsrv  
mask_sizesr  r;  ru  rf   r    s   "zconstant_pad_nd.<locals>.maskc                   sZ   t | d  }t| d   D ]\}\}}|||  qt|t| ks)J |S r   )rk   r  rq   r   )rE  r  r^  r  _high)bounds_precompr  r  re   rf   	offset_fn  s
   z"constant_pad_nd.<locals>.offset_fnrg  )r   r-  r  r.   r:  rG  rQ  r   rk   r7  r  rq   rQ   r]   r   lookup_precomputed_sizer   r<  r   r   ri  r>   r   rp   )r   r4  rv  rr   r  lhr  r  r  r   rY  re   )rU  rX  rv  r  rV  r  r;  rf   r  z  s@   *

r  rt   
sympy.Exprsympy.Expr | intc                 C  s&   t t | tjt t|tjS r   )rP   r  r[  r~   r  r   rf  )rt   r  re   re   rf   rS    s   rS  c                 C  s    t t | tjt |tjS r   )rP   r  r[  r~   r  )rt   r  re   re   rf   rT    s   rT  c                 C  s   t t| |t| |S r   )rP   r  rS  rT  )rt   r  r  re   re   rf   
range_mask  s   r_  r  c                   sF       d   pdg   fdd}|S )Nr   c                   s|   | d   |  d   t tj fddtD }r1t| fddS t| 	fddS )Nc                   s.   g | ]}t |  | |  |  qS re   )r_  r  )r\  ih	padding_hre   rf   r     s   . z=constant_boundary_condition.<locals>.load.<locals>.<listcomp>c                     s   t  dg S )Nr   )constant_boundary_conditionre   )r   r`  pad_fill_valueprefixr   re   rf   rL    s    
z;constant_boundary_condition.<locals>.load.<locals>.<lambda>c                     s   g  S r   re   re   )r`  re  r;  re   rf   rL    r  )r"  r  rP   r  r  r  )rE  r  r   rv  r\  rd  r4  ra  r   r;  )r`  re  rf   rt    s   	z)constant_boundary_condition.<locals>.load)r   ri  )r   rv  r4  rd  r   rt  re   rf  rf   rc    s
   rc  dilationc          	      C  s   |d u rdgt | }t| d||   || || d   || d  || }|rwt| d||   || || d   d|| d   || }tjjt|d ||  |  ||  drd|d8 }tjjt||rud}||fS |}||fS )Nr-   r+   r   F)	r   r'   rQ   r]   r   r  r   rE  r  )	r   rt   kernel_sizer  r4  	ceil_moderh  x_outx_altre   re   rf   pooling_size  s4   .
"rm  c                C  s    t | |} ttj| }|dkS )N   )rK   r"  r  r  r  )ri  n_dimwindow_sizere   re   rf   %should_fallback_max_pool_with_indices	  s   
rq  assert_fallbackc                C  s   |dkr	dg| }|dkrdg| }|s|}t ||}t ||}t ||}t ||}t| ts1J t||ks9J t||ksAJ t||ksIJ t||ksQJ t|  |d |d fv saJ t||d}|d urq||ksqJ |||||fS )Nr   r-   r+   ro  )rK   rm   rA   r   r   rq  )r   ri  r  r4  rh  ro  rs  use_fallbackre   re   rf   max_pool_checks  s(   





 rv  c             
     sN  |    | jd   }| j d  t fddtD  \} | j}	|	tju r2dn|	jr9tdnt	|	j
}
t|t| }tsXt sXtdd D r`t| |
dn|  fdd	}tjd
| |  |	|	||d}tjd| |  tj|	||d}t|jjtr|  t|jjtr|  ||fS )Nc                   s&   g | ]}t | | d qS )rg  rm  r  )rj  dhwrh  ri  r4  r  re   rf   r   ;  s    z*_max_pool_with_offsets.<locals>.<listcomp>F-infc                 s  s    | ]}|d kV  qdS r5  re   r  re   re   rf   r   K  r  z)_max_pool_with_offsets.<locals>.<genexpr>rb  c                   sJ   | d   }|  d    fddt D }g ||S )Nc                   4   g | ]} | |  | |   |  qS re   re   r  bhrh  r4  reduction_idxr  re   rf   r   S      &z<_max_pool_with_offsets.<locals>.fn_inner.<locals>.<listcomp>r  )r^  r}  re  r`  )rh  ro  r4  r  r;  r|  r}  rf   fn_innerP  s   z(_max_pool_with_offsets.<locals>.fn_innerr  r  
input_noderw   	dst_dtyper  rf  rh  reduction_rangesargmax)r  r/  r  r  r   r~   r[   r  rB  r  r  rk   r.  rc  ri  r?   r   rp   r  rm   ro   r  )r   ri  r  r4  rh  rj  ro  r  dhw_outr   	min_valuer=  r  r  offsetsre   )rj  rx  rh  ri  ro  r4  r  r;  rf   _max_pool_with_offsets,  sV   
	
"	

r  c           
   
   C  s   t |}t| |||||dd\}}}}}tjdd t| ||||||d\}}	|t|	tjfW  d    S 1 s:w   Y  d S )NFrr  rn  unroll_reductions_thresholdrt  )r   rv  r.   r   r  r  r~   r  )
r   ri  r  r4  rh  rj  ro  r  r  r  re   re   rf   !_low_memory_max_pool_with_offsetsw  s,   	

	$r  r  ri  Sequence[int | torch.SymInt]
input_sizeincrements_to_indexlCallable[[Sequence[int | torch.SymInt], Sequence[int | torch.SymInt]], torch._inductor.virtualized.OpsValue]c                   sZ   t |  tttj fdd}tj	| 
 tj||  d}|S )Nc                   sJ   | }t |}t|} | |}t t| d  tjS r   )rP   r  r/   _flattened_index_to_ndr[  _flatten_indexr~   r  )r^  ra  offset_sympyr}  idhwr  r  ri  ro  offsets_loaderrp  re   rf   offsets_to_indices  s   
z4_pool_offsets_to_indices.<locals>.offsets_to_indicesrg  )r   ri  r   rL  r"  r  r  r  r>   r   rp   r~   r  r   )r  ri  r  r  r  r  re   r  rf   _pool_offsets_to_indices  s   		r  c                   s(   t | fdd}t| |||S )Nc                   s,   |  d    fddt D S )Nc                   rz  re   re   r  r{  re   rf   r     r~  zX_low_memory_max_pool_offsets_to_indices.<locals>.increments_to_index.<locals>.<listcomp>r  r^  r}  rh  ro  r4  r  r  rf   r    s   zD_low_memory_max_pool_offsets_to_indices.<locals>.increments_to_index)r   r  )r  ri  r  r  r4  rh  r  re   r  rf   '_low_memory_max_pool_offsets_to_indices  s
   r  c              	   C  s^   t | |||||d\}}}}}t| ||||||d\}}	t|	|| j| d  |||}
||
fS )Nrt  )rv  r  r  r/  )r   ri  r  r4  rh  rj  ro  r  rr   r  r  re   re   rf   _max_pool_with_indices  s   	
	r  c              	   C     t | |||||ddS Nr+   rt  r  r   ri  r  r4  rh  rj  re   re   rf   max_pool2d_with_indices     	r  c              	   C  r  Nr   rt  r  r  re   re   rf   max_pool3d_with_indices  r  r  c                   s`  dkrddg|dkrddg}st |tsJ tdks#J tdks+J tdks3J t|dks;J t| dv sEJ |   |  }t |trt |jjtr|jj}	|	 }
|
d usgJ t	j
d t	j|
|	 |	 d|	d}|  | }n| }|d ur|d dkp|d uo|d dk}tdd |D rt| ||||S | ^ }}
|  ^ }| |   t| }tfd	dtd d D tfd
dtd d D 		 }|dkrt| ||||S |  	
fdd}tj|  |  ||d}|r.t	j|S |S )Nr   r-   r+   r0  )rw   r   r   r  c                 s  s    | ]}|d kV  qdS r5  re   r  re   re   rf   r   :  r  z3max_pool2d_with_indices_backward.<locals>.<genexpr>c              	   3  @    | ]}t t|d  t d t| d   d   dV  qdS r   r-   Nr  r'   r   r\  ri  r  re   rf   r   G  
    .
c              	   3  @    | ]}t t|d  t dt| d   d   d V  qdS r-   r   Nr  r   wr  re   rf   r   K  r  rn  c                   sV  | ^ }}}t |
 | tj}|d  }|d  }t t|d  d  d tj}t t|d  d  d tj}t t|d d tj}t t|d d tj}t |t dtj}t |t dtj}t |t tj}t |t tj}d }	tD ]}
t	D ]}t 	|t |
tj}t 	|t |tj}g |t j
t |t |t dtjd ddt j
t |t |t dtjd dd}|} |}t ||}|	d u rt ||t dtj}	qt t t ||t |||}t |t 	|	||	}	qq|	d us)J |	S )Nr   r-   Fr  ro  r  )rP   r[  r~   r  r'   r  r\  r  r  r   r  r  rO  r   r  r  r  )r^  re  r\  r  
index_testphstartpwstartphendpwendgradientph_pw_phpw
grad_indexindex_actual	grad_partr   r  grad_loaderh_window_sizer  indices_sizeri  r4  pooled_heightpooled_widthr  w_window_sizewidthre   rf   rz   Z  sl     


#z,max_pool2d_with_indices_backward.<locals>.fnrg  )rm   rA   r   r   r  r:  ro   r>   rp   r0   r  r  r   decide_layoutrb  r.  )fallback_max_pool2d_with_indices_backwardri  rk   r  r  r   r  r
  )grad_outputr   ri  r  r4  rh  rj  r  	gO_stridero   rw   x_bufferx_strideis_channels_last_batch_heightr  r=  rp  rz   rr   re   r  rf    max_pool2d_with_indices_backward  s   	

 ;r  r  c                   s   |    fdd}|S )Nc              
     s   |\|\ |\}}t t t   tjt |tjt t  tjt |tj}t | fddS )Nc                     s   g    S r   re   re   )h_start_indexr`  iwre  w_start_indexr;  re   rf   rL    r  z3pad_adaptive_loader.<locals>.load.<locals>.<lambda>)rP   r  r  r[  r~   r  r  )re  
incrementsstart_indicesend_indicesh_end_indexw_end_indexr  pad_valr;  )r  r`  r  re  r  rf   rt    s$   z!pad_adaptive_loader.<locals>.loadrq  )r   r  rt  re   r  rf   pad_adaptive_loader  s   r  c           
      C  sL   t j| ||d}t j|||d}t j| ||d}t j|||d}	||||	fS )N)out_diminp_dim)r"  r3  )
rh  ri  h_inw_inh_outw_outr  r  r  r  re   re   rf    compute_indices_adaptive_pooling  s
   r  c                   sD   |\}}|\}}	t | |||||	\  fdd}
|
S )Nc                   s   | ^ }}}|} |}|}|}d }	t td td D ]\}
}|||
|g||g||g}|	d u r>|}	q&||	}	q&|	S r  )r  productr  )r^  r  re  r|  bwr  r  r  r  r  r`  r  r  h_end_index_fnh_start_index_fnkernel_maxes
pooling_fnw_end_index_fnw_start_index_fnre   rf   rz     s"   $z _adaptive_pooling_fn.<locals>.fnr  )rh  ri  r  in_sizes	out_sizesr  r  r  r  r  rz   re   r  rf   _adaptive_pooling_fn  s   r  c           
        sF   |\}|\}}t | ||||\  fdd}	|	S )Nc                   s   | ^ }}}|} |}|}|}d }	d }
t td td D ]>\}}||||g||g||g}t||  | | tj}|
d u rO|}
ntt||	||
}
|	d u ra|}	q(||	}	q(|
S r  )	r  r  r  rP   r[  r~   r  r   gt)r^  r  re  r|  r  r  r  r  r  maxvalmaxindexr`  r  r  rE  r  r  r  r  r  r  r  re   rf   rz     s0   $z)_adaptive_pooling_fn_with_idx.<locals>.fnr  )
rh  ri  r  r  r  r  r  r  r  rz   re   r  rf   _adaptive_pooling_fn_with_idx  s   #r  c                   s    tjkrtdttsJ t|dksJ    ^ }}}t	j
j|}t	j
j|}|\}}||krD||krDtS |dksL|dkr_g |||}t|   dS || dkrz|| dkrzt||t||g}t|S t|| d |}	t|| d |}
t|||g }  }|	|
 }|dkrt|S dd }d	d
 }t|||	|
g||g||gtjd tt fdd}tj |||d}|S )Nz0'adaptive_avg_pool2d' not implemented for 'Long'r+   r   r4  r-   rn  c                 S     t | | |S r   r'   rE  r  r  re   re   rf   rh  P     z)_adaptive_avg_pool2d.<locals>.start_indexc                 S     t | d | | d |S r  r  r  re   re   rf   ri  S  r  z'_adaptive_avg_pool2d.<locals>.end_indexrh  ri  r  r  r  r  c                   s   t  | t | S r   )rP   truedivr  r   fn_sumones_loaderr   re   rf   rz   a  s   z _adaptive_avg_pool2d.<locals>.fnrg  )r   r~   r  r  rm   rA   r   r  r   rQ   r]   r   r  r  r<  rp   r'   
avg_pool2drD   rk   fallback_adaptive_avg_pool2dr  rP   r   r  	ones_liker>   r   )r   r  r  r  r  r  r  o_sizeri  h_kernel_maxw_kernel_maxr=  r   rp  rh  ri  rz   rvre   r  rf   _adaptive_avg_pool2d*  sV   

	r  c                   s    tjkrtdttsJ t|dksJ    ^ }}}t	j
j|}t	j
j|}|\}}|dks@|dkr]g |||}t|   dt|tj dfS || dkrk|| dkrktt|| d |}t|| d |}	t|||g }
  }||	 }|dkrt|S dd }d	d
 }t||||	g||g||gtjdt||||	g||g||gtjd fdd} fdd}tj |||
d}tj tj||
d}||fS )Nz,adaptive_max_pool2d not implemented for Longr+   r   r4  r-   rn  c                 S  r  r   r  r  re   re   rf   rh    r  z(adaptive_max_pool2d.<locals>.start_indexc                 S  r  r  r  r  re   re   rf   ri    r  z&adaptive_max_pool2d.<locals>.end_indexr  c                       | t tdS Nry  r  rB  r   )inner_func_max_valr   re   rf   inner_fn_max_val  r  z-adaptive_max_pool2d.<locals>.inner_fn_max_valc                   r  r  r  r   )inner_func_max_idxr   re   rf   inner_fn_max_idx  r  z-adaptive_max_pool2d.<locals>.inner_fn_max_idxrg  )r   r~   r  r  rm   rA   r   r  r   rQ   r]   r   r  r<  rp   
ValueErrorrD   rk   fallback_adaptive_max_pool2dr  rP   r  r  r>   r   )r   r  r  r  r  r  r  r  r  r  r=  r   rp  rh  ri  r
  r  r  rire   )r  r	  r   rf   adaptive_max_pool2du  sp   
		r  c                   s<            fdd}|S )Nc           
        sX    }t|dkr@t| dkrd| d d   g}n.t| dkr4| d | d d   g}nddd   g}ng | d   }t| }t tj}td tj}tt|tj	t|tj	}t
t|dd|}t|| | t||  }t|tj}t||}	tt
|	||tS )Nr   r-   r   r+   )r   r   rP   r[  r   r~   r  r  r  float64r   rO  r)  r  r  r   rL  )
re  rt   samples_shapesamplei_exprdiffout_sz_exprrd  seq_ir  r   in_sz	kernel_szndimsout_szsamplessamples_loaderre   rf   rt    s&     z)_fractional_pooling_offsets.<locals>.loadrq  )r  r  r  r  r   r  rt  re   r  rf   _fractional_pooling_offsets  s   #r  c                 C     t | |||ddS r  _fractional_max_poolr   ri  r  random_samplesre   re   rf   fractional_max_pool2d  r*  r%  c                 C  r   r  r!  r#  re   re   rf   fractional_max_pool3d  r*  r&  c                   s\  |    | jd   | j d  }tjdd fddtD  |  fdd} fddt|t }|  }tj	d	| | 
 ||||d
}	tj	d| | 
 tj|||d
}
t|	tsrJ |	t|	jjtr}|	  t|
tsJ |
t|
jjtr|
  t|
| j}|	|fW  d    S 1 sw   Y  d S )Nrn  r  c              
     s    g | ]}t  |d qS ))r  r  r  r  r  r   )r  r  )inp_dhwri  ro  r  r$  re   rf   r     s    	z(_fractional_max_pool.<locals>.<listcomp>c                   s$   | d   }g | | |S r   re   r^  r}  re  )r  ro  r;  re   rf   r    s   z&_fractional_max_pool.<locals>.fn_innerc                   s8   | d   |  d    fddt D S )Nc                   s&   g | ]}|  | |  qS re   re   r  )bdhwdhw_index_fnre  r}  re   rf   r     s    zE_fractional_max_pool.<locals>.increments_to_index.<locals>.<listcomp>r  r  )r*  ro  )r)  re  r}  rf   r    s
   z1_fractional_max_pool.<locals>.increments_to_indexr  r  r  )r  r/  r.   r   r  ri  rk   r   r?   r   rp   r~   r  rm   rA   ro   r  r  )r   ri  r  r$  ro  r  r  r=  r   r  r  r  re   )r*  r  r'  ri  ro  r  r$  r;  rf   r"    sT   "	


$r"  c                   s       ^ }}}tjj|}tjj|}|^ }}}	|| dkr;||	 dkr;tt||t||	gddS t||}
t||	}dd fdd}t	||
|g||g||	gt
jd  fd	d
}tj  |t|d}|S )Nr   r-   )divisor_overridec                 S  s   t | | t|S r   )r&   r   rL  r  re   re   rf   rh  V  r  z0upsample_nearest2d_backward.<locals>.start_indexc                   s    | d ||S r  re   r  )rh  re   rf   ri  Y  r  z.upsample_nearest2d_backward.<locals>.end_indexr  c                   s    | t S r   )r  r   )r  r   re   rf   rz   e  r  z'upsample_nearest2d_backward.<locals>.fnrg  )r  r   rQ   r]   r   r  r  r'   rD   r  rP   r   r>   r   rp   r   rk   )r   r  r  r)  r*  r  inp_hinp_wout_hout_wr  r  ri  rz   r  re   )r  rh  r   rf   upsample_nearest2d_backwardA  s:   

	r0  re   c              
   C     t | ||||||ddS )Nr+   rb  _avg_poolndr   ri  r  r4  rj  count_include_padr+  re   re   rf   r  s     
r  c              
   C  r1  )Nr   rb  r2  r4  re   re   rf   
avg_pool3d  r6  r7  c                   s  ssdg t t t t| ts!J tks)J tks1J tks9J t|  d d fv sIJ |   |  d   }|   d  t fddtD  \}	}
tszt|
rt	| ddd}n| 
 d	}t|t|	 }|  }t| tjdd
}fdd}ttj}|dkrtdd tD rtd  }||  |S |dkrtjddnt }|  }|d usJ | tjd| |||||d}W d    n	1 sw   Y  t|jdrt|jjtr|  |r|r,|r$|n|}t||}nfdd}t j|  |||d}t||}t!||S )Nr   r-   r+   c              	     s"   g | ]}t | | qS re   rw  r  )rj  r\  ri  r4  r  re   rf   r     s    z_avg_poolnd.<locals>.<listcomp>r  rb  TF)r   r   c                   sL   | d   }|  d   | fddt D g |S )Nc                   s,   g | ]} | |  |  |  qS re   re   r  )r|  r`  r4  r  re   rf   r     s   , z1_avg_poolnd.<locals>.fn_inner.<locals>.<listcomp>r  r(  )r   r4  r  r;  )r|  r`  rf   r    s
   z_avg_poolnd.<locals>.fn_innerrn  c                 s  s*    | ]\}}t jjt||V  qd S r   )rQ   r]   r   statically_known_truer   Ne)r   r   r  re   re   rf   r     
    
z_avg_poolnd.<locals>.<genexpr>r  r  r  ro   c                   s   |  d  }g }t D ]<}|| |  |  }t||  | |  } s;t|d}t|| }t|| tj}|| qt	
tj|S r  )r  r   rI  rJ  rP   r[  r~   r  rq   r"  r  r  )r^  r|  divide_factorsrt   hstarthendfactor)r5  r   r\  ri  r4  r  re   rf   fn_count  s    z_avg_poolnd.<locals>.fn_countrg  )"rK   rm   rA   r   r   r  r  r  r.  rc  ri  rk   r   r   r   rP  r"  r  r  r  fallbacks_avg_poolndr.   r   
contextlibnullcontextrp   r?   r   r   ro   r  div_primr>   r  )r   ri  r  r4  rj  r5  r+  r   r  r  
ceil_modeshad_paddingr=  r   output_dtyper  rp  fallbackcontextrw   r  divisorr  r?  divide_factorre   )rj  r5  r   r\  ri  r4  r  r;  rf   r3    s   




 

r3  c                   s  d u sdksJ dssddgt | tsJ t |ts$J tdks,J tdks4J tdks<J t| dv sFJ |   | ^ }td|\}	}
td|\}}|  d pwd pw|
pw||  ^ }	
t| }| }t	fddt
d d D t	fddt
d d D  }|d	krt| ||S fd
d  	
fdd}tj|  |||d}|S )Nr   divisor must be not zeror+   r0  r-   c              	   3  r  r  r  r  r  re   rf   r   X  r  z&avg_pool2d_backward.<locals>.<genexpr>c              	   3  r  r  r  r  r  re   rf   r   \  r  rn  c              	     sX  t d tj}t d tj}t d tj}t d tj}t d tj}t d tj}t t | ||}t t |||}	t t ||t t  tj|}
t t |	|t t tj|}t 	|t dtj}t 	|	t dtj}	t |
t  tj}
t |t tj}t t |
|t ||	}|S )z{
        This computes the scaling factor that we will divide an element
        by when `count_include_pad=False`
        r   r-   )
rP   r\  r~   r  r  r  r  r   r[  r  )r  r  stride_hstride_wpad_hpad_wkernel_hkernel_wr<  wstartr=  wendrJ  )heightri  r4  r  r  re   rf   !compute_pool_size_without_paddingo  s,   

z>avg_pool2d_backward.<locals>.compute_pool_size_without_paddingc                   sR  | ^ }}}|d  }|d  }t t|d  
d  
d tj}t t|d  
d  
d tj}t t|
d d tj}t t|
d d tj}t |t dtj}t |t dtj}t |t tj}t |t 	tj}d }tD ]}	tD ]}
t 	|t |	tj}t 	|t |
tj}d ur}nssd d  }n ||}t 
g |t jt |t |t dtjddt jt |t |t dtj	dd|}t t ||t ||}|d u rt ||t dtj}qt |t 	|||}qq|d us'J |S )Nr   r-   Fr  r  )rP   r[  r'   r~   r  r  r\  r  r  r   r  r  r  r  r  r   r  )r^  re  r\  r  r  r  r  r  r  r  r  r  r  r  partr  )rU  r5  r+  r  r  rE  ri  r4  r  r  r  r  re   rf   rz     sv     
	


*zavg_pool2d_backward.<locals>.fnrg  )rm   rA   r   r   r  rm  ri  rk   r   r  r  fallback_avg_pool2d_backwardr>   r   rp   )r  r   ri  r  r4  rj  r5  r+  r  _h_out
ceil_mode1_w_out
ceil_mode2r=  r   rp  rz   r  re   )rU  r5  r+  r  r  rE  rT  ri  r4  r  r  r  r  r  rf   avg_pool2d_backward/  sb   "Ar\  c                   s  d u sdksJ ds	
sg d
t | tsJ t |ts$J t	dks,J tdks4J t
dks<J t| dv sFJ |   | ^ }td	
|\}	}
td	
|\}}td	
|\}}|  t
p|
p|p||  ^ }t| }|	 }	fdd	t
dD \  }|d
krt| |	
|S 	
fdd  	
fdd}tj|  |||d}|S )Nr   rK  )r   r   r   r   )r   r   r-   r+   c                 3  s6    | ] t  fd dt  d D V  qdS )c                 3  s<    | ]}t |   t d |       dV  qdS r  )r  r  )rt   ri  r  re   rf   r     s
    *
z0avg_pool3d_backward.<locals>.<genexpr>.<genexpr>r+   N)r  r  )r   r  )rt   rf   r     s    
z&avg_pool3d_backward.<locals>.<genexpr>}   c              	     s  dd D \}}}dd D \}}}dd D \}	}
}dd t | ||g|||g|||gD \}}}dd t |||g|	|
|g g|||gD \}}}dd |||fD \}}}dd t |||g gD \}}}ttt||t||t||}|S )	Nc                 s      | ]
}t |tjV  qd S r   rP   r\  r~   r  rx  re   re   rf   r     r  zQavg_pool3d_backward.<locals>.compute_pool_size_without_padding.<locals>.<genexpr>c                 s  r^  r   r_  rH  re   re   rf   r     r  c                 s  r^  r   r_  r  re   re   rf   r      r  c                 s  s*    | ]\}}}t t |||V  qd S r   )rP   r  r  )r   r  r  padre   re   rf   r   $  s
    
c              
   s  s>    | ]\}}}}t t ||t t |tj|V  qd S r   )rP   r  r   r[  r~   r  )r   re  r   r   r`  re   re   rf   r   *  s    

c                 s  &    | ]}t |t d tjV  qdS rR  rP   r  r\  r~   r  )r   re  re   re   rf   r   5  
    
c                 s  *    | ]\}}t |t |tjV  qd S r   rP   r  r[  r~   r  )r   rf  r   re   re   rf   r   9  r:  )r  rP   r  r  )pdr  r  stride_drL  rM  pad_drN  rO  kernel_drP  rQ  dstartr<  rR  dendr=  rS  rJ  )depthrT  ri  r4  r  r  re   rf   rU    s8   $z>avg_pool3d_backward.<locals>.compute_pool_size_without_paddingc                   sJ  | ^ }}}}dd t |||gD \}}}dd t |||gD \}}}dd t |||gD \}}	}
dd |||fD \}}}dd t ||	|
g	
gD \}}	}
d }tD ]}tD ]}tD ]}dd t |||g|||gD \}}}d ur}nssd d	  d
  }n |||}tg |tjt|t|td	tj		ddtjt|t|	td	tj	
ddtjt|t|
td	tj	dd|}t
t
t||t||	t||
}|d u rt||tdtj}qjt|t|||}qjqdq^|d us#J |S )Nc                 s  s    | ]	\}}|| V  qd S r   re   )r   r   r`  re   re   rf   r   D  r  z2avg_pool3d_backward.<locals>.fn.<locals>.<genexpr>c                 s  s2    | ]\}}}t t|| | |tjV  qd S r   rP   r[  r'   r~   r  )r   r   r   r  re   re   rf   r   F  s
    
c                 s  s,    | ]\}}t t||d  tjV  qdS r5  rm  )r   r   r  re   re   rf   r   K  s
    
c                 s  ra  rR  rb  )r   pstartre   re   rf   r   P  rc  c                 s  rd  r   re  )r   pend
pooled_dimre   re   rf   r   T  r:  c                 s  rd  r   )rP   r   r\  r~   r  )r   rn  p_re   re   rf   r   `  r:  r   r-   r+   Fr  r  )r  r  rP   r  r  r  r  r\  r~   r  r  r  r   r  r   )r^  re  r  r\  r  pdstartr  r  pdendr  r  r  pd_r  r  rf  r  r  r  rV  r  )rU  r5  d_window_sizer+  r  r  rE  ri  r4  pooled_depthr  r  r  r  re   rf   rz   B  s    	

8zavg_pool3d_backward.<locals>.fnrg  )rm   rA   r   r   r  rm  ri  r.  rk   r   r  fallback_avg_pool3d_backwardr>   r   rp   )r  r   ri  r  r4  rj  r5  r+  r  _d_outceil_mode_drX  ceil_mode_hrZ  ceil_mode_wr  r=  r   rp  rz   r  re   )rU  r5  ru  rl  r+  r  r  rE  rT  ri  r4  rv  r  r  r  r  r  rf   avg_pool3d_backward  sf   &%Wr|  c                 C  s   |   }t|tr|g}n|stt|}t|dkr*t|dv s(J d| g S t|}tt|D ]5}|| dk rL||  t|rHt|nd7  < d||   krZt|k sin t|dkrg|| dksiJ q4tt|t|ksxJ d|S )Nr   )re   r  r@  zinvalid axis: r-   zreduction axis not unique)r   rm   r   r  r   r   rk   r%   )r   r  r   rt   re   re   rf   _validate_reduction_axis  s    
 :r}  )r  c                  sl  |d ur	t | |} |  tt t| |}g }g g g ttD ]}||v r7| |  q$| ||  q$d|dv rtdkrt| rt	| j
tr]dn#t	| j
tjsst	| j
tjrt	| j
j
tjr|  }| p|   fdd}	rt}
D ]}tjj|
|< qn|}
|   t|  |p|  |  |	|
dS )NFr  argminr-   Tc           	        s   t |t ks
J rt  t ksJ  fddD  t  t ks)J d gt  t |  }tt t|D ]\}}|||< q@|}rwdd |D }|d }tdt |D ]}||  ||  }qa|t|tjfS |S )Nc                   r  re   re   r  ru  re   rf   r     r   z9_make_reduction_inner.<locals>.loader.<locals>.<listcomp>c                 S  rv  re   rw  r  re   re   rf   r     ry  r   r-   )	r   r  r  r  r  rP   r[  r~   r  )	rE  reduction_indexr  r^  varr  rindex
linear_idxrt   inner_loaderkeepdimskept_idxreduced_idxreduced_sizesshould_compute_logical_indexr   ru  rf   r    s$   
z%_make_reduction_inner.<locals>.loader)rw   r  r  rf  rh  r  )r  r   r%   r   r}  r  r   rq   r9   rm   ro   r=   r0   r   r   Bufferr`  is_transposedis_contiguousrk   r   r8  r9  ri  r$  rp   r   )r   r  r  r   rV  r  
kept_sizesrt   r{  r  r=  re   r  rf   _make_reduction_inner  sZ   



r  r  rR   c                   s   dd d fdd}|S )NFr   c                  sn   dv r|   tjkrt| rt| tj} t| ||| d}tjd| d|}t	|j
j
tr5|  |S )Nr~  )r  r  r   rV  r  )r  r  re   )r   r~   r[   r9   r  r  r  r?   r   rm   ro   r  )r   r  r  r   r   r  rV  r  re   rf   r    s(   zmake_reduction.<locals>.innerr   re   )r  rV  r  re   r  rf   make_reduction  s   r  c                C  sB   |d ur	t | |} t| |}t|  |  f|  f|  |dS )N)rw   dtypes	inner_fnsr   r  )r  rR  r$  rp   r   ri  r   )r   r  r   re   re   rf   _make_scan_inner$  s   

r  r   c                  s   |d ur	t | |} |   t| |}|  }|tjtjfv r$t | tj} t| ||}t	 fdd|D }t
j||  |  d}t|t| }t t|||S )Nc                 3  r  r   re   r  r   re   rf   r   =  r  zmean.<locals>.<genexpr>rD  )r  r   r}  r   r~   rS  rR  rB  sum_rN   r0   r7   rp   r6   r   rk   div)r   r  keepdimr   rF  
sum_resultdenomre   r  rf   r  2  s   

r  c           
        s   |d u rd}|    t| |}t| |dd}|r|  tt| |}t|||}t fdd|D }|r>t	|| d}t
j||  |  d}t|t|  }t||}	|s^|	fS |rb|nt||}|	|fS )Nr-   T)r  c                 3  r  r   re   r  r  re   rf   r   P  r  z var_mean_sum_.<locals>.<genexpr>r   rD  )r   r}  r  r  squarer  r  rN   r   rJ  r0   r7   r   rp   r6   r   rk   r  r  )
r   r  
correctionr  return_meanx_meandiffsr  r  x_varre   r  rf   var_mean_sum_C  s&   

r  c                 C  sz   t | |}t| ||d d d}|d }t|d }|  }|r#|jdks'tj}ntjj}t	|t
jo<t||ko<t|dkS )Nr  r  r   rV  rh  r  r   r-   )r}  r  rN   rp   r   r.   r  cppuse_two_step_variance_thresholdrm   r   rf  r   )r   r  r  r   rh  reduction_numelrw   	thresholdre   re   rf   use_two_step_variance]  s   



r  c                  s    d u rd t | ||d d d}|d}|d |d tjjd|fd|  d|\}}}	|  |  |  t| |}t	fdd	|D d
d  fdd}
t
|
|}|rj|  ||fS |fS )Nr-   r  rf  r  r  welford_reduce)r  r  r   c                 3  r  r   re   r  r  re   rf   r     r  z$var_mean_welford_.<locals>.<genexpr>c                 S  s4   t | tjr| jstt| tj|S t	| |S r   )
rm   r   r   	is_numberrP   r  r[  r~   r  r\  r  re   re   rf   get_constant_or_index_expr  s   z5var_mean_welford_.<locals>.get_constant_or_index_exprc                   s4    }}t d}| t |||  S r  )rP   r\  r  )ro   cNzero)r  r   r  rnumelre   rf   r    s   

z#var_mean_welford_.<locals>.scale_fnre   )r  r  r0   WelfordReductionr   r   r  r   r}  rN   r  )r   r  r  r  r  r   r  r  m2r  r  r  re   )r  r   r  r  r   rf   var_mean_welford_u  s6   




r  c                  s   |    t }t| |dd} t| ||||d}tjjs#t| ||dr*tdi |nt	di |}t
 fdd|D }|sB|d S |S )	NFr  )r   r  r  r  r  )r  r  c                 3  s    | ]
}t | d dV  qdS )Fr  Nr  r   r  re   rf   r     r  z#var_mean_helper_.<locals>.<genexpr>r   re   )r   r   r  r$  r.   mtiadisable_welford_reductionr  r  r  r   )r   r  r  r  r  r   r   r?  re   r  rf   var_mean_helper_  s$   
r  )r  r  c                C  r   )NFr  r  r  r  r  r   r  r  r  re   re   rf   var_     
r  c                C  r   )NTr  r  r  re   re   rf   var_mean  r  r  c                 C  st   |dk rt t| | |S |dkrtd|S |dkr| S t | |d |}t||}|d dkr8t|| }|S )Nr   r-   r+   )pow_recursiverP   r  r\  r  )r   r@  r   r  re   re   rf   r    s   r  c                 C     t | |S r   )rP   r  r  r>  re   re   rf   
pow_native  r<  r  )r   c                   sR  t tr rt tS t trdkrt S t tr*dkr*t S tdd  fD }t|}t toOd  k oGdk n  pO|oOdk}|rm 	  fdd	}t
j    |  d
S t  tr dkr{tdS  dkrt rtS |rt  trt S t trt S t S t S )Nr  r-   c                 s  s$    | ]}t |tjr| V  qd S r   )rm   r0   rA   r   r   re   re   rf   r     r6  zpow.<locals>.<genexpr>i    r   c                   s   t |   S r   )r  r   r   r  r>  r  re   rf   rz     r  zpow.<locals>.fnrg  r+   )rm   rB  r   r  r   sqrtr  rQ  r   ri  r>   r   rp   r   r   r   r  r   exp2fallback_pow_scalarfallback_pow_tensor_scalarfallback_pow_tensor_tensorr  )r  r>  r   is_integer_powembed_exponentrz   re   r  rf   r    s@   
"







r  c                 C  s   t | tr	| j}n| }t |tr|j}t |tjs>tj|  |  |	 | 
 d}t |ttfs3J |j}t |tjs>J t |tjr]| s]| s]t |jtjs]|  |j|_| S tjj|||d | S )Nrg  unsafe_alias)rm   rA   ro   r0   r   r>   r   rp   r   ri  r   r4   r:   is_input_bufferis_module_buffer	NopKernelr  r  realize_into)changedr  r  changed_datarc   re   re   rf   r,    s:   

r,  c                 C  s   t | t| |S r   )r,  r  )r   rv  re   re   rf   r?  8  r  r?  c                 C  @   | |u r| S t ||  }t||  }t||  }t| |S r   r  rp   r  r   r<  r   r,  )r  r-  r  re   re   rf   r  =     
r  c                 C  r  r   )rP   floordivr  re   re   rf   r  H  r<  r  c                 C  r  r   )rP   truncdivr  re   re   rf   r  M  r<  r  c                 C  r  r   )rP   div_rnr  re   re   rf   _div_rnR  r<  r  c                 C  s   t | ot |}t| ot|}|dkr(|rJ d|r!t| |S tt| |S |dkr@|r2J d|r9t| |S tt| |S t| |S )Nr'  z5floordiv operands can not be boolean at the same timer)  z5truncdiv operands can not be boolean at the same time)r   r   r  r'  r  r  r)  r  )r  r>  rounding_modeboth_integerboth_booleanre   re   rf   div_modeW  s   
r  c                 C  s8   t | ot |}|rt| |S ttjj}t|| |S r   )r   logical_andr<   r  r  r  r  )r  r>  	both_boolrz   re   re   rf   r  k  s
   
r  r  ir.Constant | Nonec              	   C  s   t | tjrt| jS t | tjrt|  S t | tjr| S t | tjs'dS t	j
j|  }t|' ttjdd | j|   }W d   n1 sPw   Y  W d   n1 s_w   Y  t |t	j
jjsnJ t |jtjrx|jS dS )z:Try convert an arbitrary IR node into an ir.Constant valueNallow_indexingT)rm   r0   r:   get_constant_valuero   r4   r|  r  Loopsr~   	_inductorops_handlerExtractConstantsHandlerrp   rQ   set_ops_handlerr   objectr  rf  inner_fn_argsvirtualizedOpsValuer  )r   rY  rr   re   re   rf   r  u  s*   
 r  c                 C  s   t dd | |fD }|rt| |S t| }d ur:|  jdkr:|jdkr0ttd|j}nd|j }t	| |S dd }t
|| |S )	Nc                 s  s     | ]}t |pt|V  qd S r   )r   r   r   re   re   rf   r     r  zdiv_prim.<locals>.<genexpr>r   r   infr  c                  W  r  r   )rP   r  r  re   re   rf   rz     r  zdiv_prim.<locals>.fn)r-  r  r  rp   r   r  mathcopysignrB  r  r  )r  r>  is_integralrI  r  rz   re   re   rf   rC    s   



rC  c                 C  s    t | |ftjd\} }t| |S rx  )rY  r   INT_TO_FLOATrC  r  re   re   rf   r    s   


r  c                 C  s4   t | pt| }|rdd }ndd }t|| |S )Nc                 S  r  r   )rP   modr  re   re   rf   rz     r`  zfmod.<locals>.fnc                 S  r  r   )rP   fmodr  re   re   rf   rz     r`  )r   r   r  )r  r>  r  rz   re   re   rf   r    s
   
r  c                C  B   t |  st|  r|d u rtj}td|d}|| |||dS )Nr  r  r   r   r   r   r~   r  r  r   r  r  r   rz   re   re   rf   r       

r  c                 C     t |  st|  r|d u rtj}t|  dkr.|dv s!J |p&|  }t| |ddS dd }t| ||d}t	j
jd
i |d|i\}|d u rRt| ||d	S |S )Nr   r   ro  Tr  c                 S     | \}|\}t ||fS r   )rP   r   a_tupleb_tupler  r>  re   re   rf   
combine_fn     zcumsum.<locals>.combine_fnr  r   r  r   r   re   )r   r   r   r~   r  r   r   r  r  r0   Scanr   fallback_cumsumr   r  r   r  r   r  re   re   rf   cumsum      

r  c                 C  r  )Nr   r  Tr  c                 S  r  r   )rP   r  r  re   re   rf   r     r  zcumprod.<locals>.combine_fnr  r  r  re   )r   r   r   r~   r  r   r   r  r  r0   r  r   fallback_cumprodr   re   re   rf   cumprod  r  r  c                 C  sv   dd }|   }t|  dkr|dv sJ t| S t| ||d}tjjdi |d|i\}|d u r9t| |dS |S )	Nc              	   S  s\   | \}|\}t ||}t ||}||kt | B }t |t t || | |fS r   )rP   r  r  r  r   log1pexp)r  r  r  r>  min_vmax_vr  re   re   rf   log_add_exp_helper  s   $z(logcumsumexp.<locals>.log_add_exp_helperr   r  r  r  rb  re   )	r   r   r   r  r  r0   r  r   fallback_logcumsumexp)r   r   r	  r   r   r  re   re   rf   logcumsumexp  s   r  c                      t |  dkr dv sJ t| t| tjdfS |  }tjd|dd}t	|  |d}|tjf|d< | 
  fd	d
f|d< tjjdi |d|i\}}|d u rXt|  dS ||fS )Nr   r  r   r  Fr   arg_break_ties_leftr  r  c                      t |   tjS r   rP   r[  r~   r  r   r  re   rf   rL  1  rM  zcummax.<locals>.<lambda>r  r  rb  re   )r   r   r  r  r~   r  r   r0   get_reduction_combine_fnr  ri  r  r   fallback_cummaxr   r  r   r  r   r  r  re   r  rf   cummax"      
r  c                   r  )Nr   r  r   r  Fr  r  r  c                   r  r   r  r   r  re   rf   rL  H  rM  zcummin.<locals>.<lambda>r  r  rb  re   )r   r   r  r  r~   r  r   r0   r  r  ri  r  r   fallback_cumminr  re   r  rf   cummin9  r  r  c                C  r  )Nr  r  r   r  r  re   re   rf   r  P  r  r  c                 C  s   t | tj} td| ||dS )Nr.  r  r  )r  r~   r[   r  r   r   r  re   re   rf   
reduce_any[  s   r  c                 C  2   |d urt | ||dt| ||dfS t | d |dS Nr  )reduce_amaxreduce_argmaxr  re   re   rf   
reduce_maxa  
   r   c                 C  r  r  )reduce_aminreduce_argminr  re   re   rf   
reduce_minl  r!  r$  xor_sumr  r  r  r  r  
logical_or)r  r  r  stabler   
descendingc             	   C  sL  |d u rd}|   }|  }tt||}t|dkr't| td|tj|fS t|r/|| nd}tj	j
r9tj}ntj}tjj|t|jsPt| |||dS t|dd||dd}dgt| }	t|ri||	|< t||	}t||}tjj|| j|jf|  | f||||d\}
}|
d u rt| |||dS |d usJ |
t|tjfS )NFr   r-   r'  rM  )rw   r  r  r   r  r(  r)  )r   rp   r   r   r  rw  r~   r  r.   tritondecompose_sort_opsr  int16rQ   r]   r   statically_known_ltr  r  sort_fallbackrK  r  r<  r0   Sortr   r   ri  r  )r   r(  r   r)  r/  rw   rF  	idx_dtyper  
view_shaper  re   re   rf   sort_stable  sD   



	r2  c                 C  s   t | d||dS )NFr'  )r2  )r   r   r)  re   re   rf   sort  r  r3  c                 C  sb   t jjst| S |  }ttj|t	
d}t| |g}t|dd\}}|d d }t|d|S )Nr-   r   rb  r+   )r.   r*  r+  median_fallbackr   r"  r  r  r  r   rf  r  r2  r6  )r  r   r  flatsorted_valsr  r   re   re   rf   median_default  s   r7  c                 C  s   t jjs
t| ||S |  }t|}|dkr$t| td|  t	j
|fS t||}t| d|d\}}|| }|d d }t|||}	t|||}
|rTt|	|}	t|
|}
|	|
fS )Nr   Tr(  r   r-   r+   )r.   r*  r+  median_dim_fallbackr   r   r  rw  rp   r~   r  r   r2  r6  r.  )r  r   r  r/  rE  r6  sorted_idxsr  r   r  r  re   re   rf   
median_dim  s    


r;  c                   s  t jjs
t| ||S |  }t|}|  }|dkr&t| td|t	j
|fS t||}t| d|d\}}|| }t|ddt	j
|dd}	tdg| }
||
|< t|	|
}	t|	|}	|	 fdd}tjt|t	j
||d	}t|||}| | |	 fd
d}tjt|t	j||d	}|  |	  fdd}tjt|t	j
||d	}t||\}}|	 | fdd}tjt|t	j
||d	}t||dd}t|||}t|||}|st||}t||}||fS )z9Lower aten.mode via sort-based decomposition or fallback.r   Tr8  r-   FrM  c              	     s,   t t  | t dtjt dtjS Nr-   r   )rP   r  r  r\  r~   r  r   )positions_loader0re   rf   prev_pos_fn  s   z!mode_default.<locals>.prev_pos_fnrg  c              
     s2   t t | | t  | t dtjS r  )rP   or_nerO  r\  r~   r  r   )positions_loadershifted_loadersorted_loaderre   rf   is_boundary_fn  s   z$mode_default.<locals>.is_boundary_fnc                   s    t  | | t dtjS r-  )rP   r   r\  r~   r  r   )is_boundary_loaderpositions_loader2re   rf   boundary_pos_fn.  s
   z%mode_default.<locals>.boundary_pos_fnc                   s&   t t |  | t dtjS r  )rP   r   r  r\  r~   r  r   )last_boundary_loaderpositions_loader3re   rf   
run_len_fnC  s   z mode_default.<locals>.run_len_fnr  )r.   r*  r+  mode_fallbackr   r   rp   r  rw  r~   r  r   r2  rK  r   rf  r  r<  ri  r>   r   rE   r  r[   r  r  r  )r  r   r  r/  rE  rw   r6  r:  r  	positionspos_view_shaper>  prev_positionsshifted_valsrD  is_boundaryrG  boundary_poslast_boundaryr  rJ  run_lenmax_pos	mode_vals	mode_idxsre   )rE  rH  rA  r=  rF  rI  rB  rC  rf   mode_default  s|   




rW  c                 C  s   t jjst| ||||S |  }t|}|dkr&t| td|  t	j
|fS t||}t| d||d\}}t||d|}	t||d|}
|	|
fS )Nr   Tr'  )r.   r*  r+  topk_fallbackr   r   r  rw  rp   r~   r  r   r2  ru  )r  r   r   largestr  r/  rE  r6  r:  r  r  re   re   rf   topk\  s   

rZ  c           
      C  s   t jjst| |||S |  }t|}|dkr%t| td|  t	j
|fS t||}t| d|d\}}t|||d }t|||d }	|rOt||}t|	|}	||	fS )Nr   Tr8  r-   )r.   r*  r+  kthvalue_fallbackr   r   r  rw  rp   r~   r  r   r2  r6  r.  )
r  r   r   r  r/  rE  r6  r:  r  r  re   re   rf   kthvaluem  s   


r\  c                 C  s   t | |tj|dS )Nr   r   r  )r  r   r  )r`   r   r  re   re   rf   register_pointwise_numeric  s   r^  torch._ops.OpOverloadPacketc                 C  s   t | j t| tjdS rx  )rM   r  r  r   r  r  re   re   rf    register_pointwise_numeric_ldf64  s
   
r`  )r  )r  c                     t | ||tjd |  | | |  } jo)tjj o)|duo)|j	dv  fdd}t
j|   ||  dS )az  
    Computes self + value * tensor1 * tensor2 using FMA for better precision.

    Matches eager CUDA kernel order: self + value * (tensor1 * tensor2)
    This is computed as: fma(value, tensor1 * tensor2, self)

    Note: FMA is only used for floating-point types on non-AMD GPUs. For integer types,
    we fall back to regular arithmetic since FMA doesn't support integers.

    For floating-point types, we use mul_rn (round-to-nearest multiplication)
    to force rounding of the product before the FMA. This prevents Triton's
    compiler from fusing the multiplication with the FMA, matching eager's
    rounding behavior.
    rZ  Nrn  xpuc                   s   | }| }| }dkrrt |||S r"t ||}nt ||}ttjr5t  }nt  }rDt |||S t 	|t ||S r  )
rP   r]  mul_rnr  rm   r   r   r[  r\  r   )r^  self_valt1_valt2_valt1_times_t2
value_exprr   r  	t1_loader	t2_loaderuse_fmar  re   rf   rf    s   zaddcmul.<locals>.inner_fnrg  )r   r   rP  ri  rp   r  r~   r  r  r   r>   r   r   r  tensor1tensor2r  rw   rf  re   rj  rf   addcmul  s0   rq  c                  ra  )a  
    Computes self + value * (tensor1 / tensor2) using FMA for better precision.

    Matches eager CUDA kernel order: self + value * (tensor1 / tensor2)
    This is computed as: fma(value, tensor1 / tensor2, self)

    For value=1: self + tensor1 / tensor2 (no FMA needed, just add the division)
    For value!=1: fma(value, div_rn(tensor1, tensor2), self)

    Note: FMA is only used for floating-point types on non-AMD GPUs. For integer types,
    we fall back to regular arithmetic since FMA doesn't support integers.

    We use div_rn (round-to-nearest division) to force proper rounding, preventing
    Triton from fusing operations in ways that change the rounding behavior.
    rZ  Nrb  c                   s   | }| }| }rt ||}nt ||}dkr%t ||S ttjr2t  }nt  }rAt 	|||S t |t 
||S r  )rP   r  r  r   rm   r   r   r[  r\  r]  r  )r^  re  rf  rg  	t1_div_t2ri  rj  re   rf   rf    s   zaddcdiv.<locals>.inner_fnrg  )r   r   r  ri  rp   r  r~   r  r  r   r>   r   r   rn  re   rj  rf   addcdiv  s0   rs  r  r  logical_not)r  )r   r   rV  identity)r  pointwise_overrides_datac                 #  s    t |  t|  jd }|d u rd S  fdd}t|tjjr6| D ]}t||}| j||fV  q#d S | j||fV  d S )Nc                   s    j d u r	t| S d S r   )r*  r  r  ri  re   rf   make_triton_fallback  s   
z6_get_pointwise_overrides.<locals>.make_triton_fallback)	rv  r   r   rm   r~   r   r   r   r   )nsr   r`   rw  olnamer  re   ri  rf   _get_pointwise_overrides  s   
rz  r]  c                   s,   | t |< t|   fdd}t| | d S )Nc                    sB    | i |}g }t | d |D ]\}}|t||dd q|S )Nr   Tr  )r  rq   r,  )ru   r   resultsmut_resultsr   r  outplace_opre   rf   rz     s
   z$register_foreach_inplace.<locals>.fn)rY   r  r   r'  )aten_opoutplace_aten_opr~  rz   re   r}  rf   register_foreach_inplace  s   
r  c                   s   t | d d fdd}|S )NrZ  c                    s.    | i |}t || d  }t| d |S r  )r  r   r,  )ru   r   r  r}  re   rf   rz     s   zregister_inplace.<locals>.fn)r4  )r  r~  rz   re   r}  rf   register_inplace  s   
r  c                 C  rp  r   re   r  re   re   rf   sym_constrain_range7  r	  r  c                 C  *   t jjjd }t|tjr|jjS t	|S Nr  
rQ   r]   r^   r  rm   r~   r$  rc   r%  r   r  r   r  re   re   rf   ro  <     ro  c                 C  r  r  r  r  re   re   rf   
sym_strideE  r  r  c                 C     |   S r   )r  r  re   re   rf   	sym_numelN  r  r  c                  G  s0   t | dkrt| d ttfr| d } tj|  S r<  )r   rm   rk   r   r   Addr  re   re   rf   sym_sumW  s   
r  c                 O  r  )NzHelpful for debuggingr   )r  ru   r   re   re   rf   foobar`  r  r  c                 C  s   |    t| S r   )r  r  r   re   re   rf   _realizee  s   r  c                 C  s   |    t| | | S r   )r  r0   ResizeStorageBytes)variabler=  re   re   rf   resize_storage_bytes_k  s   r  c                 C  s"   |    |   tt| |S r   )r  rA   r   r0   SetSourceTensorKernel)r  source_tensorre   re   rf   set__source_tensorr  s   r  c                 C  r  r   r  )r  r-  re   re   rf   
fsdp_copy_{  r  r  c                  s  t | tsJ t |ttfsJ |d u rtj}|tjkr#td| |tjkr0t	|dks0J |tj
kr=t	|dks=J |  |  }|  }t ritjjjrit|r[tdnt|rft|jndndtjjdr{t|||dS |  }|d uotd	d
 |D }|rt| g}nt | jt j!r| j" | _t#| gdg}|$  t j%&||}t '||||(  fdd}	t)j*|||	t|d}
|
S )Nzunsupported memory format: r   r   nanTr  r   r4  c                 s  s     | ]}t jj|d V  qdS rR  )rQ   r]   r   rU  rx  re   re   rf   r     s    
zresize.<locals>.<genexpr>r-   c                   sH   |  t  tj}t tj}t ||}t | fddS )Nc                     s
    gS r   re   re   )
flat_indexflat_loaderre   rf   rL    s   
 z*resize.<locals>.inner_fn.<locals>.<lambda>)rP   r[  r~   r  r  r  )r^  flat_index_exprlimitr  r  	old_numelout_indexeruninitialized_val)r  rf   rf    s
   zresize.<locals>.inner_fnrg  )+rm   rA   rk   r   r~   contiguous_formatpreserve_formatr  channels_lastr   channels_last_3dr  r   rT  r  r  deterministicfill_uninitialized_memoryr   rB  r   r  r  rQ   r]   r   rU  r  r:  r.  r  ro   r0   r4   r|  rd  ri  r   stride_ordered_for_memory_formatr  r  r>   r   )r   r   rC  r   rw   r  has_overlappingx_flat
out_striderf  rr   re   r  rf   resize  sT   



r  )auto_functionalizedc                 C  sB   ddl m} ||}tj| ||i ||d dd | D S )Nr   )kernel_side_table)
kernel_idxgridtma_descriptor_metadatakernel_argsc                 S  s    i | ]\}}t |tr||qS re   r   )r   r  r  re   re   rf   r	    r  z'triton_kernel_wrap_.<locals>.<dictcomp>)*torch._higher_order_ops.triton_kernel_wrapr  get_constant_argsr0   UserDefinedTritonKernelr  )r  constant_args_idxr  r  r   r  constant_argsre   re   rf   triton_kernel_wrap_  s   	

r  -list[ir.TensorBox | ir.ShapeAsConstantBuffer]c                 C  sj   t dd | g|D r$d}tjjjdd  }r | d| }|tj_tj	| |||}t
ttj	|S )Nc                 s  rk  r   rl  r   re   re   rf   r     rC  zcond.<locals>.<genexpr>z"control flow operator: torch.cond.r  r  )r.  rQ   r]   r^   r  r  r  r0   Conditionalr   rk   maprA   )predtrue_fnfalse_fnoperandsr   r  r  re   re   rf   r     s   c                 C  s   t js&tdd || D r&d}tjjjdd  }r"| d| }|tj_t	j
| ||||}t|ts7J ttt	j
j|S )Nc                 s  rk  r   rl  r   re   re   rf   r     s
    
zwhile_loop.<locals>.<genexpr>z(control flow operator: torch.while_loop.r  r  )r.   r  r.  rQ   r]   r^   r  r  r  r0   	WhileLoopr   rm   r   rk   r  _maybe_wrap_as_tensor_box)cond_fnbody_fncarried_inputsadditional_inputsstack_outputr   r  r  re   re   rf   
while_loop  s   
r  )r  subgraph_fnir.Subgraph
identifierc                 G  s$   t jj| g|R  }tttj|S r   )r0   InvokeSubgraphr   rk   r  rA   )r  r  r  r  re   re   rf   invoke_subgraph   s   r  r  torch.fx.GraphModulec              	   C  s   t }t| jjD ]X\}}|jdkr"|tjjvsJ || tjj|< q|jdkr;tj|\}}tj	j
tj|||}q|tjjvsCJ tjj}z|tj_tj|tjj|< W |tj_q|tj_w |t u ritd|S )a  Process nodes from a FX graph by executing them through V.graph.

    This is a common pattern for executing a subgraph's nodes:
    - Placeholder nodes are mapped to the provided args
    - Output nodes return their result
    - Other nodes are executed via V.graph.run_node

    rj  r?  zNo output node found in graph)_MISSINGrl   r]   nodesr`   rQ   envfetch_args_kwargs_from_envr~   r  Interpreterr?  r^   run_noder  )r  ru   r?  rt   rc   output_argsr   saved_current_nodere   re   rf   process_subgraph_nodes   s$   	

r  )control_depsc                 G  s   g }| D ]}t |tsq|  ||  qtjjj}d}t	|| t	|ks+J t	tjj
}t	|jjjddt	|ksAJ t|jt|}| sMJ tjj
|d D ]}	|D ]}
|	j}|dusdJ tjj| |
 qYqU|S )aS  
    Lower control_deps_op by ensuring dependencies are realized and tracking them.

    The control_deps_op HOP makes dependencies explicit in the graph. During lowering:
    1. Realize all additional dependencies to ensure they're computed
    2. Execute the target operation normally
    3. Track the dependencies for the scheduler
    r+   rj  r  N)rm   r8   r  rq   r  rQ   r]   r^   ru   r   
operationsr  
find_nodesr  rk   operation_nameadditional_buffer_depsr   )additional_depsr  ru   	dep_namesdeporiginal_args
arg_offsetoperation_lenr?  r`   dep_nameop_namere   re   rf   control_deps_op_lowering;   s(   

 	r  )schemec          
      G  s   d }t jjjdd }|d usJ t| jjjD ]Y\}}|jdkr*|| t jj	|< q|jdkrgt j
|\}}t|| D ]}	|	  |jrQt jj|	  t jj|	  q?tjjt j|||}qt j|t jj	|< q|S )Nquant_optionsrj  r?  )rQ   r]   r^   r  r  rl   r  r  r`   r  r  r  r  r  r  codegen_low_precisionlow_precision_codegen_opsr   r  invoke_quant_opsr~   r  r  r?  r  )
r  r  r  r?  r  rt   rc   ru   r   r   re   re   rf   invoke_quant_tracerj   s"   

r  r  r  tuple[torch.Tensor]c                   s   ddl m m} t|dkrtd fddt||D }|| |fdd}t|d dd d	}td
d |D |d< tdd |D |d< t	j
jd|dd|}|d d u r_td|S )Nr-   )InputDescriptorlower_pointwise_subgraphr   zSUnable to generate code for associative_scan op, because there are lifted argumentsc                   s    g | ]} |  | d qS )r4  )r   rp   r   )r  re   rf   r      s    z$associative_scan.<locals>.<listcomp>c                   s    g t | t |R  S r   )rY  rk  )lhsrhs)lowered_combine_fnre   rf   wrapped_combine_fn   s
   z,associative_scan.<locals>.wrapped_combine_fnr  c                 s  r  r   r  r   re   re   rf   r      r  z#associative_scan.<locals>.<genexpr>r  c                 s  r  r   rq  r   re   re   rf   r      r  r  F)r  can_fallback_to_atenz/Unable to generate code for associative_scan opre   )r  r  r  r   r  r  r  r  r   r0   r  r   )r  xsr  r  subgraph_inputsr  r   r  re   )r  r  rf   associative_scan   s,   


r  c                 C  rp  r   re   )tokensre   re   rf   _sink_tokens   r	  r  c                   C  rp  r   re   re   re   re   rf   _make_token   r	  r  c              
   O  sb  ddl m}m} ||}|du rP|tjjju rPddlm}m	} |
 }	|	rP|	jtjjj}
|
rPt|
|s7J |
|d }|rPt|dksJJ dtt|}ttjj}|tv rmt| |i |}ttdd | nd	d
 }t|tjj|g|R i |}ttjj|d dksJ d| d|rtjj|}tjj|d D ]}dd |_|r|  }tjj!| "|   q|tjj|< zdd }t|||f\}}||||}W n6 t#y } z)t$|}t%&d|| t|t't(fr| g|R W  Y d}~S | |fW  Y d}~S d}~ww t|j)dkr| |fS t|j)dkr+| |fS | g|R S )z
    We lower the operator directly, and then we add StarDep dependencies to all
    the newly created nodes in the graph.
    r   )_get_effect_get_schemaN)InvokeSubgraphCacheTracingContextr-   zMultiple effects NYIc                 S  r  r   )r  r  re   re   rf   rL     rU  zwith_effects.<locals>.<lambda>c                 S  rP  r   rQ  r   re   re   rf   rS     rT  z"with_effects.<locals>.wrap_tensorszCNo operation nodes were generated when lowering effectful operator .c                   S  s   dS r   re   re   re   re   rf   rL     rO  c              	   S  sp   t | tjr
|  S t | tr6z| j}t|dr$t|jdr'|j W S W | S W | S  tt	fy5   Y | S w | S )Nro   get_example)
rm   r0   TorchBindObject	get_valuerA   ro   r   r  AttributeErrorr   )r  r  re   re   rf   convert_ir_to_value   s$   
z)with_effects.<locals>.convert_ir_to_valuez5Failed to get schema for %s: %s. Assuming list output)*torch._higher_order_ops.effectsr  r  r~   rP   higher_orderr  torch._guardsr  r  try_gethop_dispatch_set_cache	get_cacherm   get_effectsr   rQ  r0  rQ   r]   r  rV   rY  r
  rA   rU  r0   rV  r   effectful_opsr  has_side_effectsr  additional_star_depsr   r  r   r  r  r   rk   returns)tokenr`   ru   r   r  r  effect_typer  r  tracing_ctxinvoke_subgraph_cacheeffectsr  r  rS  prev_effect_buffernew_opr  r  schema_argsschema_kwargsschemae	error_msgre   re   rf   with_effects   sr   



r  )register_comm_loweringsregister_symm_mem_loweringsc                 C  s   t | |dddd}|d }tjjt|}tjjdi |d|d\}}|dkrEtjj	|t
jrEtjd| d|d	|\}}||fS td
 t| |dd}	ttj t| |	}
t|
|dd}|	|fS )zn
    Lowering inductor_prims.prepare_softmax_online to compute max/sum in one pass if no split is needed.
    TNr  r  online_softmax_reduce)r  r  r-   r+   )r  
num_outputreduction_hintzTOnline softmax is disabled on the fly since Inductor decides to split the reduction.)r  re   )r  rQ   r]   r   simplifyrN   r0   r?   
num_splitsstatically_known_geqr.   r  r;   r   r  debugr  rV   r  r  r  r  )r   r   r   r  r  hint	num_split
max_tensor
sum_tensorr  r  xsumre   re   rf   prepare_softmax_online#!  s6   


r(  c                   C  s   t j ot j dkS )z.Check if we're on SM100+ hardware (Blackwell).)r   r   )r~   rn  is_availableget_device_capabilityre   re   re   rf   _is_sm100_or_laterQ!  s   r+  c                 C  s   t  std|  }|tjtjtjfvrtd| |tjkr't| tj} t	j
tjddtjddd}t|| }t|tjS )z
    Lowering for cvt_e8m0_rceil. Uses PTX cvt.rp.satfinite.ue8m0x2.f32 on SM100+.

    The PTX instruction takes 2 float32 and outputs 2 e8m0 packed in uint16.
    Currently we pass 0.0 as the second input and only use the low byte result.
    zFcvt_e8m0_rceil requires SM100+ (Blackwell) for PTX instruction supportzAcvt_e8m0_rceil requires float32, float16, or bfloat16 input, got z)cvt.rp.satfinite.ue8m0x2.f32 $0, 0.0, $1;z=h,rTr-   )asmconstraintsr   is_purepack)r+  r   r   r~   r  rS  rR  r  r  r"  r3  rP   inline_asm_elementwiseuint16r  r  )r   r   rz   r  re   re   rf   cvt_e8m0_rceil_loweringV!  s*   
r2  )r.  r/  c                   sh   t | }tdd |D dd |D  fdd}tjj|d  |t|d  dS )	Nc                 s  r  r   r  rm  re   re   rf   r   !  r  z/lower_inline_asm_elementwise.<locals>.<genexpr>c                 S  rp  re   rq  rm  re   re   rf   r   !  r   z0lower_inline_asm_elementwise.<locals>.<listcomp>c              	     sR   t  fddD }tj|d}tjtjfv r't|tj}|S )Nc                 3  s    | ]}| V  qd S r   re   )r   r  r   re   rf   r   !  r  zAlower_inline_asm_elementwise.<locals>.inner_fn.<locals>.<genexpr>)r,  r-  r   r.  r/  input_dtypes)r   rP   r0  r~   rS  rR  r  r  )r^  valsr  asm_strr-  r   r3  r.  r}  r/  r   rf   rf  !  s   z.lower_inline_asm_elementwise.<locals>.inner_fnr   rg  )r  r   r0   r>   r   rp   rk   r   )r6  r-  r   r.  r/  rU  rf  re   r5  rf   lower_inline_asm_elementwise|!  s   
r7  rW  )quantized_lowerings)mkldnn_lowerings)jagged_loweringsc              	   c  st    t | tjjsJ dt| }zt| t|  dV  W |r&|t| < dS t|  dS |r4|t| < w t|  w )z^
    A context manager to force fallback an op. Used in unit test
    for FallbackKernel.
    z+Only OpOverload to make the clean up easierN)	rm   r~   r   r   rV   r  r4  r  r  )r`   old_handlerre   re   rf   force_fallback!  s   

r<  r  )rh   ri   rZ   rj   )rz   r{   rZ   r|   )r   r   rZ   r   )r   r[   r   r   rZ   r   )rz   r   rZ   r   )rz   r   r   r   rZ   r   )r   r   rZ   r   )r   r   rZ   r   )r   r   rZ   r   )ru   r   r   r   r   r[   rZ   r   )r`   r   r   r   rZ   r[   )r   rA   rw   r   rZ   rA   )ru   r   r   r   r   r[   r   r   r   r[   rZ   r   )r   r  r  r{   rZ   r{   )
r  r{   r   r[   r   r   r   r[   r(  rU   )r   r   rZ   r2  rL  )NNNFFN)Frd  )FT)r   rA   r   r   r  r[   rv  r[   )r   rA   r   r   )r   rA   rw   r   )F)r   rA   r  rA   r   )r   rA   r  rA  rZ   rA   r  )r  rA   r  rA   r  rA   r  r   r  r   r  r   r   r   rZ   rA   )r  rA   r  rA   r  rA   r  r   r  r   r  r   r   r   r  r  rZ   rA   )r  rA   r  rB  r  r   r  r   r  r   r   r   rZ   rA   )r  rA   r  rB  r  r   r  r   r  r   r   r   r  r  rZ   rA   )r  rA   r  rA   r  rA   r  r   r  r   r   r   rZ   rA   )r  rA   r  rA   r  rA   r  r   r  r   r   r   r  r  rZ   rA   )r   r   r-   )ra  r   r  r   r  r   r@  )T)rx   r_  )rc   ri  )NTFN)rw   r   )
r   r  r  rA   r  r   ra  r   r  r   )
r  r   r  r   r   r  r  rA   ra  r   )r  rA   rZ   r  )r  rA   rZ   r  )r  rA   r  rA   r  r[   r  r[   r  r  r  r  rZ   rA   )r  rA   r  rA   r  r[   r  r[   )r-   )r   r   rE  r   )r   NNr-   )NNN)ro  FF)r   r   )ry  r  r   r   r  r  r  r[   )r   r   r  r  )r   r   r  r[   )r+   F)r  r  r  r   r  r[   )r  r"  )r)  r"  r*  r"  )r.  r"  r)  r"  r*  r"  )r   rA   r4  r5  rv  rB  rZ   r  )rt   r]  r  r^  )rt   r]  r  r]  )rt   r]  r  r]  r  r]  )Nr  N)
r  rA   ri  r  r  r  r  r  rZ   rA   )Nr   r-   F)r  )NNNN)re   r   FTN)r  rR   r   )r   r  rZ   r  )ro  F)ro  TT)r`   r_  )rZ   r  )r  r  r  r   )r  r  ru   r   )r  r  )r  r  r  r  r	  (B  
__future__r   rA  r  r"  r  loggingr  r  r}  rV  r\  collectionsr   collections.abcr   r   r   r   typingr   r	   r
   r   r   typing_extensionsr   unittest.mockr   r   r~   $torch.ao.quantization.fx._decomposedtorch.fxtorch.utils._pytreer  _pytreerY  torch._dynamo.utilsr   (torch._higher_order_ops.associative_scanr   r  r   "torch._library.fake_class_registryr   torch._library.opaque_objectr   torch._library.utilsr   torch._prims_commonr   r   r   r   r   r   r   r   r   r   r   torch.fx.experimental.sym_noder    r!   rQ  r"   r#   r$   torch.utils._ordered_setr%   torch.utils._sympy.functionsr&   r'   r(   r)   r*   _dynamo.utilsr,    r.   r/   r0   r1   decompositionr2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   r  rP   rQ   r  rR   rS   rT   FALLBACK_ALLOW_LIST	getLoggerr  r  rV   __annotations__rW   rX   r   r   r*  r  tr_c10dr  r   _higher_order_opsr  rb   r  rY   quantized_decomposedrg   ry   r   r   r   r   r   rd  r  r  r\  bmmconvolutionconvolution_backwardr  r  r  r6  r,  r-  _int_mmr  r  r,  r  r  rS  r  r  	complex32	complex64r[   rR  r   r   r   r   r   r   r   r   r  r'  r1  rP  r4  rA  rY  rj  r  r  r  r  r  r  r  r  r   r  r  
device_putr  r  r  r  r  r  r  r   r  aliasdetachdetach_liftview_ofr  r   r
  r  r  r  r  r"  r$  r'  r(  r  r)  r<  r2  r3  r@  _unsafe_viewreshaperB  slicerW  ru  r  r  quantize_per_channelr  r  r   r  _functional_assert_asyncr  dequantize_per_channelr  quantize_per_tensorr  dequantize_per_tensorr  rH  r  r  r  r(  r)  r.  r6  r:  r;  r?  rI  r.  rK  rR  rO  r  cacher^  rd  rh  rv  r  r  rngprimsr  r  r  	bernoullir  r  r  	lru_cacher  r  r  r  r  r  r  r  r  randint	rand_like
randn_likerandint_likestreamsrecord_event
wait_eventsynchronize_eventsynchronize_deviceforce_stride_orderr  r  r  r  r  lookup_seedr  r  randomr  rand_eager_offsetrand_eager_offsetsr  r  r  r  r  r  	NO_OPMATHr  r  r  r   r
  r  r   r   rA  _adaptive_avg_pool3dadaptive_max_pool3d*_scaled_dot_product_attention_math_for_mpsuniformexponential_pdist_forwardsoft_margin_loss_backward_fused_rms_normrc  _is_compiledembedding_dense_backwardr  native_layer_norm_cdist_forward_cdist_backward
_trilinearsegment_reduce_segment_reduce_backwardhistc	histogrambin_ct_histogramdd_bin_edges_histogramdd_from_bin_ctsaddbmm_addmm_activation_grouped_mm
_cudnn_rnn_cudnn_rnn_backward
miopen_rnnmiopen_rnn_backward_embedding_bag_embedding_bag_forward_only_embedding_bag_backward*_embedding_bag_per_sample_weights_backward_fused_moving_avg_obs_fq_helper*_fused_moving_avg_obs_fq_helper_functional max_pool3d_with_indices_backward_adaptive_avg_pool2d_backward_adaptive_avg_pool3d_backwardadaptive_max_pool2d_backwardadaptive_max_pool3d_backwardfractional_max_pool2d_backwardfractional_max_pool3d_backwardreplication_pad1d_backwardreplication_pad2d_backwardupsample_linear1d_backwardupsample_bicubic2d_backwardupsample_trilinear3d_backwardgrid_sampler_2d_backward_pdist_backward	nanmedianr  resize_
resize_as__linalg_detlinalg_householder_productlinalg_inv_exlinalg_ldl_factor_exlinalg_ldl_solve	linalg_lulinalg_lu_factor_exlinalg_lu_solvelinalg_matrix_exp	linalg_qr_linalg_slogdet_linalg_solve_exlinalg_solve_triangular_linalg_svd	lu_unpackormqr_linalg_check_errorslinalg_pinvatol_rtol_tensor_linalg_eightriangular_solvelinalg_cholesky_excholesky_inversecholesky_solvegeqrf_fft_r2cnonzerogcd_thnn_fused_lstm_cell_prims	rng_primsrun_and_save_rng_staterun_with_rng_stategraphsafe_run_with_rng_staterun_dtensor_rng_opmasked_scattermasked_scatter_backwardrr  angle_efficientzerotensor(_sparse_coo_tensor_with_dims_and_tensors	to_sparse
_to_sparser   r{  r6  r&  #_scaled_dot_product_flash_attention	quantized,_scaled_dot_product_flash_attention_backward#_scaled_dot_product_cudnn_attention,_scaled_dot_product_cudnn_attention_backward+_scaled_dot_product_flash_attention_for_cpu4_scaled_dot_product_flash_attention_for_cpu_backward0_scaled_dot_product_fused_attention_overrideable9_scaled_dot_product_fused_attention_overrideable_backward_flash_attention_forward_flash_attention_backward_efficient_attention_forward_efficient_attention_backwardindex_reducerepeat_interleave_weight_norm_interface_backwardr  r  rF  rs  rK  arange
start_steprN  rR  rZ  r[  scalar_tensorrh  
LongTensorrj  ro  rq  rr  rw  r  rz  r<  ry  r  r  r   
zeros_liker  r  r  r  r  r  r  r  r  r  r  r  rE  r  r  r  r  r  r  r  r  r  fallback__unsafe_masked_indexr  ,fallback__unsafe_masked_index_put_accumulaterP  r  r  r  r  r  r
  r  r  r!  r&  r(  r0  r1  r  r3  rG  rQ  r  rS  rT  r_  rc  rm  rq  rv  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r%  r&  r"  r0  r7  
avg_pool1dr@  r3  rW  r|  rw  r}  r  r  r  r  r  r  r  r  r  r  r  r  r  r  Tensor_Tensorr  Scalarr  Tensor_Scalarr  r,  r?  r  r  r  r  r  r  r  r  rC  true_divider  r  r  r  r  r  r  r  r
  r  r  r  r  r  r.  r  r  r   r  r$  r%  r  r  r  r"  r  r  r  r#  r   r3  r(  r.  r2  rZ  rX  r\  r[  medianr4  r   r9  r  rK  r7  r;  rW  r^  r`  rsqrtr  r  expm1relurM  r  r  r  rq  rs  _foreach_addcmul_foreach_addcmul_scalar_foreach_addcdiv_foreach_addcdiv_scalarcossinabsbitwise_andbitwise_left_shiftbitwise_not
bitwise_orbitwise_right_shiftbitwise_xorlgammaerfspecial_erfr  tantanhr  rt  r&  logical_xorr  r  	clamp_min	clamp_maxnegr  	remaindersignsignbit	_neg_viewler  r  r  rO  r@  coshsinhacosacoshasinasinhatan2atanatanhr  erfcerfinvhypotlog10log2	nextaftercodegen.commonr  rv  rz  r   r`   r   r  _foreach_addListforeach_add_listforeach_add_scalar_foreach_mulforeach_mul_listforeach_mul_scalar_foreach_sub_foreach_neg_foreach_abs_foreach_powScalarAndTensor_foreach_divforeach_div_listforeach_div_scalar_foreach_sqrt_foreach_rsqrt_foreach_maximum_foreach_minimum_foreach_clamp_min_foreach_clamp_max_foreach_reciprocal_foreach_sign_foreach_clone_foreach_copyforeach_copyr  _foreach_add__foreach_mul__foreach_div__foreach_copy__foreach_addcmul__foreach_addcdiv_r  add_bitwise_and_bitwise_left_shift_bitwise_not_bitwise_or_bitwise_right_shift_bitwise_xor_mul_div_Tensor_modelogical_and_logical_not_logical_or_logical_xor_sub_relu_sigmoid___and__
__lshift____or__
__rshift____xor____iand____ilshift____ior____irshift____ixor__r  ro  r   r  r  r  methodfuncr  r  _inductor_testr  r  r9  r  set_source_Tensorr  fsdpr  r  *torch._higher_order_ops.auto_functionalizer  r  r  r   r  while_loop_stack_outputr3  r  r  r  r  .torch._inductor.fx_passes.control_dependenciesr  r  invoke_quantr  r  r  r  r  comm_loweringr  r  r(  r+  cvt_e8m0_rceilr2  r0  r7  rX  r8  register_quantized_opsregister_woq_mm_opsr9  register_onednn_fusion_opsr:  register_jagged_opscontextmanagerr<  re   re   re   rf   <module>   s	  4H8

		

R7Bi8!
,
/














7
 )E2



2$$2
/ &8@
	
"*9&




		8a:			 &



#D
8



,
	


 $!#I%
v



"x/


_5; K"
 
)5


E


Q
,
D1 
	 &

	 HR
+


0
%
	 











.m	II





















@
$.
#f-%
#
