o
    i9:jtv                 -   @   s^=  d dl Z d dlmZmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlZd dlmZ d dlmZmZmZ d dlmZmZmZmZ d d	lmZ d d
lmZmZ d dlmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) d dl*m+Z+m,Z,m-Z-m.Z.m/Z/ d dl0m1Z1m2Z2 d dl3m4Z5 d dl6m7Z7m8Z8 d dl9m:Z; e	dZ<edZ=ej>j?Z?ej@AdddZBeCd\ZDZEZFdd ZGdd ZHdeee=e<f gee=e<f f fddZIde!fdd ZJd!d" ZKd#d$ ZLeIe?jMe?jNge/ dddejOd%d%fd&d'ZPeIe?jQjRe?jQjSge/ d(d) ZTeIe?jUjRe?jUjSge/ d*d+d,d-ZUeIe?jVe/ d.d/ ZVeIe?jWjRe?jWjSe?jXjRe?jXjSge/d0d1d2d3 ZYeIe?jZjRe?jZjSge/ d4d5 ZZd6d7 Z[dd8ed9e\e] d:e^fd;d<Z_eIe?j`jRe?j`jSge/ d=d> ZadZbd9e\e] fd?d@ZceIe?jdjRe?jdjSge/ dAdB ZeeIe?jfjgddCdDdEZheIe?jfjRejiddddFdGdHZjeIe?jkjRe?jkjSge/ ejiddddFdIdJZleIe?jkjme?jkjnge/ ejiddddFdKdLZoeIe?jpjRe?jpjSge/ dddddFdMdNZqeIe?jrjRdOdP ZseIe?jtjRdQdR ZudSdT ZveIe?jwjRddWdXZxeIe?jyjRddYdZZzeIe?j{jRe?j{jSge/ d8ed9e\e] d[e]d\e]fd]d^Z|eIe?j}jRdd_d`Z~dadb ZeIe?jjRdcdd ZeIe?j			ddeedfedgedhedB diedB djejdB fdkdlZeIe?j	ddmednedoedjejdB fdpdqZeIe?jdrdrddsdeedmednedoedjejdB f
dtduZeIe?j				%	 	r	*ddvejdwejdhedB dxedB djejdB dye^dze]d{e]d|e]fd}d~ZeIe?jjRddd8ed9e]dedejdede^defddZeIe?jjRddd8ed9e]dedejdede^defddZe/ eIe?jjRdd ZeIe?jjRdddd d%dddedededB d1edB dedB de]de^defddZeIe?jjRe?jjge/ dd ZeIe?jjdddZeIe?jjRe?jjge/ dd ZeIe?jjdddZeIe?jjRdd ZeIe?jjSdd ZeIe?jjRdd ZeIe?jjdd ZeIe?jjRdd ZeIe?jjRddddddddZeIe?jjRdddZeIe?jjRdddZeIe?jjRdddZeIe?jjRdd ZeIe?jjdd Zd8edefddZd8ededefddZ	ddedede^fddZddededefddZdedede^defddÄZ	ĐddededeedefddȄZdefdd˄ZeIe?jjRe?jjge/dd̓ddedede^fddфZeIe?jjRe?jjSge/ deedefddӄZeIe?jge/dd̓deefddՄZdedefdd؄ZeIe?je/ d8edede^defddۄZeIe?je/ dd8edede^defdd݄ZeIe?je/ dd8ede^defdd߄ZeIe?je/ dd8ede^defddZeIe?jjRddede^de^fddZeIe?jjRe?jjSge/ deededefddZeIe?jjRddede^fddZeIe?jjRe?jjSge/dddd%d%dd8ede^de^deeeef fddZeIe?jjRe?jjSge/ d%ddededede^def
ddZeIe?jjRe?jjSge/ddddddede^deeeef fddZeIe?jjRe?jjSge/ddddd%ddede^de^deeeef fddZeIe?jjRe?jjSge/ dd%ddededede^de^defd dZeIe?j҃e/ddd		ddedede^de^deeeef f
ddZӐdedee^e^f fddZeIe?jjRe?jjSge/d	d
ddededeeef fddZeIe?jjRe?jjge/dddddedeeeeef fddZeIe?jjR	%		ddede^de^dedB fddZڐdededee\e] e\e] f fddZېdedededB deeef fddZdeedede^fddZeIe?jރdd%ddddd dedede^de^dedB dedB dedB dedB deeeeef fd!d"ZeIe?jjRe?jjSgdd%dd#dedede^de^d$e^d%edB defd&d'ZeIe?je/d(d)dd*		%	%dd8edede^d+e^d$e^deeef fd,d-ZeIe?jjRd.d/ ZeIe?je/ 		%ddeededede^d+e^defd0d1Zd2d3 Zd4d5 ZeIe?je/ d6d7 ZeIe?je/ d8d9 Zd:d; ZeIe?je/d<d=d> ZeIe?je/d<d?d@ ZdAdB ZeIe?je/ dCdD ZeIe?je/ dEdF ZeIe?jjRdGdH ZeIe?jjRe?jje?jjRe?jjge/d<dIdJ ZdKdL ZeIe?je/ dMdN ZeIe?je/ dOdP Z eIe?jjRe?jje?jjRe?jjge/d<dQdR ZeIe?je/ dd8edTedefdUdVZeIe?je/ dWed8edTedXedef
dYdZZeIe?j	jRe?j	jSge/dd*drdrd[d\d]Z
eIe?jjRe?jjSge/ ddCd^d_ZeIe?jjddadbZeIe?jjddcddZeIe?jjRe?jjSge/ ddedfZeIe?jjR	%	%ddgdhZeIe?je/dd*ddjejdB fdidjZdkdl ZddndoZ	ddpejdfejdqe\e] e]B dre\e] e]B dse\e] e]B dte^due]dve\e] e]B dB fdwdxZdydz ZeIe?jjRdpejdfejdhejdB d{ejdB d|ejdB d}e^d~edefddZeIe?jjRdpejdfejdhejdqe\e] dre\e] dse\e] dte^dve\e] due]fddZejj r	ej@AdddZ!eIej>j"j#jRdd Z$eIej>j"j%jRdd Z&ejj'
r5ej@AdddZ(eIej>j)j*dd Z+ej@AdddZ,eIej>j-j.jReIej>j-j/jReIej>j-j/j0dd Z1eIej>j-j.j2eIej>j-j.j3dd Z4eIej>j-j5jReIej>j-j5j0dd Z6eIej>j-j5j2eIej>j-j5j3dd Z7eIej>j-j8jReIej>j-j9jRdd Z:ej@AdddZ;eIej>j<j=				%dddZ>eIej>j<j?dd Z@dd ZAeIe?jBdeejdede]dejdejf
ddZCeIe?jDjR			%		dddZEdd ZFeIe?jGjRdd ZHeIe?jIe/ 			%		dddZJeIe?jKe/d<dd ZLeIe?jMjRdd ZNeIe?jOjRdd ZPeIe?jQjRdd ZReIe?jSe/d<dd ZTdedefddZUeIe?jVe/d%d1dd ZWeIe?jXe/d<dd ZYeIe?jZe/d%d1dd Z[eIe?j\e/d<dd Z]eIe?j^jddÐdĄZ_eIe?j`jRe?j`jSge/ dŐdƄ ZaeIe?jbjRe?jbjSge/ d*dǜde]fdɐdʄZbeIej>j?jcjRej>j?jcjSge/ dːd̄ ZceIe?jdje?jejgd͐d΄ ZfeIe?jgjRgdϐdЄ ZheIe?jijRe?jijSge/dd*drdrd[dѐd҄ZjeIe?jkjgdӐdԄ ZleIe?jmjRe?jnjRgddd՜d֐dׄZoeIe?jpjRgddd՜dؐdلZqeIe?jrge/ dڐdۄ ZseIe?jtgdܐd݄ ZueIe?jvgdސd߄ ZweIe?jxgdd ZyeIe?jzgdd Z{eIe?j|gdd Z|de]de]de]fddZ}dd Z~eIe?jgdhedB fddZeIe?jgdd ZeIe?jgdd ZeIe?jjRdd ZeIe?je/ dd ZeIe?jjR	%	 	%		%	*dddZeIe?jjRdd ZdddZeIe?jjRe?jjSge/ dddddZeIe?jjRe?jjRgdd  ZeIe?jje?jje?jje?jje?jjRe?jjge/d0d1dddZeIe?jjRdd ZeIe?jjRdd ZeIe?jjRdd ZeIe?jje?jje?jje?jje?jjRe?jjRe?jjRgd	d
 ZeIe?jje?jje?jje?jjgdddZeIe?jje?jjgdddZeIe?jjRe?jjgdd Zdd ZeIe?jje?jjgdd ZeIe?jje?jjgdd ZeIe?jjRdd ZeIe?jje?jjgdd ZeIe?jje?jjgdd ZeIe?jjRdd ZeIe?jje/ ddefdd ZeIe?jge/ 	dd#d$ZeIe?jg	dd%d&ZeIe?jg	dd'd(ZeIe?jjRe?jjRgdd)d*ZeIe?jŐjd+d, ZeIe?jjRd-d. ZeIe?jɃd/d0 ZeIe?j˃e/ d1d2 ZeIe?j̓d3d4 ZeIe?jjRdd5d6ZАdd7d8ZeIe?jjRe?jjSge/dd*d9d: ZeIe?jje?jҐjge/dd*d;d< ZՐd=d> Z֐d?d@ ZאdAdB ZؐdCdD Z	%ddeedEe]dFe]dGe]dHe]dIe]dJe]dKe]dLe]dMe]dNe]dOe]dPe]dQe]dRe]dSe]dTe]dUe]dVe]dWe]dedXe^f,dYdZZڐd[d\ ZdeededEe]dFe]dGe]dHe]dIe]dJe]dKe]dLe]dMe]dNe]dRe]dSe]dTe]dUe]dVe]dWe]def&d]d^Zܐd_d` ZeIe?jރe/d<dadb ZeIe?jjR				%ddcddZeIe?jjRdedf ZeIe?je/d%d1				%ddgdhZeIe?je/d<didj ZdeedkefdldmZG dndo doeZdeedkedpe]fdqdrZeIe?jjRdsdt ZeIe?je/ dudv ZeIe?je/d<dwdxdy ZeIe?jjRgdzd{ ZeIe?jjR					dd|d}ZeIe?jjRe?jjSge/ ddddd%d~ddZeIe?jjRe?jjSge/ ddddd%d~ddZeIe?jjRdd ZeIe?jjRdddZdd Zdd9e]de]de^fddZdd Zdd ZeIe?jjRdddZdddZdddZ dd ZdddZdddZeIe?jjRdd ZeIe?jdd ZeIe?jj	e?jj
e?jje?jjge/ dddZeIe?jj	e?jj
e?jje?jjgdddZeIe?jjRg	U	%	%	ddedededede^de^dedB fddZeIe?jj<g	U	%	%	ddededededB dedB dedB dede^de^dedB fddZdedee]df fddZeIe?jg	U	%	%	ddededededB de^dede^de^dedB fddZeIe?jg		U	%	%	ddededededB dede^de^dedB fddZeIe?jdddededededede\e^ d%edededede]de]dede^dedededB f"dƐdǄZeIe?jg	ddedededed%edededede]de]dede^dedededB fdȐdɄZeIe?jg	U	%		ddedededede^dedB dedB fdːd̄ZeIe?jg		ddedededed%ededede^dedB dedB fd͐d΄ZeIe?j g		U	%			%ddededededB dede^dedB dedB de^deeef fdѐd҄Z!eIe?j"g	U	%	ddededededB de^de^dedB fdӐdԄZ#eIe?j$g	%	ddedededededB d%ededededede\e^ de^dedB fdՐdքZ%eIe?j&g	ddedededed%ededededededede]de]dede^dedB f dאd؄Z'eIe?j(jRg							ddededededB dedB de]de]dede^de^dedB de]dB de]dB dedB dedB dedB de]dB f"dߐdZ)eIe?j*jRg							dd%edededededB dedB de]de]dede^de^dedB de]dB de]dB dedB dedB dedB de]dB f$ddZ+eIe?j(j<g					ddededededB dedB de]de]dede^de^dedB dedB dedB dedB de]dB de]dB dedB dedB f$ddZ,eIe?j-g			ddedededed%edededede]de]dede^dedededB de]dB de]dB f"ddZ.eIe?j/g	%				ddedededhedB dedB dedB de]dB de]dB dede]de^dedB dedB dedB de]dB fddZ0eIe?j1g			%ddededededhedB dedB dedB dej2dej2dedededede]de^dedB de]dB de^f$ddZ3				%dd8ejdoejdejdejdhejdB dejdB djejdB de^fddZ4eIe?j5jRg				%dd8ejdoejdejdejdhejdB dejdB djejdB de^fddZ6					%dd8ejdoejde\ej de\e7 de\ej de\e7 dhejdB djejdB d e\e8 dB de\e8 dB de^fddZ7eIe?j8jRg				%dd8ejdoejde\ej de\e7 d e\e8 de\ej de\e7 de\e8 dhejdB dejdB de\e] dB de^fddZ9eIe?j:j;e?j:j<ge/ ddd	Z=eIe?j>j;dd
dZ?eIe?j@jRe?j@jSge/ dddCddZAdd ZBdd ZCeIe?jDjRe?jEjRgdddZDeIe?jFjRe?jGjRgdddZFeIe?jHjRe?jIjRg		ddedee]ej2B  dee]ej2B  dedB dedB f
ddZHeIe?jJjRe?jKjRgdddZJeIe?jLjRe?jLjMe?jLje?jLjNgdŐddZOd d! ZPeIe?jQjR		dd"d#ZReIe?jSjRd$d% ZSeIe?jTjRd&d' ZTeIe?jUjRd(d) ZUd*d+ ZVd,d- ZWeIe?jXjRe?jYjRgdd.d/ZZeIe?j[jRdƐd0d1Z[eIe?j\jRdǐd2d3Z]eIe?j^e/ 	dȐd4d5Z_eIe?j`jRe?j`jge/d0d1dd6d7ZaejbZcd8d9 ZdeIe?jejRd:d; ZeeIe?jfjRd<d= ZfeIe?jgjRd>d? ZheIe?jijRd@dA ZieIe?jjje?jjjkge/ d%d%dBdCdDZleIe?jjje?jjjmgd%d%dBd8e&dEedFe^dGe^fdHdIZneIe?joge/ dɐdKdLZpeIe?jqjRe?jrjRe?jsjRg		ddMdNZteIe?jujRe?jvjRg		ddOdPZweIe?jxjRdQdR ZxeIe?jyjRe?jyjSge/ ddSdTZyeIej>j?jzdUdV ZzeIej>j?j{dWdX Z{eIe?j|e/ d%d%dddYdZd[Z}d\d] Z~eIe?jd^d_ ZeIe?j	*dʐd`daZeIe?j	*dʐdbdcZeIe?j	*dʐdddeZeIe?je/ d%d%dfdgdhZeIe?je/ die]d8edefdjdkZeIe?jd8efdldmZeIe?je/dd*d8edefdndoZeIe?je/ d8edefdpdqZdrds Z					%dĐdteduedejdB dejdB dvedB dhedB dejdB djejdB de^fdwdxZeIe?je/ 			ddteduedvedB dhedB djejdB defdydzZeIe?jg					%dĐdtejduejdejdejdvejdB dhejdB dejdB djejdB de^fd{d|ZeIe?jjdːd}d~ZeIe?je/ ded9e]de^defddZeIe?je/ dddZeIe?je/ 	*	%	%ddfed1ede]de^de^defddZeIe?jjR	Udd0ede\e de\e] defddZdd ZeIe?jjR				dΐddZdd Zee?j ee?j ee?j ee?j ee?j ee?j ee?j ee?j ee?j ee?j ee?j ee?j ee?j ee?j ee?j ee?j ee?j ee?j ee?j ee?j ee?j dd ZeIe?je/ dd ZeIe?je/ drdddZeIe?je/ drdddZee?jZee?jZee?jZd dl0Zd dlZd dlZdd Ze  dS (      N)CallableSequence)Enum)wraps)TypeVar)	ParamSpec)SymBoolSymFloatTensor)_add_op_to_registry_convert_out_paramsglobal_decomposition_table
meta_table)
OpOverload)_prim_elementwise_meta$ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND)BoolLikecorresponding_complex_dtypecorresponding_real_dtypeelementwise_dtypesELEMENTWISE_TYPE_PROMOTION_KIND	FloatLikeIntLikemake_contiguous_strides_forNumber
NumberTypesuggest_memory_formatsym_min
TensorLike)_maybe_convert_to_dtype_maybe_resize_out_resize_output_check_safe_copy_outout_wrapper)_broadcast_shapes_maybe_broadcast)_config)ScalingTypeSwizzleType)_pytree_T_PatenIMPLMeta   c                 C   s   | | d | S N    abr2   r2   `/home/nk/hobo-godmode/plappi-mvp/.venv/lib/python3.10/site-packages/torch/_meta_registrations.pyceil_div9      r7   c                 C      | | d | | S )z$Rounds up x to nearest multiple of yr1   r2   xyr2   r2   r6   round_up=   s   r=   returnc                    s    fdd}|S )Nc                    s$   t    fdd}t|  S )Nc                    s   t t|   d S N)r   r   opfnr2   r6   registerF   r8   z0register_meta.<locals>.wrapper.<locals>.register)r   pytree	tree_map_)rC   rD   r@   rB   r6   wrapperC   s   zregister_meta.<locals>.wrapperr2   )rA   rG   r2   r@   r6   register_metaB   s   	rH   type_promotionc                    s>   t j|d| i\}  fdd|D }t| }t|dtjiS )Ntype_promotion_kindc                    s   g | ]}t | qS r2   )r   .0r;   result_dtyper2   r6   
<listcomp>X       z$elementwise_meta.<locals>.<listcomp>rI   )utilsr   r%   r   r   DEFAULT)rI   args_r2   rM   r6   elementwise_metaO   s   
rU   c                 C   s(   t jt jt jt jt jt ji}|| | S r?   )torch	complex32halfcfloatfloatcdoubledoubleget)dtypefrom_complexr2   r2   r6   toRealValueTypec   s
   r`   c                    s2   t tg|R   t k fdd d S )Nc                         d d  S )Nzoutput with shape z# doesn't match the broadcast shape r2   r2   broadcasted_shape
self_shaper2   r6   <lambda>p       z)check_inplace_broadcast.<locals>.<lambda>)tupler$   rV   _check)rd   
args_shaper2   rb   r6   check_inplace_broadcastl   s
   rj   Fc	           	         s*  t tjrt dkdd  t tjr$t dkdd  tdd fD rMtt  d u r> ntt	 fdd npRt t tj
sbtdt tt tfd	d t tstd
t tdkdd  tjf|d||dS )Nr   c                   S      dS Nz:linspace only supports 0-dimensional start and end tensorsr2   r2   r2   r2   r6   re          z(meta_linspace_logspace.<locals>.<lambda>c                   S   rk   rl   r2   r2   r2   r2   r6   re      rm   c                 s   s    | ]}t |tV  qd S r?   )
isinstancecomplex)rL   argr2   r2   r6   	<genexpr>   s    z)meta_linspace_logspace.<locals>.<genexpr>c                         d  d S )Nzlinspace(): inferred dtype z& can't be safely cast to passed dtype r2   r2   )default_complex_dtyper^   r2   r6   re      rf   zdtype must be torch.dtype, got c                      s*   dt j dt  j dt j dS )Nz4received an invalid combination of arguments - got (, ))type__name__r2   )endstartstepsr2   r6   re      s    zsteps must be IntLike, got c                   S   rk   )Nz$number of steps must be non-negativer2   r2   r2   r2   r6   re      rm   metar^   layoutdevice
pin_memoryrequires_grad)rn   rV   r
   rh   dimanyrQ   r   get_default_dtypeis_complex_dtyper^   AssertionErrorrv   _check_typer   empty)	ry   rx   rz   baser^   r~   r}   r   r   r2   )rs   r^   rx   ry   rz   r6   meta_linspace_logspacet   sL   


r   c                    sN   t  jt jk fdd t |  dko  dk dd  |  jS )Nc                         d j  S )Nz2take(): Expected a long tensor for index, but got r^   r2   indexr2   r6   re          zmeta_take.<locals>.<lambda>r   c                   S   rk   )Nz*take(): tried to take from an empty tensorr2   r2   r2   r2   r6   re      rm   )rV   rh   r^   long_check_indexnumel	new_emptyshape)selfr   r2   r   r6   	meta_take   s   

r   r   c                   sh   j }j }t||kdd  t dko dk fdd tjj}|S )Nc                   S   rk   )Nz=linalg.cross: inputs must have the same number of dimensions.r2   r2   r2   r2   r6   re      rm   zlinalg_cross.<locals>.<lambda>r/   c                      s"   d  d   d   S )Nzlinalg.cross: inputs dimension z must have length 3. Got  and sizer2   r   otherr   r2   r6   re      s
   )ndimrV   rh   r   r$   r   r   )r   r   r   x_dy_d	out_shaper2   r   r6   linalg_cross   s   
r   c                 C   s$   t | d t| d tj| tjdS )Nzlinalg.matrix_expmemory_format)squareCheckInputscheckFloatingOrComplexrV   
empty_likecontiguous_formatr   r2   r2   r6   linalg_matrix_exp   s   

r   valuesindicesc                 C   sV   t j| j| j| jd}t j| j| jt jd}|  dkr'| jdkr't|| j ||fS )Nr~   r^   r   )	rV   r   r   r~   r^   int64r   r   maybe_wrap_dim)r   r   r   r   r2   r2   r6   	cummaxmin   s
   r   c                 C   s   t || j tj| tjdS Nr   )r   r   rV   r   r   )r   r   r2   r2   r6   logcumsumexp   s   r   c                   s  |j }t|}|| }tt|}dd t|D  |D ]}	d |	< q fdd|D }
|
t| }t|
}| |d | }|jfdddd |||d   }||}dgt|j|d   }||}|	d	}||d	< t|}tt|D ]}|||  ||d
 < q|| j
|tjd dd t|D }d
}|d
 }|d	kr|| d	 ||| < ||||  9 }|d
8 }|d	kst||D ]}| d
||  ||| < q| |||   | S )Nc                 S      g | ]}d qS Fr2   rL   rT   r2   r2   r6   rO      rf   z_exec_fft.<locals>.<listcomp>Tc                    s   g | ]} | s|qS r2   r2   )rL   d)is_transformed_dimr2   r6   rO         c                        |  S r?   r2   r;   self_stridesr2   r6   re   	      z_exec_fft.<locals>.<lambda>keyreverser   r   r1   r   c                 S   r   r   r2   r   r2   r2   r6   rO     rf   )r   lenlistrangestridesortpermuter   reshaper   resize_rV   r   as_strided_storage_offset)outr   	out_sizesr   forwardr   signal_ndim
batch_dimsdim_permuter   left	batch_endtmpinputbatched_sizes
batch_sizebatched_out_sizesiout_stridesbatch_numelr2   )r   r   r6   	_exec_fft   sF   



r   r   r   exclude_lastc                    sJ   t |}|   t|t| }t|d |  fdddd|d |< |S )Nc                    r   r?   r2   r   r   r2   r6   re   ,  r   z_sort_dims.<locals>.<lambda>Tr   )r   r   r   intsorted)r   r   r   sorted_dimsrx   r2   r   r6   
_sort_dims'  s   r   c           
         s  t | jj |s|  S t| dkr t jj s | 	| 
 S | 
 }| 	|}t| dkr=t| |}t|| |||dS t|}| }	 |  |j fdddd ttt|}|t|| d  }	t||||	|d |d t||  }|s{|S || u r|}| 	|}n||}}qD)Ncpucudar   Tc                    r   r?   r2   r   stridesr2   r6   re   I  r   zmeta_fft_c2c.<locals>.<lambda>r   )rV   rh   r^   
is_complexclonedevice_hintbackendsmklis_availabler   r   r   r   r   r   r   mincufft_max_ndimr   )
r   r   normalizationr   r   outputr   working_tensormax_dims	last_dimsr2   r   r6   meta_fft_c2c3  s4   


r   c                 C   s8   t | tkst | dkr| d dkr| d dkrdS dS )N   r   r1   FT)r   r   r   r2   r2   r6   use_optimized_cufft_path]  s   0r   c                    s  t | jj t|  }t|}|d }|| d d }t|}|||< |r+|||< t| dks7t| dkr| j|t	| jd}	| }
t| dkrXt
|rXt|	|
||dd ngt|dkr`|n|}t|	|
||gdd t|dkr}| j|t	| jd}
|d d }|r|
|	}	}
|
  |j fd	d
dd ttt|}|t|| d  }t|	|
||dd |d t||  }|s|s|	||| kr|
j|t jd |
}	|	S t jj rt| |dd}| j|t	| jd}	t|	| ||ddS | j|t	| jdS )Nr   r   r1   r   xpur   Tr   c                    r   r?   r2   r   r   r2   r6   re     r   zmeta_fft_r2c.<locals>.<lambda>r   r   )r   )rV   rh   r^   is_floating_pointr   r   r   r   rQ   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r   r   r   onesidedinput_sizesr   last_dimlast_dim_halfsizeonesided_sizesr   r   target_sizesr   r   r   r2   r   r6   meta_fft_r2cd  sd   

r   )	generatorc                C   s   t |t| gS r?   )r    rV   Size)nr   r   r2   r2   r6   meta_randperm  s   r  r^   r}   r~   r   c                C      t j| ||||dS Nr  rV   r   )r  r^   r}   r~   r   r2   r2   r6   meta_randperm_default  s   	
r  c                   s2   dt  k fdd t j|||||dS )Nr   c                      ra   Nz:random_ expects 'from' to be less than 'to', but got from=z >= to=r2   r2   highlowr2   r6   re     rf   zmeta_randint.<locals>.<lambda>r  rV   rh   r   )r  r   r^   r}   r~   r   r2   r
  r6   meta_randint  s   
r  c                   s.   t  k fdd t j|||||dS )Nc                      ra   r	  r2   r2   r
  r2   r6   re     rf   z"meta_randint_low.<locals>.<lambda>r  r  )r  r  r   r^   r}   r~   r   r2   r
  r6   meta_randint_low  s   
r  c                C   r  r  r  )r   r^   r}   r~   r   r2   r2   r6   meta_rand_default  s   
r  c                    s   t   dko jd dk fdd t  jt jk fdd t dkfdd  jd d } g|dR S )	Nr1   r   r   c                      r   )Nz>_philox_key_split: key must have shape (*batch, 2), got shape r   r2   r   r2   r6   re     r   z'meta_philox_key_split.<locals>.<lambda>c                      r   )Nz3_philox_key_split: key must have dtype uint64, got r   r2   r  r2   r6   re     r   r   c                      
   d  S )Nz4_philox_key_split: num_splits must be positive, got r2   r2   )
num_splitsr2   r6   re        
 )rV   rh   r   r   r^   uint64r   )r   r  batch_sizesr2   )r   r  r6   meta_philox_key_split  s   



r  c                    sP   t   dko jd dk fdd t  jt jk fdd t  S )Nr1   r   r   c                      r   )Nz@_philox_key_fold_in: key must have shape (*batch, 2), got shape r  r2   r  r2   r6   re     r   z)meta_philox_key_fold_in.<locals>.<lambda>c                      r   )Nz5_philox_key_fold_in: key must have dtype uint64, got r   r2   r  r2   r6   re     r   )rV   rh   r   r   r^   r  r   )r   datar2   r  r6   meta_philox_key_fold_in   s   



r  c                    s   t jjfdd t  jt jk fdd t j jk fdd t   dko: jd dk fdd   dkr~t    d k fd	d  jd   t td
d t	jD fdd d S d S )Nc                           dj  S )Nz,: self must be a floating point tensor, got r   r2   )op_namer   r2   r6   re     rf   z1_check_philox_distribution_args.<locals>.<lambda>c                          d j  S )Nz": key must have dtype uint64, got r   r2   r   r  r2   r6   re     rf   c                      s    dj  d j  S )Nz/: self and key must be on the same device, got r   r~   r2   r   r  r   r2   r6   re     
   r1   r   r   c                      r  )Nz5: key must have shape (2,) or (*batch, 2), got shape r  r2   r  r2   r6   re     r8   c                      s    d j  dj  S )Nz?: batched key must have ndim == output ndim + 1, got key shape z with output shape r  r2   r   r2   r6   re   &  r!  c                 s   s$    | ]\}}|d kp||kV  qdS r1   Nr2   )rL   ksssr2   r2   r6   rq   -     " z2_check_philox_distribution_args.<locals>.<genexpr>c                      s    dt   dj S )Nz: key batch shape z( is not broadcastable with output shape r   r   r2   )	key_batchr  r   r2   r6   re   .  s   )
rV   rh   r^   r   r  r~   r   r   allzip)r  r   r   r2   )r   r'  r  r   r6   _check_philox_distribution_args  s6   

r*                ?c                 C      t d| | | S )N_philox_normal_r*  )r   r   meanstdr2   r2   r6   meta_philox_normal_5     r2  c                 C   r-  )N_philox_uniform_r/  )r   r   r  r  r2   r2   r6   meta_philox_uniform_;  r3  r5  r   lastdimc           
      C   sd  t | jj t| dkr[t|  }|||d < | j|t| jd}t	|r5t
|| jt jd||ddS t|dkrHt| |d d ddd}n| jt jd}t
||||d gddS t jj r| }t|dkr}|d d }t| ||dd}|dd  }t| }|||d < | j|t| jd}	t
|	|||ddS t|  }|||d < | j|t| jdS )	Nr   r   r   r   Fr   r1   r   )rV   rh   r^   r   r   r   r   r   r`   r   r   r   r   r   r   r   r   r   )
r   r   r   r6  r   r   tempr   c2c_dimsr   r2   r2   r6   meta_fft_c2rA  s<   	r9  c                 C   sV   ddl m} || st| dkrtdt|tr)|| |}tj	|| 
  | S )Nr   free_unbacked_symbolsr1   zQmore than one element of the written-to tensor refers to a single memory location)%torch.fx.experimental.symbolic_shapesr;  rV   _debug_has_internal_overlapRuntimeErrorrn   r
   to_refsexpandr   )r   srcnon_blockingr;  intermediater2   r2   r6   
meta_copy_p  s   
rE  c                 C   sX   t |  }t |  }||  krdn|| ||  }||d ||| ||fS r0   )r   r   r   r   insert)tensorr   result_sizesresult_strides
new_strider2   r2   r6   inferUnsqueezeGeometry  s    rK  c                 C   s0   t ||  d }t| |\}}| || | S r0   )r   r   rK  r   )r   r   g_sizes	g_stridesr2   r2   r6   meta_unsqueeze_  s   rN  r   weight_metabias_activation_opt	out_dtypec           	      C   s  t | j}|d ur#|d|dkr#td|d d|d |d| dd krAtd|d d| dd  |d|d< t| jdkrZtd	t| j d
d| df}|d ur{| jtjkrp|tjks{td| j d| | j	|||d u r| jn|d}|S )Nr   z%output size mismatch: weight.size(0)= != bias.size(0)=r1   r   r   zweight.size(1)=z != input.size(-1)/2=z0we can only handle the squashed input case, got D inputzKout_dtype is only supported for i8i8->i32 linear operator, got input.dtype=, out_dtype=r   )
r   r   r   r   r   r^   rV   int8int32new_empty_strided)	r   rO  rP  rQ  rR  rS  output_sizestransposed_stridesr   r2   r2   r6   meta_sparse_structured_linear  s6   
	r\  mat1	mat1_metamat2c                 C   s  t | jdkrtdt | j dt |jdkr$tdt |j dt |jdkr6tdt |j d| d|dd krTtd| d d	|dd  | d|dg}|d urx|jtjkrm|tjksxtd
|j d| |j||d u r|jn|d}|S )Nr   mat1 must be 2D, got Dmat1_meta must be 2D, got mat2 must be 2D, got r1   r   mat1.size(1)= != mat2.size(0)/2=Jout_dtype is only supported for i8i8->i32 linear operator, got mat2.dtype=rV  r   	r   r   r   r   r^   rV   rW  rX  r   )r]  r^  r_  rS  rZ  r   r2   r2   r6   meta_sparse_structured_mm  s*   rh  r1   )alphabetarS  c          	      C   sh  t | jdkrtdt | j dt |jdkr$tdt |j dt |jdkr6tdt |j dt |jdkrHtdt |j d| d	|d	krbtd
| d	 d|d	 |d|d	d krtd|d d|d	d  |d	|dg}|d ur|jtjkr|tjkstd|j d| |j||d u r|jn|d}|S )Nr1   zKonly input broadcasted to columns of mat1 * mat2 product is supported, got rU  r   r`  ra  rb  rc  r   zUonly input broadcasted to columns of mat1 * mat2 product is supported, input.size(0)=z != mat1.size(0)=rd  re  rf  rV  r   rg  )	r   r]  r^  r_  ri  rj  rS  rZ  r   r2   r2   r6   meta_sparse_structured_addmm  sB   rk  compressed_Adense_Bri  transpose_resultalg_idsplit_ksplit_k_modec	                 C   sF  |j tjtjtjtjtjhvrtd|j  | j |j kr)td| j  d|j  t|j	dkr;tdt|j	 d| j tjtjfv }	|	rN|
 rNtd|d}
| d	}|d urp||d	krptd
| d|d	 |d ur|	r|tjtjtjtjhv std| j  d|j  d| d|r|
|fn||
f}|j||dS )NzA_cslt_sparse_mm only supports fp16, bf16, int8, and fp8e4m3, got z%inputs must have the same dtype, got r   r   z-_cslt_sparse_mm only supports 2d inputs, got ra  z.dense input must be transposed for 8bit dtypesr1   r   zbias size mismatch: m=rT  zout_dtype is not supported for z x z -> z matmul!r   )r^   rV   float32float16bfloat16rW  float8_e4m3fnr   r   r   is_contiguousr   rX  r   )rl  rm  rQ  ri  rS  rn  ro  rp  rq  is_8bit_input_typer  moutput_shaper2   r2   r6   meta__cslt_sparse_mm  sX   


rz  T)include_selfr   sourcereducer{  c                C      t j| t jdS r   rV   r   r   r   r   r   r|  r}  r{  r2   r2   r6   meta_index_reduce[  s   
r  c                C      | S r?   r2   r  r2   r2   r6   meta_index_reduce_h  s   
r  c                 C   s.   t |  }|  dkr| ||< | |S Nr   )r   r   r   r   r   )r   r   r   result_sizer2   r2   r6   meta_index_selectv  s   
r  )lengthsr   offsetsaxisunsafeinitialr  r  r  r  r  c          
         sf   |d urt d fdd}|d ur||jS |d ur/|jd d |jd d f }	||	S td)Nz?segment_reduce(): indices based reduction is not supported yet.c                    s(   t j| j d d   jdt jdS )Nr1   r{   r^   r~   r   )rV   r   r   r^   r   )lengths_shaper  r  r2   r6   segment_reduce_lengths_tensor  s   z:meta_segment_reduce.<locals>.segment_reduce_lengths_tensorr   r1   z<segment_reduce(): Either lengths or offsets must be defined.)NotImplementedErrorr   r>  )
r  r}  r  r   r  r  r  r  r  r  r2   r  r6   meta_segment_reduce  s   
r  c                 C   
   |  dS Nr2   r   r   r2   r2   r6   meta_max     
r  c                 C   6   t | j|f}t| ||}| || j|tjdfS Nr   rQ   reduction_dimsr   _compute_reduction_shaper   rV   r   r   r   keepdimry  r2   r2   r6   meta_max_dim  
   r  c                 C   r  r  r  r   r2   r2   r6   meta_min  r  r  c                 C   r  r  r  r  r2   r2   r6   meta_min_dim  r  r  c                 C   s4   |   r
t| j}n	t| tjd\}}tj| |dS NrJ   r   )r   r   r^   r   r   INT_TO_FLOATrV   r   )r   rN   rT   r2   r2   r6   
meta_angle  s   
r  c                 C   s$   t ||  | j |t | S r?   )rV   _resize_output_r   r~   copy_angle)r   r   r2   r2   r6   meta_angle_out  s   r  c                 C      d S r?   r2   )valr2   r2   r6   assert_async     r  c                 C   r  r?   r2   )r  
assert_msgr2   r2   r6   assert_async_meta  r  r  c                 C   r  r?   r2   )sr2   r2   r6   
print_meta  r  r  r^   r}   r~   r   r   c                 C   s   t jdddS )Nr2   r{   r  r  r  r2   r2   r6   make_dep_token  s   	r  c                 C   s4   ddl m} t| ttfrtd|| ||d d S )Nr   )constrain_range'Constraining SymFloat or Symbool is nyir   max)r<  r  rn   r	   r   
ValueError)r   r   r  r  r2   r2   r6   sym_constrain_range  s   r  c                 C      t j| ||d |S Nr  )r,   r  r   r   r  	dep_tokenr2   r2   r6   functional_sym_constrain_range     r  c                 C   s   ddl m} |d u r|d u rt| dk d S t| ttfr"tdt| t	u r@|d ur3t| |k |d ur>t| |k d S || ||d d S )Nr   )_constrain_range_for_sizer  r  )
r<  r  rV   rh   rn   r	   r   r  rv   r   )r   r   r  r  r2   r2   r6   sym_constrain_range_for_size   s   r  c                 C   r  r  )r,   r  r  r2   r2   r6   'functional_sym_constrain_range_for_size  r  r  c                 C   s   |S r?   r2   )r  r  r  r2   r2   r6   functional_assert_async_meta  r  r  f_namec                    sL     dk rt  d   tddk fdd d S )Nr   z8: The input tensor must have at least 2 dimensions, got r   c                      s"     d d d d dS )Nz5: A must be batches of square matrices, but they are r   by r   	 matricesr   r2   r  r   r2   r6   re   (  s
    z#squareCheckInputs.<locals>.<lambda>)r   r   rV   rh   r   )r   r  r2   r  r6   r      s   r   Anamec                    s   t j jk fdd t j jk fdd t  d dk fdd t  ddk fdd d S )Nc                         dj  d j  dS )Nz:Expected b and A to be on the same device, but found b on z
 and A on 	 instead.r  r2   r  r   r2   r6   re   3  
   z(linearSolveCheckInputs.<locals>.<lambda>c                      r  )Nz=Expected b and A to have the same dtype, but found b of type z and A of type r  r   r2   r  r2   r6   re   ;  r  r   r  c                      s   d  d d  d dS )Nz3A must be batches of square matrices, but they are r  r  r   r  r   r2   r  r2   r6   re   C  s
   c                      s:   d d  d d  d d d d d 
S )NzIncompatible matrix sizes for z: each A matrix is r   r  z but each b matrix is r  r   r2   r  r  r   r2   r6   re   K  s   )rV   rh   r~   r^   r   )r   r  r  r2   r  r6   linearSolveCheckInputs0  s    


r  tallow_low_precision_dtypesc                    s^   | j  t|  p|   fdd |s-t tjtjtjtjfv  fdd d S d S )Nc                          d  S )Nz<: Expected a floating point or complex tensor as input. Got r2   r2   r^   r  r2   r6   re   \      z(checkFloatingOrComplex.<locals>.<lambda>c                      r  )Nz*: Low precision dtypes not supported. Got r2   r2   r  r2   r6   re   a  r  )	r^   rV   rh   r   r   rZ   r\   rY   r[   )r  r  r  r2   r  r6   r   T  s   r   arg_namec                    s"   t |  dk fdd d S )Nr   c                          d  dS )Nz: The input tensor z! must have at least 2 dimensions.r2   r2   r  r  r2   r6   re   i  rf   zcheckIsMatrix.<locals>.<lambda>)rV   rh   r   )r  r  r  r2   r  r6   checkIsMatrixf  s   
r  Br   c                    sZ   t   t tr ddkn	 ddk fdd d S )Nr  r   c                      sH    drdnd d  d d  d d d d d d	S )
Nz2: Incompatible shapes of A and B for the equation zAX = BzXA = Bz (r  r;   r   r   ru   r   r2   r  r  r  r   r2   r6   re   r  s   
z#checkInputsSolver.<locals>.<lambda>)r   r  rV   rh   r   )r  r  r   r  r2   r  r6   checkInputsSolverm  s   

*r  resultfn_nameresult_namec                    s&   t jjk fdd d S )Nc                	      s$     d d dj  dj  	S )Nz: Expected z5 and input tensors to be on the same device, but got z on z and input on r  r2   r  r   r  r  r2   r6   re     s   z!checkSameDevice.<locals>.<lambda>)rV   rh   r~   )r  r  r   r  r2   r  r6   checkSameDevicez  s   
r  UPLOc                    s8      }tt dko|dkp|dk fdd d S )Nr1   ULc                      r  )Nz1Expected UPLO argument to be 'L' or 'U', but got r2   r2   r  r2   r6   re     r  zcheckUplo.<locals>.<lambda>)upperrV   rh   r   )r  UPLO_uppercaser2   r  r6   	checkUplo  s
   
r  eigenvalueseigenvectorsr  	compute_vc                 C   sp   t | d t| t| j}|r | |}||t|dd n| dg}|  | j|t| j	d}||fS )Nzlinalg.eighF	row_majorr   r   )
r   r  r   r   r   r   r   popr`   r^   )r  r  r  r   vecsvalsr2   r2   r6   meta__linalg_eigh  s   


r  c                 C   s@   t | d t| jr| jnt| j}| j| jd d |dS )Nzlinalg.eigvalsr   r   )r   rQ   r   r^   r   r   r   )r   complex_dtyper2   r2   r6   meta__linalg_eigvals  s   


r  c                 C   s|   t | d t| jr| jnt| j}| j| jd d |d}| j| j|d}t| dk}|| jt	| j|d ||fS )Nz
linalg.eigr   r   r   r  )
r   rQ   r   r^   r   r   r   r   r   r   )r   r  r   vectorsis_cudar2   r2   r6   meta_linalg_eig  s   


r  rB  c                 C   s   | j jtjdddS )Nr   r  r   )mTr   rV   r   	transpose)rB  r2   r2   r6   cloneBatchedColumnMajor     r  r  c                 C      t | S r?   )r  )r   r  r  r2   r2   r6   _cholesky_solve_helper  s   r  c                    sP   t jdkfdd t  jdk fdd t d\}}t|||S )Nr   c                         d j  dS )Nz-b should have at least 2 dimensions, but has  dimensions insteadr   r2   r   r2   r6   re     r  z cholesky_solve.<locals>.<lambda>c                      r  )Nz-u should have at least 2 dimensions, but has r  r   r2   r  r2   r6   re     r  cholesky_solve)rV   rh   r   !_linalg_broadcast_batch_dims_namer  )r   r  r  self_broadcastedA_broadcastedr2   r  r6   r    s   

r  c                 C   s.   |   dkrtj| tjdS t| d t| S )Nr   r   cholesky)r   rV   r   legacy_contiguous_formatr   r  r   r  r2   r2   r6   r    s   
r  c                 C   s   t | d t| S )Ncholesky_inverse)r   r  r  r2   r2   r6   r    s   
r  check_errorsc                 C   sf   t | d t| d | j}t|}t|d}| |}||| | j|d|d  tjd}||fS )Nzlinalg.choleskyFr   r   r   )	r   r   r   r   r   r   r   rV   rX  )r  r  r	  A_shaper   	L_stridesr  infosr2   r2   r6   linalg_cholesky_ex  s   



r  tauc                    s  t jdkdd  t ddkdd  t ddkdd  t jj dkfd	d jdkr[jd d }jd d  t  |k fd
d t jjkfdd tdd t jjtjddjj	dS )Nr   c                   S   rk   )NzHtorch.linalg.householder_product: input must have at least 2 dimensions.r2   r2   r2   r2   r6   re     rm   z,linalg_householder_product.<locals>.<lambda>r  r   c                   S   rk   )Nzbtorch.linalg.householder_product: input.shape[-2] must be greater than or equal to input.shape[-1]r2   r2   r2   r2   r6   re     rm   c                   S   rk   )Nz`torch.linalg.householder_product: input.shape[-1] must be greater than or equal to tau.shape[-1]r2   r2   r2   r2   r6   re     rm   r1   c                         dj  d j  S )Nzptorch.linalg.householder_product: Expected tau to have one dimension less than input, but got tau.ndim equal to  and input.ndim is equal to r   r2   r   r  r2   r6   re     
   c                      r  )Nzltorch.linalg.householder_product: Expected batch dimensions of tau to be equal to input.shape[:-2], but got r2   r2   actual_batch_tau_shaper2   r6   re        c                      r  )Nz,torch.linalg.householder_product: tau dtype z does not match input dtype r   r2   r  r2   r6   re   &  s   
z torch.linalg.householder_productr  Fr  r   r   r^   r~   )
rV   rh   r   r   r   r^   r  empty_stridedr   r~   )r   r  expected_batch_tau_shaper2   )r  r   r  r6   linalg_householder_product   sD   


r  c                 C   s^   t | d t| ddd | | j}|| jt| jdd | j| jd d tjd}||fS )Nzlinalg.inv_exF)r  r  r  r   r   r   r   r   r   r   rV   rX  )r  r	  r  r  r2   r2   r6   linalg_inv_ex_meta6  s   
r  LDpivotsinfo)	hermitianr	  r  c                C   st   t | d t| d tj| jt| jdd| j| jd}| j| jd d tj	d}| j| jd d tj	d}|||fS )Nztorch.linalg.ldl_factor_exFr  r  r   r   r  )
r   r   rV   r  r   r   r^   r~   r   r   )r   r  r	  r  r  r  r2   r2   r6   linalg_ldl_factor_ex_metaB  s   


r   )r  c                   s   t d td t d t jdk fdd jd d }t|jkfdd ttj	fdd tj	 j	k fdd t
 \}}tj|t|d	d
 j	 jdS )Nztorch.linalg.ldl_solver   c                      r  )NzMtorch.linalg.ldl_solve: Expected B to have at least 2 dimensions, but it has r  r   r2   )r  r2   r6   re   e     z'linalg_ldl_solve_meta.<locals>.<lambda>r   c                      r  )Nzjtorch.linalg.ldl_solve: Expected LD.shape[:-1] and pivots.shape to be the same, but got pivots with shape  insteadr  r2   r  r2   r6   re   m  r!  c                      r   )Nz<torch.linalg.ldl_solve: Expected pivots to be integers. Got r   r2   r#  r2   r6   re   t  r   c                      r  )Nz!torch.linalg.ldl_solve: LD dtype z does not match b dtype r   r2   )r  r  r2   r6   re   x      Fr  r  )r   r   r  rV   rh   r   r   rQ   is_integer_dtyper^   _linalg_broadcast_batch_dimsr  r   r~   )r  r  r  r  expected_pivots_shapeB_broadcast_sizerT   r2   )r  r  r  r6   linalg_ldl_solve_metaW  s6   
	






r)  Pr  )pivotr+  c          	         s   t  jdk fdd t j}|d }|d }t||}||d< |r+ |}n dg}||d<  |}||d< ||d<  |}|||fS )Nr   c                      r  )Nz@linalg.lu: Expected tensor with 2 or more dimensions. Got size: r"  r  r2   r  r2   r6   re     r  z linalg_lu_meta.<locals>.<lambda>r  r   r   )rV   rh   r   r   r   r   r   )	r  r+  sizesrx  r  kr*  r  r  r2   r  r6   linalg_lu_meta  s$   





r.  LU)r+  r	  c          	         s   t  jdk fdd t j}|d }|d }t j|t|dd j jd}|	  t
|||d<  j|t jd	}|	   j|t jd	}|||fS )
Nr   c                      r  )NzFtorch.lu_factor: Expected tensor with 2 or more dimensions. Got size: r"  r  r2   r  r2   r6   re     r  z*linalg_lu_factor_ex_meta.<locals>.<lambda>r  r   Fr  r  r   )rV   rh   r   r   r   r  r   r^   r~   r  r   r   r   )	r  r+  r	  r,  rx  r  r/  r  r  r2   r  r6   linalg_lu_factor_ex_meta  s&   



r0  )r   adjointr1  c                   s   t d tj jk fdd tjtjkdd  td t |d tddkdd  tjd d jkfdd t	 \}}tj
|t|| d	 j jd
}| dkru|su| ru| }|S )Nztorch.linalg.lu_solvec                      r  )NzPlinalg.lu_solve: Expected LU and B to have the same dtype, but found LU of type  and B of type r"  r   r2   )r  r/  r2   r6   re     r  z&linalg_lu_solve_meta.<locals>.<lambda>c                   S   rk   )NzElinalg.lu_solve: pivots should be a Tensor of scalar type torch.int32r2   r2   r2   r2   r6   re     rm   zlinalg.lu_solver   c                   S   rk   )NzYlinalg.lu_solve: Number of pivots per batch should be same as the dimension of the matrixr2   r2   r2   r2   r6   re     rm   c                      r  )Nzclinalg.lu_solve: Expected LU.shape[:-1] and pivots.shape to be the same, but got pivots with shape r"  r  r2   r#  r2   r6   re     r!  r  r  r   )r   rV   rh   r^   r   r   r  r   r   r&  r  r   r~   r   r   conj)r/  r  r  r   r1  r(  rT   r  r2   )r  r/  r  r6   linalg_lu_solve_meta  s<   




r4  unpack_dataunpack_pivotsc                    s   t  jdk fdd |rt |jt jkdd  t j}|d }|d }t||}||d< |r9 |}n dg}|rX||d<  |}	||d< ||d<  |}
n dg}	 dg}
||	|
fS )Nr   c                      r  )NzFtorch.lu_unpack: Expected tensor with 2 or more dimensions. Got size: r"  r  r2   r/  r2   r6   re     r  z lu_unpack_meta.<locals>.<lambda>c                   S   rk   )Nztorch.lu_unpack: LU_pivots is expected to be a contiguous tensor of torch.int32 dtype.
Note: this function is intended to be used with the output produced by torch.linalg.lu_factorr2   r2   r2   r2   r6   re        r  r   r   )	rV   rh   r   r^   rX  r   r   r   r   )r/  r  r5  r6  r,  rx  r  r-  r*  r  r  r2   r7  r6   lu_unpack_meta  s4   





r9  modec                    sd    dkrd}d}||fS  dkrd}d}||fS  dkr$d}d}||fS t d fdd ||fS )NreducedTcompleteFrc                         d  dS )Nzqr received unrecognized mode 'z=' but expected one of 'reduced' (default), 'r', or 'complete'r2   r2   r:  r2   r6   re   5  s   z _parse_qr_mode.<locals>.<lambda>rV   rh   )r:  	compute_qr;  r2   r?  r6   _parse_qr_mode(  s"   	
rB  QRr;  c                 C   s   t | d t| d t|\}}| jd }| jd }t||}|r>t| j}|r*|n||d< | |}||t|dd n| dg}t| j}	|sM|sO|n||	d< | |	}
|
|	t|	dd ||
fS )Nz	linalg.qrr  r   Fr  r   )	r  r   rB  r   r   r   r   r   r   )r  r:  rA  reduced_moderx  r  r-  Q_shaperC  R_shaperD  r2   r2   r6   linalg_qr_meta=  s"   








rH  sign	logabsdetc                 C   s   t | d t| dd | j}| |d d }| j|d d t| jd}tj|t|d| j| j	d}| j|d d tj
d}||||fS )Nzlinalg.slogdetFr  r   r  r   )r   r   r   r   r`   r^   rV   r  r   r~   rX  )r  r   rI  rJ  r/  r  r2   r2   r6   _linalg_slogdetY  s   
rK  full_matrices
compute_uvdriverc                 C   s   t | d t| d t| jd d }| jd }| jd }t||}|r^|||r+|n|g }| |}	|	|t|dd ||rC|n||g }
| |
}t	| dk}||
t|
|d n| dg}	| dg}| j||g t
| jd}|	||fS )	Nz
linalg.svdr  r   Fr  r   r   r   )r  r   r   r   rV   r   r   r   r   r   r`   r^   )r  rL  rM  rN  r   rx  r  r-  U_shaper  V_shapeVr  Sr2   r2   r6   _linalg_svd_metam  s$   






rS  arg1arg2c                 C   sn   | j d d }|j d d }t||}t|}|| d| dg7 }t|}||d|dg7 }||fS )Nr  r   )r   r$   r   r   )rT  rU  arg1_batch_sizesarg2_batch_sizesexpand_batch_portionarg1_expand_sizearg2_expand_sizer2   r2   r6   r&    s   
r&  c                 C   sV   |rt | || t| |\}}|| jkr| n| |}||jkr"|n||}||fS r?   )r  r&  r   rA  )rT  rU  r  rY  rZ  arg1_broadcastedarg2_broadcastedr2   r2   r6   r    s   r  r   c                 C   s6   | j d d }|jdkp| jd |jko|j |k}|S )Nr   r1   )r   r   )r   r   expected_batched_rhs_shapevector_caser2   r2   r6   linalg_solve_is_vector_rhs  s
   
r_  )r   r	  r  r/  r  r  c                   sh  t  d t jjk fdd t }|r dn}	t |	|d t|	 \}
}t|p6| dd  |rC|
d d n|
}tj|t	|| jj
d} j}tj|t	|d j j
d} j|d d tjd} j|d d	 tjd}||||f}||||f}td
d |D rt||D ]\}}t||j ||j|  t||dd q|S )Nzlinalg.solvec                         d j  dj  dS )NzKlinalg.solve: Expected A and B to have the same dtype, but found A of type r2  r"  r   r2   r  r  r2   r6   re     r  z"_linalg_solve_ex.<locals>.<lambda>r   c                   S   rk   )Nzlinalg.solve: Vector broadcasting of the left hand side is not supported for left=False. In this case linalg.solve is equivalent to B / A.squeeze(-1)r2   r2   r2   r2   r6   re     r8  r  Fr   r  c                 s   s    | ]}|d uV  qd S r?   r2   rK   r2   r2   r6   rq         z#_linalg_solve_ex.<locals>.<genexpr>)	copy_fromcopy_toexact_dtype)r   rV   rh   r^   r_  	unsqueezer  r&  r  r   r~   r   r   rX  r(  r)  r    r   r   r"   )r  r  r   r	  r  r/  r  r  r^  B_B_broad_shaperT   result_shaperesult_r   LU_pivots_info_r   resr=  or2   ra  r6   _linalg_solve_ex  sJ   



rp  )r   unitriangularr   rq  r   c          	      C   s   |d u r
|  dg}t|tstdt| t| ||d t|| d \}}|dd o2|	 }|r=t
||j}|S t||jrS||ddj |dd |S )Nr   zout must be TensorLike, got zlinalg.solve_triangularr  r   )r   rn   r   r   rv   r  r  r  rv  is_conjr    r   r!   r   
transpose_)	r  r  r  r   rq  r   rg  A_avoid_copy_Ar2   r2   r6   linalg_solve_triangular_meta  s   

rv  XM)re  r  c           	         s   t jdkfdd t  jdk fdd t d  jt jkrOt \}}t j|t|ddj	j
d}t j|t|dd j	 j
d}||fS  jt jks[ jt jkrjt }d	g}||fS t dd
d  ||fS )Nr   c                      r  )NzMtorch.triangular_solve: Expected b to have at least 2 dimensions, but it has r  r   r2   r   r2   r6   re   !  r!  z'triangular_solve_meta.<locals>.<lambda>c                      r  )NzMtorch.triangular_solve: Expected A to have at least 2 dimensions, but it has r  r   r2   r  r2   r6   re   (  r!  triangular_solveFr  r  r   c                   S   rk   )Nz+triangular_solve: Got an unexpected layout.r2   r2   r2   r2   r6   re   B  rm   )rV   rh   r   r  r}   stridedr&  r  r   r^   r~   
sparse_csr
sparse_bsrr   r   )	r   r  r  r  rq  self_broadcast_sizeA_broadcast_sizesolutioncloned_coefficientr2   r  r6   triangular_solve_meta  s<   	




r  c                 C   sp   t | d t| d | | jd d }| | j}|| jt| jdd | j| jd d tjd}|||fS )Nz
linalg.detr  Fr  r   r   r  )r  detr/  r  r2   r2   r6   _linalg_det_metaG  s   


r  c                    s  t jdkdd  t jdkdd  |rdndt j jd kfdd t j jd kfdd t jd jd kd	d  t jj d
kfdd t jjkfdd jdkrjd d }jd d t |kfdd jd d  t  |k fdd t jjkfdd t jjkfdd tdd tdd t jjtjddjjdS )Nr   c                   S   rk   )Nz3torch.ormqr: input must have at least 2 dimensions.r2   r2   r2   r2   r6   re   _  rm   zormqr.<locals>.<lambda>c                   S   rk   )Nz3torch.ormqr: other must have at least 2 dimensions.r2   r2   r2   r2   r6   re   b  rm   r  r   c                      r>  )Ntorch.ormqr: other.shape[z0] must be greater than or equal to tau.shape[-1]r2   r2   left_size_conditionr2   r6   re   h  r   c                      r>  )Nr  z"] must be equal to input.shape[-2]r2   r2   r  r2   r6   re   l  r   c                   S   rk   )NzHtorch.ormqr: tau.shape[-1] must be less than or equal to input.shape[-1]r2   r2   r2   r2   r6   re   q  rm   r1   c                      r  )Nz[torch.ormqr: Expected tau to have one dimension less than input, but got tau.ndim equal to r  r   r2   r  r2   r6   re   v  r  c                      r  )Nzhtorch.ormqr: Expected other to have the same number of dimensions as input, but got other.ndim equal to r  r   r2   r   r   r2   r6   re   }  r  c                      r  )NzWtorch.ormqr: Expected batch dimensions of tau to be equal to input.shape[:-2], but got r2   r2   r  r2   r6   re     r  c                      r  )NzYtorch.ormqr: Expected batch dimensions of other to be equal to input.shape[:-2], but got r2   r2   )actual_batch_other_shaper2   r6   re     r  c                         d j  dj  S )NzPtorch.ormqr: Expected input and tau to have the same dtype, but input has dtype z and tau has dtype r   r2   r  r2   r6   re     r  c                      r  )NzRtorch.ormqr: Expected input and other to have the same dtype, but input has dtype z and other has dtype r   r2   r  r2   r6   re     r  ztorch.ormqrr  r   Fr  r  )	rV   rh   r   r   r^   r  r  r   r~   )r   r  r   r   r  expected_batch_shaper2   )r  r  r   r  r   r  r6   ormqrU  sn   	







r  c                   s   t td  k fdd j}| d k}|}| }|r3td|D ]}|o0|dk}q&nt|D ]}|oA|dk}q7t |pH| fdd d S )Nr   c                      s   dd   dt  S )Nzpadding size is expected to be r   z, but got: r   r2   )r   paddingr2   r6   re     r   z,_padding_check_valid_input.<locals>.<lambda>r1   r   c                      s    d d  d d  dj  S )N	Expected r1   zD or r   zcD (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: r  r2   )r   r   r2   r6   re     s   )rV   rh   r   r   r   r   )r   r  r   	input_dimis_batch_modevalid_batch_modevalid_non_batch_moder   r2   )r   r   r  r6   _padding_check_valid_input  s$   r  c                   s   d}d d}j dkrd} d7  |d7 }t|dd |\|}   |rHtk o>k  fdd tdkfdd j dkra|fS ||fS )	Nr   r1   r/   r   c                         d d d  dj  S NzcArgument #4: Padding size should be less than the corresponding input dimension, but got: padding (rt   ) at dimension 
 of input r  r2   dim_wr   pad_lpad_rr2   r6   re        z_pad1d_common.<locals>.<lambda>c                      rr   )Nz
input (W: z%) is too small. Calculated output W: r2   r2   )input_woutput_wr2   r6   re     rf   r   )r   r   r  rV   rh   r   )r   r  is_reflection	dim_planenbatchnplaner2   )r  r   r  r  r  r  r6   _pad1d_common  s0   




r  c                 C      t | |ddS NTr  )r  r   r  r2   r2   r6   meta_reflection_pad1d     r  c                    *   t  jt jk fdd t |ddS )Nc                         d j   dS )Nz)"replication_pad1d" not implemented for ''r^   __str__r2   r   r2   r6   re         z(meta_replication_pad1d.<locals>.<lambda>Fr  )rV   rh   r^   boolr  r  r2   r  r6   meta_replication_pad1d  
   

r  c                   s   d |st t|dkdd  jdkr d7  |\ }|  |r=t |k o3|k  fdd t  k fdd jS )Nr1   r   c                   S   rk   )Nz padding size is expected to be 2r2   r2   r2   r2   r6   re   	  rm   z(_pad1d_backward_common.<locals>.<lambda>r/   c                      r  r  r  r2   r  r2   r6   re     r  c                         d d   S Nz(grad_output width unexpected. Expected: , Got: r   r2   r  grad_outputr  r2   r6   re     rP   )rV   rh   r   r   r   r   r   )r  r   r  r  r  r2   )r  r  r   r  r  r  r6   _pad1d_backward_common  s$   

r  
grad_inputc                 C      t | ||ddS r  r  r  r   r  r2   r2   r6   meta_reflection_pad1d_backward$     r  c                 C   r  )NFr  r  r  r2   r2   r6   meta_replication_pad1d_backward*  r  r  c                   s2  dd d}d}t |dd j}|dkr'd}d7  d7  |d7 }|\	
|} 
   	 |rptk oS	k 	fdd t
k ofk  
fdd tdkpydkfd	d jd
kr|fS ||fS )Nr   r1   r   r      c                      r  r  r  r2   r  r2   r6   re   J  r  z_pad2d_common.<locals>.<lambda>c                         d d d  dj  S NzcArgument #6: Padding size should be less than the corresponding input dimension, but got: padding (rt   r  r  r  r2   dim_hr   pad_bpad_tr2   r6   re   Q  r  c                      s   d  d d d S )Nz
input (H:  W: z%) is too small. Calculated output H: r2   r2   )input_hr  output_hr  r2   r6   re   Y  s
   r/   r  r   r   rV   rh   r   )r   r  r  
dim_slicesr  r   r  r2   )r  r  r   r  r  r  r  r  r  r  r  r6   _pad2d_common0  sB   




r  c                 C   r  r  )r  r  r2   r2   r6   meta_reflection_pad2de  r  r  c                    r  )Nc                      r  )Nz)"replication_pad2d" not implemented for 'r  r  r2   r  r2   r6   re   p  r  z(meta_replication_pad2d.<locals>.<lambda>Fr  )rV   rh   r^   r  r  r  r2   r  r6   meta_replication_pad2dk  r  r  c                 C   s   t |}t |}||fS r?   rV   r   )grad_wsaved_vsaved_gsaved_normsr   grad_vgrad_gr2   r2   r6   meta_weight_norm_backwardu  s   

r  c                    s   dd d}|j }| dkrd7  d7  |d7 }|\}}}}|  }	| }
|	| | |
| | tkfdd t k fdd ||j S )Nr   r1   r   r  c                      r  r  r   r2   r  r2   r6   re     rP   z%meta_pad2d_backward.<locals>.<lambda>c                      r  Nz)grad_output height unexpected. Expected: r  r   r2   r  r  r  r2   r6   re     rP   )r   r   rV   rh   r   r   )r  r   r  r  rd   r  r  r  r  r  r  r2   )r  r  r  r  r  r6   meta_pad2d_backward~  s,   
r  c             	      s  ddd d}t |dd jdk}|r+d}d7 d7  d7  |d7 }|\
|}    
   	|rtk odk fdd tk ow
k 
fd	d tk ok  fd
d t	dkpdkpdk	fdd |r||	fS |	fS )Nr/   r   r1   r   r      c                      r  r  r  r2   r  r2   r6   re     r  z_pad3d_common.<locals>.<lambda>c                      r  r  r  r2   r  r2   r6   re     r  c                      r  )NzcArgument #8: Padding size should be less than the corresponding input dimension, but got: padding (rt   r  r  r  r2   )dim_dr   pad_bkpad_fr2   r6   re     r  c                      s(   d  d d d d d S )Nz
input (D:  H: r  z%) is too small. Calculated output D: r2   r2   )input_dr  r  output_dr  r  r2   r6   re     s   r  )r   r  r  r  
batch_moder  r  r2   )r  r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r6   _pad3d_common  sP   





r  c                 C   r  r  )r  r  r2   r2   r6   meta_reflection_pad3d  r  r  c                    r  )Nc                      r  )Nz)"replication_pad3d" not implemented for 'r  r  r2   r  r2   r6   re     r  z(meta_replication_pad3d.<locals>.<lambda>Fr  )rV   rh   r^   r  r  r  r2   r  r6   meta_replication_pad3d  r  r  c                    sH  t t|dkdd  |jdkrtd|j j|jkr+tdj d|j ddd	 |jd
krBd	7 d	7  d	7  |\}}}}}}| }	|}
|}|	| | |
| | || | t kfdd t kfdd t  k fdd ||jS )N   c                   S   rk   )Nz padding size is expected to be 6r2   r2   r2   r2   r6   re     rm   z%meta_pad3d_backward.<locals>.<lambda>r/   zinput.ndim must be > 3, got z,grad_output.ndim must equal input.ndim, got  != r   r1   r  c                      r  r  r   r2   r  r2   r6   re   	  rP   c                      r  r  r   r2   r  r2   r6   re   	  rP   c                      r  )Nz(grad_output depth unexpected. Expected: r  r   r2   )r  r  r  r2   r6   re   !	  rP   )rV   rh   r   r   r   r   r   r   )r  r   r  r  r  r  r  r  r  r  r  r  r2   )r  r  r  r  r  r  r  r6   meta_pad3d_backward  sD   





r  r   pc                 C   s^   t |  dd  | d}|dkr| dgjt jdS | ||d  d fjt jdS )Nc                   S   rk   )Nz(_pdist_forward requires contiguous inputr2   r2   r2   r2   r6   re   +	  rm   z%meta__pdist_forward.<locals>.<lambda>r   r1   r   r   )rV   rh   rv  r   r   r?  r  )r   r  r  r2   r2   r6   meta__pdist_forward'	  s   
r  gradpdistc                 C   s8   t | dd  t | dd  t j|t jdS )Nc                   S   rk   )Nz._pdist_backward requires self to be contiguousr2   r2   r2   r2   r6   re   :	  rm   z&meta__pdist_backward.<locals>.<lambda>c                   S   rk   )Nz/_pdist_backward requires pdist to be contiguousr2   r2   r2   r2   r6   re   =	  rm   r   )rV   rh   rv  r   r  )r  r   r  r  r2   r2   r6   meta__pdist_backward6	  s   r  )rj  ri  c                   s  ddl m}m}  d} d}d}	|t|j|||	fr-|||	ft 	 dkdd  t	 dkdd  t
jsatj j  koVjkn   fd	d  j}
j|
d |
d td ko|d kfd
d  S )Nr   )guard_or_truesym_eqr1   r   r/   c                   S   rk   Nzbatch1 must be a 3D tensorr2   r2   r2   r2   r6   re   L	  rm   zmeta_baddbmm.<locals>.<lambda>c                   S   rk   Nzbatch2 must be a 3D tensorr2   r2   r2   r2   r6   re   M	  rm   c                      s   dj  d j  dj  S )Nz+Input dtypes must be the same, got: input: z
, batch1: z
, batch2: r   r2   )batch1batch2r   r2   r6   re   Q	      c                	      &   d d d d  d d  d	S Nz@Expected size for first two dimensions of batch2 tensor to be: [rt   z] but got: [r   r1   ].r2   r2   batch2_sizesbscontraction_sizer2   r6   re   Y	  s   )r<  r  r  r   rV   sym_notr   rA  rh   r   
exp_config&skip_dtype_check_in_meta_registrationsr^   r   )r   r  r  rj  ri  r  r  dim1dim2dim3batch1_sizesr2   )r  r  r  r  r  r   r6   meta_baddbmmB	  s,   


r  c                C   r~  r   r  r   r   r2   r2   r6   meta_bernoullia	  s   r        ?c                 C   r  r?   r2   r   r  r   r2   r2   r6   meta_bernoulli_h	  r  r  c                 C   r~  r   r  r  r2   r2   r6   meta_bernoulli_pm	  r  r	  c                 C   
   t | S r?   r  r  r2   r2   r6   meta_poissons	  r  r  c                 C   s6   t |
|  k dd  t j| t jd}t | |fS )Nc                   S   rk   )NzJError in fused_moving_avg_obs_fake_quant_cpu: ch_axis must be < self.dim()r2   r2   r2   r2   r6   re   	  rm   z6meta__fused_moving_avg_obs_fq_helper.<locals>.<lambda>r   )rV   rh   r   r   r  )r   observer_onfake_quant_onrunning_minrunning_maxscale
zero_pointaveraging_const	quant_min	quant_maxch_axisper_row_fake_quantsymmetric_quantmaskr2   r2   r6   $meta__fused_moving_avg_obs_fq_helpery	  s   
r  c                    s   t |  dkdd  t | dkdd  | j\ |j\t  k fdd |d urNt || jkpI|t jkoI| jt jt jfv dd  |d u rU| jn|}| jf|dS )Nr   c                   S   rk   )Nza must be 2Dr2   r2   r2   r2   r6   re   	  rm   zmeta_mm.<locals>.<lambda>c                   S   rk   )Nzb must be 2Dr2   r2   r2   r2   r6   re   	  rm   c                	      s   d d  d d d	S )Nz/a and b must have same reduction dim, but got [rt   z] X [r  r2   r2   M1M2Nr*  r2   r6   re   	  s    c                   S   rk   )NzFout_dtype must be the same as input dtype or fp32 for fp16/bf16 inputsr2   r2   r2   r2   r6   re   	  rm   r   )	rV   rh   r   r   r^   rr  rs  rt  r   )r4   r5   rS  rN   r2   r  r6   meta_mm	  s"   



r  c                    s0   |rt  fddtjD S tj S )Nc                 3   s&    | ]}| vrj | nd V  qdS r"  r  rL   r   dimsr   r2   r6   rq   	  s   $ z+_compute_reduction_shape.<locals>.<genexpr>)rg   r   r   rQ   compute_reduction_output_shaper   )r   r!  r  r2   r   r6   r  	  s   r  strc                 C   sD   t | tjjr| jjS t| dr t| jdr | jjdkr | jjS dS )Nr~   rv   r{   r   )rn   rV   _subclasses
FakeTensorfake_devicerv   hasattrr~   )rG  r2   r2   r6   r   	  s   
r   input_tensorr   r  dilationis_transposedgroupsoutput_paddingc                    s  dt dt dt dt dt dt fdd}dt dt dt dt dt d	t dt fd
d}	|jdd  }
| jdd   |r<||jd  }n|jd }t|jd | | jd kdd  | jd |gt|trg|gt  }nt|dkrv|d gt  }t|tr|gt  }nt|dkr|d gt  }t|tr|gt  }nt|dkr|d gt  }d }|rt|tr|gt  }nt|dkr|d gt  }n|}tt D ]2}|r|	 | || || |
| || ||  q׈| | || || |
| ||  qddl	m
} ddlm} t| |r| jn| j}|jdko-tjjd u }|sGt|dd dd  D   fdd S )Nlnr  r   r-  r  r>   c                 S   s$   | d|  ||d   d | d S )a  
        Formula to apply to calculate the length of some dimension of the output

        See: https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html

        Args:
            ln: length of the dimension
            p: padding in that dim
            d: dilation in that dim
            k: kernel size in that dim
            s: stride in that dim
        Returns:
            The output length
        r   r1   r2   )r-  r  r   r-  r  r2   r2   r6   _formula	  s   $z+calc_conv_nd_return_shape.<locals>._formularA   c                 S   s(   | d | d|  ||d   | d S )a  
        Formula to apply to calculate the length of some dimension of the output
        if transposed convolution is used.
        See: https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d.html

        Args:
            ln: length of the dimension
            p: padding in that dim
            d: dilation in that dim
            k: kernel size in that dim
            s: stride in that dim
            op: output padding in that dim

        Returns:
            The output length
        r1   r   r2   )r-  r  r   r-  r  rA   r2   r2   r6   _formula_transposed	  s   (z6calc_conv_nd_return_shape.<locals>._formula_transposedr   r1   r   c                   S   rk   )NzInvalid channel dimensionsr2   r2   r2   r2   r6   re   	  rm   z+calc_conv_nd_return_shape.<locals>.<lambda>)r%  )sym_orr   c                 S   s   g | ]}|d kqS r   r2   rK   r2   r2   r6   rO   ?
  r$  z-calc_conv_nd_return_shape.<locals>.<listcomp>c                      s   dt   ddd   dS )NzGiven input size per channel: z&. Calculated output size per channel: r   z. Output size is too small)r   r2   r!  	ret_shaper2   r6   re   @
  s    
)r   r   rV   rh   rn   r   r   r   appendtorch._subclasses.fake_tensorr%  r<  r0  r&  r~   rv   versionhip)r(  rO  r   r  r)  r*  r+  r,  r.  r/  kernel_sizeout_channelsoutput_padding_listr   r%  r0  r~   is_cudnnr2   r1  r6   calc_conv_nd_return_shape	  sv   "
&




"	
r;  c                 C      t j| t jkS r?   rV   _prims_commonr   channels_lasttenr2   r2   r6   is_channels_lastH
     rB  running_meanrunning_vartrainingexponential_average_factorepsilonc                    s    j }|d ur
|j n|j }	|d ur|j n|j }
 fdd} |j| d}|r4 |	} |
}n
 d} d}|||fS )Nc                      s(   t  rtjS  jtjdrtjS tjS r   )rB  rV   r?  rv  r   r2   r(  r2   r6   pick_memory_format_
  s
   z2meta_miopen_batch_norm.<locals>.pick_memory_formatr   r   )r   r   r?  )r(  rO  rQ  rD  rE  rF  rG  rH  r   save_mean_shapesave_var_shaperJ  r   	save_meansave_varr2   rI  r6   meta_miopen_batch_normL
  s   



rO  c	              	   C   sZ   t | |||||||r|nd }	ddlm}
 d}d}|
| |dkr&d|	|< | |	}|S )Nr   guard_or_falser1   )r;  r<  rQ  r   r   )r(  rO  rQ  r   r  r)  r*  r,  r+  	shape_outrQ  input_channels_dimoutput_channels_dimr   r2   r2   r6   	meta_convr
  s"   

rU  mkldnnc
              	   C   sH   t | ||||d|g }
| |
}tj}|  dkrtj}|j|d}|S )NFr  r   )r;  r   rV   r?  r   channels_last_3dr?  )r(  rO  rQ  r  r   r)  r+  attrscalars	algorithmrR  r   out_memory_formatr2   r2   r6   meta_mkldnn_convolution_default
  s   
r\  c                 C   s$   |  g | jd d |jd R S Nr   r   r   r   )r(  rO  rQ  rX  rY  rZ  r2   r2   r6   meta_linear_pointwise_default
  s   $r_  r   c                 C   s$   |  g | jd d |jd R S r]  r^  )r(  packed_weightorig_weightrQ  r   r2   r2   r6   meta_mkl_linear
  s   rb  onednnc              	   C   s   t | ||||	d|
d }|d u r| j}|tjtjtjtjtjfvr'td| | j	||d}t
|dvr>tdt
| dtjtjtjdt
| }|j|d}|S )NFOoutput_dtype must be one of float32, bfloat16, uint8, int8, float8_e4m3fn, got r   )r/   r  r  z3Expect output to be 3d/4d/5d for conv1d/2d/3d, got r   r   )r;  r^   rV   rr  rt  uint8rW  ru  r   r   r   r   r?  rW  r?  )r;   x_scalex_zpww_scalew_zprQ  r   r  r)  r+  output_scaleoutput_zero_pointoutput_dtyperX  rY  rZ  rR  r   formatr2   r2   r6   meta_qconv_pointwise
  sF   
ro  c                 C   s   |dkrt d| d|S )Nsumz#binary_op_name must be 'sum', got 'r  r   )r;   rf  rg  rh  ri  rj  accumrQ  r   r  r)  r+  rk  rl  rm  accum_scaleaccum_zero_pointbinary_op_nameri  unary_op_nameunary_op_argsunary_op_algorithmr2   r2   r6   meta_qconv2d_pointwise_binary  s
   
ry  c                 C   sT   t | j}|jd |d< |	tjtjtjtjtjfvr!td|	 | j	||	d}|S )Nr1   r   zOoutput_dtype must be one of float32, bfloat16, int8, uint8, float8_e4m3fn, got r   )
r   r   rV   rr  rt  rW  re  ru  r   r   )r;   rf  rg  rh  ri  rj  rQ  rk  rl  rm  post_op_namepost_op_argspost_op_algorithmry  r   r2   r2   r6   meta_qlinear_pointwise#  s   
r}  c                 C   s`   |dkr|S t | j}|jd |d< |
tjtjtjtjtjfvr'td|
 | j	||
d}|S )Nrp  r1   r   rd  r   )
r   r   rV   rr  rt  re  rW  ru  r   r   )r;   rf  rg  rh  ri  rj  x_2rQ  rk  rl  rm  x2_scalex2_zpru  ri  rv  rw  rx  ry  r   r2   r2   r6   meta_qlinear_pointwise_binaryD  s    
r  c                 C   s&   t | j}|jd |d< | |}|S )Nr1   r   )r   r   r   )r;   rh  rQ  ry  r   r2   r2   r6   meta_linear_dynamic_fp16l  s   

r  	quantizedr2   r   r1   c                 C   sr   t | |||||\}}}|  dkr| dnd}	tj}
|  dkr(|||g}n|	|||g}tj|| j| j|
dS Nr  r1   r/   r  )#max_pool2d_checks_and_compute_shaper   r   rV   r?  r   r^   r~   r   r7  r   r  r)  	ceil_modenInputPlaneoutputHeightoutputWidthr  r   r   r2   r2   r6   meta_quantized_max_pool2d}  s$   r  c                    s   t  dkfdd t  dkfdd t jt jt jt jfv fdd t jt jkfdd t  jt jk fdd t jjkfdd j	
d	
d	jd
S )Nr   c                         d    dS )Nzx must be a 2D tensor, got ra  r   r2   r   r2   r6   re     rf   z/meta_int4mm_packed_weight_cpu.<locals>.<lambda>c                      r  )Nzw must be a 2D tensor, got ra  r   r2   rh  r2   r6   re     rf   c                      r   Nz#expected x to be f32/f16/bf16, got r   r2   r   r2   r6   re     r   c                      r   Nzexpected w to be uint8, got r   r2   r  r2   r6   re     r   c                      r   )Nz q_group_size must be int64, got r   r2   )q_group_sizer2   r6   re     r   c                      r   )Nz5q_scale_and_zeros must have the same dtype as x, got r   r2   )q_scale_and_zerosr2   r6   re     r   r   r   )rV   rh   r   r^   rr  rs  rt  re  r   r   r   r;   rh  r  r  r2   )r  r  rh  r;   r6   meta_int4mm_packed_weight_cpu  s$   




r  c                    s4   t   koj k fdd d S )Nc                      s8   d  d d dd   d dj   S )NzExpected a tensor of dimension z and tensor.size[z] == rt   zbut got : dimension z] = r   r   r2   r   dim_sizer   rG  r2   r6   re     s    z check_dim_size.<locals>.<lambda>)rV   rh   r   r   )rG  r   r  r   r2   r  r6   check_dim_size  s   r  r  r  r^   c                 C   r
  r?   r  )r   r  r  r^   r2   r2   r6   meta_quantize_per_tensor  s   
r  c                    s  dd }|d|\}}	t t|dv dd  t  jt jt jt jt jfv fdd t|dkr8||	}
}nt|d	krH|d |d }
}n|d
|\}
}|d|\}}t |d u p_|dkdd    dkro 	dnd	} 	d} 	d} 	d}t
||||
d	|}t
||	||d	|}t }t ||	|
|||d	d	||||||   dkr|||g}n||||g}t j| j j|dS )Nc                    D   t t|dv  fdd |d }t|dkr|n|d }||fS )Nr1   r   c                      r>  )Nzavg_pool2d: 4 must either be a single int, or a tuple of two intsr2   r2   r  r2   r6   re     r   z1meta_avg_pool2d.<locals>.unpack.<locals>.<lambda>r   r1   rV   rh   r   r  r  HWr2   r  r6   unpack     

zmeta_avg_pool2d.<locals>.unpackr7  r   r1   r   c                   S   rk   NzOavg_pool2d: stride must either be omitted, a single int, or a tuple of two intsr2   r2   r2   r2   r6   re     rm   z!meta_avg_pool2d.<locals>.<lambda>c                      r  )Nz""avg_pool2d" not implemented for 'r  r  r2   r  r2   r6   re     r  r   r1   r   r  c                   S   rk   Nzdivisor must be not zeror2   r2   r2   r2   r6   re     rm   r  r  r  r   r/   r  )rV   rh   r   r^   re  uint16uint32r  r   r   pooling_output_shaperQ   r   pool2d_shape_checkr   r~   )r   r7  r   r  r  count_include_paddivisor_overrider  kHkWdHdWpadHpadWr  r  inputHeight
inputWidthr  r  r   r   r2   r  r6   meta_avg_pool2d  sj   
	





r  c                 C   sj   t | ||||||dd|	|
|||| |  }|	}t|||d | t|||d | t|||d | d S )Nr1   r/   r   )r  r   r  )r   
gradOutputr  r  r  r  r  r  r  r  r  r  r  r  
mem_formatr   nOutputPlaner2   r2   r6   avg_pool2d_backward_shape_check  s,   r  c                 C   s  t t|dkpt|dkdd  |d }t|dkr|n|d }	t t|dkp5t|dkp5t|dkdd  t|dkrB|n|d }
t|dkrN|	nt|dkrV|
n|d }t t|dkpgt|dkdd  |d }t|dkrx|n|d }t |d u p|dkdd  |j}| d	kr|d
 nd}|d }|d }|d }t||||
d|}t||	||d|}t|}t|| |||	|
||||||||| t j	||j
|j|dS )Nr1   r   c                   S   rk   )NzKavg_pool2d: kernel_size must either be a single int, or a tuple of two intsr2   r2   r2   r2   r6   re   L  rm   z*meta_avg_pool2d_backward.<locals>.<lambda>r   c                   S   rk   r  r2   r2   r2   r2   r6   re   R  rm   c                   S   rk   )NzGavg_pool2d: padding must either be a single int, or a tuple of two intsr2   r2   r2   r2   r6   re   X  rm   c                   S   rk   r  r2   r2   r2   r2   r6   re   _  rm   r  r  r  r  r   r  )rV   rh   r   r   r   r  rQ   r   r  r   r^   r~   )gradOutput_r   r7  r   r  r  r  r  r  r  r  r  r  r  
input_sizer  r  r  r  r  r  r  r2   r2   r6   meta_avg_pool2d_backward>  sj   "(
r  c                    s6  t t|dv dd  |d }t|dkr|n|d }t|dkr$|n|d }	t | p2t|dv dd  t  jt jt jt jt jfv fdd |sP|n|d }
|sX|nt|dkr`|
n|d }|sh|	nt|dkrp|
n|d }t t|dv d	d  |d }t|dkr|n|d }t|dkr|n|d }t  jd
v dd  t | p|dkdd   	d} 	d} 	d} 	d} 	d}t
||||
d|}t
||||d|}t
||	||d|}t ||||	|
|||||ddd||||||ddd  jdkr ||||fS  |||||fS )Nr1   r/   c                   S   rk   NzFavg_pool3d: kernel_size must be a single int, or a tuple of three intsr2   r2   r2   r2   r6   re     rm   z!meta_avg_pool3d.<locals>.<lambda>r   r1   r   c                   S   rk   NzJavg_pool3d: stride must be omitted, a single int, or a tuple of three intsr2   r2   r2   r2   r6   re     rm   c                      r  )Nz""avg_pool3d" not implemented for 'r  r  r2   r  r2   r6   re     r  c                   S   rk   NzBavg_pool3d: padding must be a single int, or a tuple of three intsr2   r2   r2   r2   r6   re     rm   r  r  c                   S   rk   Nz9non-empty 4D or 5D (batch mode) tensor expected for inputr2   r2   r2   r2   r6   re     rm   c                   S   rk   r  r2   r2   r2   r2   r6   re     rm   r  r  r  r   zavg_pool3d()T)check_input_sizer  )rV   rh   r   r^   re  r  r  r  r   r   r  pool3d_shape_checkr   )r   r7  r   r  r  r  r  kTr  r  dTr  r  padTr  r  r  nslicesitimeiheightiwidthotimeoheightowidthr2   r  r6   meta_avg_pool3d  s   

  





r  c                 C   s  t t|dv dd  |d }t|dkr|n|d }	t|dkr$|n|d }
t | p2t|dv dd  |s;|n|d }|sC|	nt|dkrK|n|d }|sS|
nt|dkr[|n|d }t t|dv dd  |d }t|dkrw|n|d }t|dkr|n|d }t |jd	v d
d  t | p|dkdd  |d}|d}|d}|d}t||||d|}t||	||d|}t||
||d|}t|| |||	|
||||||||||||d ||jS )Nr  c                   S   rk   r  r2   r2   r2   r2   r6   re     rm   z*meta_avg_pool3d_backward.<locals>.<lambda>r   r1   r   c                   S   rk   r  r2   r2   r2   r2   r6   re     rm   c                   S   rk   r  r2   r2   r2   r2   r6   re     rm   r  c                   S   rk   r  r2   r2   r2   r2   r6   re     rm   c                   S   rk   r  r2   r2   r2   r2   r6   re     rm   r  r  r  r   zavg_pool3d_backward())	rV   rh   r   r   r   r  avg_pool3d_backward_shape_checkr   r   )r  r   r7  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  otime_for_shape_checkoheight_for_shape_checkowidth_for_shape_checkr2   r2   r6   meta_avg_pool3d_backward  st   
  




r  c                    sZ   t  jdkp jdk fdd  jd d t| }t }t j| j j	|dS )Nr/   r  c                      r   )Nz"Expected 3D or 4D tensor, but got r  r2   r   r2   r6   re   5  r   z*meta_adaptive_avg_pool2d.<locals>.<lambda>r  r  )
rV   rh   r   r   rg   rQ   r   r   r^   r~   )r   output_sizery  r   r2   r   r6   meta_adaptive_avg_pool2d1  s   

r  c                    s@   t  jdkp jdk fdd   jd d t| S )Nr  r  c                      r   )Nz"Expected 4D or 5D tensor, but got r  r2   r   r2   r6   re   G  r   z*meta_adaptive_avg_pool3d.<locals>.<lambda>r  )rV   rh   r   r   r   rg   )r   r  r2   r   r6   meta_adaptive_avg_pool3dC  s
   
r  c                    s    j }td|D ]t dk fdd qt|dkp$|dkfdd tj jk fdd tj}trDtj}	j
j|d	S )
Nr1   r   c                      s   d j  d dS )Nz{adaptive_avg_pool2d_backward(): Expected grad_output to have non-zero                       size for non-batch dimensions,  with dimension  being emptyr  r2   )grad_outr   r2   r6   re   R  s
    z4meta__adaptive_avg_pool2d_backward.<locals>.<lambda>r/   r  c                      r   )NzBadaptive_avg_pool2d_backward(): Expected 3D or 4D tensor, but got r  r2   r   r2   r6   re   W  r   c                      r  Nexpected dtype z! for `grad_output` but got dtype r   r2   )r  r   r2   r6   re   [  r$  r   )r   r   rV   rh   r   r^   r   rB  r?  r   r   r?  )r  r   r   r   r2   )r  r   r   r6   "meta__adaptive_avg_pool2d_backwardL  s$   

r  c                 C   s   t | d tj|tjdS )Nadaptive_avg_pool3d_backwardr   )!_adaptive_pool_empty_output_checkrV   r   r  r  r   r2   r2   r6   "meta__adaptive_avg_pool3d_backwardc  s   
r  r  c                    s<   j }td|D ]tdk fdd qd S )Nr1   r   c                      s     dj  d dS )Nzc(): Expected grad_output to have non-zero size for non-batch dimensions, but grad_output has sizes r  r  r  r2   r  r  r   r2   r6   re   o  s
   z3_adaptive_pool_empty_output_check.<locals>.<lambda>)r   r   rV   rh   r   )r  r  r   r2   r  r6   r  j  s   r  c                    s"  j }t|dv fdd td|D ] t dk fdd qtt|dkdd  d}d}d}j d	krGd}|d7 }|d }|\}}j d
krm|||f}|}	j|tjd}
|	|
fS ||||f}t	}|j
|d}	j|tjdj
|d}
|	|
fS )Nr/   r  c                      r   )Nz:adaptive_max_pool2d(): Expected 3D or 4D tensor, but got: r  r2   r  r2   r6   re   |  r   z*meta_adaptive_max_pool2d.<locals>.<lambda>r1   r   c                         dj  d  dS )Nzjadaptive_max_pool2d(): Expected input to have non-zero size for non-batch dimensions, but input has sizes r  r  r  r2   r   r   r2   r6   re     
   r   c                   S   rk   )NzCadaptive_max_pool2d(): internal error: output_size.size() must be 2r2   r2   r2   r2   r6   re     rm   r  r/   r   r   )r   rV   rh   r   r   r   r   r   rQ   r   r?  )r   r  r   dimHsizeBsizeDosizeHosizeWr   r   r   r   r2   r  r6   meta_adaptive_max_pool2dv  sD   







r  c                    sd    j }t|dv  fdd t d tj jk fdd t}jj	|dS )Nr  c                      r   )NzKadaptive_max_pooling2d_backward(): Expected 3D or 4D grad_output, but got: r  r2   r  r2   r6   re     r   z3meta_adaptive_max_pool2d_backward.<locals>.<lambda>adaptive_max_pool2d_backwardc                      r  r  r   r2   r  r   r2   r6   re     r$  r   )
r   rV   rh   r  r^   rQ   r   r   r   r?  )r  r   r   r   r   r2   r  r6   !meta_adaptive_max_pool2d_backward  s   



r  c                    s   j }t|dv fdd td|D ] t dk fdd qtt|dkdd  d}d}d}|d	krFd}|d7 }|}|\}}}|d
kr[||||f}	n|||||f}	|	}
j|	tjd}|
|fS )Nr  c                      r   )Nz:adaptive_max_pool3d(): Expected 4D or 5D tensor, but got: r  r2   r  r2   r6   re     r   z*meta_adaptive_max_pool3d.<locals>.<lambda>r1   r   c                      r  )Nzjadaptive_max_pool3d(): Expected input to have non-zero size for non-batch dimensions, but input has sizes r  r  r  r2   r  r2   r6   re     r  r/   c                   S   rk   )NzCadaptive_max_pool3d(): internal error: output_size.size() must be 3r2   r2   r2   r2   r6   re     rm   r  r  r   )r   rV   rh   r   r   r   r   r   )r   r  r   dimDr  r  osizeTr  r  r   r   r   r2   r  r6   meta_adaptive_max_pool3d  s8   





r  c                 C   s   t | d ||jS )Nadaptive_max_pool3d_backward)r  r   r   )r  r   r   r2   r2   r6   !meta_adaptive_max_pool3d_backward  s   
r  c                 C   s   |d u rt d| |S )Nz:cannot repeat_interleave a meta tensor without output_size)r>  r   )repeatsr  r2   r2   r6   meta_repeat_interleave_Tensor  s   
r  c                 C   s\   | j jstd| j  |j jstd|j  t| t| j |t|j tjd}|S )Nz!real must be floating point, got z!imag must be floating point, got rI   )r^   r   r   rU   r?  r   r   rR   )realimagr  r2   r2   r6   meta_complex  s   r  )
fill_valuer  c                C   sF   t | dv r| j||  ftjdS tj||  fd|ftj| jdS )N)r   mpsr   r   r1   r^   r~   )r   r   r   rV   r   r  r~   )r   r   r  r2   r2   r6   nonzero_static  s   
r  c                 C   s<   t tjdd  t j|  |  fd|  ft j| jdS )Nc                   S   rk   )NaY  The register_meta function for torch.nonzero() raises unimplemented by default, as a correct data-independent implementation does not exist. This implementation returns a fake value, assuming all elements of the tensor are non-zero. To enable this registration, please set 'torch.fx.experimental._config.meta_nonzero_assume_all_nonzero' to True.r2   r2   r2   r2   r6   re     rm   znonzero.<locals>.<lambda>r1   r   )	rV   _check_not_implementedr  meta_nonzero_assume_all_nonzeror  r   r   r   r~   r   r2   r2   r6   nonzero  s   
r  c              
      sD  t tdd  g }tD ]q\d ur|t jt jt jt jt jfv dd  jt jt jfv rv }t	|t 
j jkfdd tjD ]#t 
j j  kfdd ||d qQq| q| q|t t	jkfdd dd lm} t|j t	jk rd  t	jk sd}d	}D ]|dkrǈd urd}q|dkr҈d u rd
}qd ur nqd}|sg }g }tD ]\d ur| | qtD ]\d u r| | q||g g  g tD ]&\}	d u rBr8 j|	  q"j|	  q"tjq" fdd}
   }ddlm} | dkrk|S |
}t|\}}t|ttt	|krt|j|}t|}t|t|}|| |}|S )Nc                   S   rk   )Nz#at least one index must be providedr2   r2   r2   r2   r6   re   '  rm   z#meta_index_Tensor.<locals>.<lambda>c                   S   rk   )Nz?tensors used as indices must be long, int, byte or bool tensorsr2   r2   r2   r2   r6   re   /  rm   c                      r   )N)too many indices for tensor of dimension r   r2   r   r2   r6   re   6  r   c                	      s$   dj  d  dj  d  S )NzThe shape of the mask 
 at index z0 does not match the shape of the indexed tensor r  r2   )r   r   jr-  r   r2   r6   re   ;  s
    r1   c                      s   dj  dt  dS )Nr  z (got ru   )r   r   r2   )r   r   r2   r6   re   F  r   r   Fr   Tc                    sL      }t |  }dgt |tt| jt  < | ||S )zI
        This follows restride_src in TensorAdvancedIndexing.cpp
        r   )r   r   r   r   
as_strided)r   r   r   )after_shapebefore_shapereplacement_shaper2   r6   _restride_src  s    z(meta_index_Tensor.<locals>._restride_srcrP  ) rV   rh   r  	enumerater^   r   r   rW  r  r   r   r   r   r   r3  selecttorch._refsr@  r   r%   r   r   r<  rQ  r   rQ   3compute_elementwise_output_logical_to_physical_perm
apply_permr   invert_permr  r   )r   r   r  r  refsstatehas_contiguous_subspacer!  transposed_indicesr   r  r   rQ  restrided_selfpermrT   
perm_shaperJ  r2   )	r	  r
  r   r   r   r  r-  r  r   r6   meta_index_Tensor%  s   









r  c                 C   sv   d }d }d }dd }|||}|
d r|  | j|d}|
d r-|  | j|d}|
d r6|  |}|||fS )Nc                 S   sJ   t | }t |}|tjks|tjkrtjS |tjks|tjkr"tjS tjS r?   )r   rV   r?  rW  r   )t1t2fmt1fmt2r2   r2   r6   _conv_memory_format  s   z6meta_convolution_backward.<locals>._conv_memory_formatr   r   r1   r   )r   r   r?  )grad_output_input_weight_bias_sizes_optr   r  r)  
transposedr,  r+  output_maskbackend_grad_inputbackend_grad_weightbackend_grad_biasr  r   r2   r2   r6   meta_convolution_backward  s    
	

r)  c                   s     d} d}| ||f} t  dkdd  t dkdd  t  d dk fdd t  d dk fd	d t|  d|ko^|  d|kd
d  | |   S )Nr1   r   r/   c                   S   rk   r  r2   r2   r2   r2   r6   re     rm   zmeta_addbmm.<locals>.<lambda>c                   S   rk   r  r2   r2   r2   r2   r6   re     rm   r   c                         d  d d d S )Nz8batch1 and batch2 must have same number of batches, got r   r   r   r2   r  r  r2   r6   re     r  c                
      6   d  d d  d d d d d d	S )Nz#Incompatible matrix sizes for bmm (r1   r;   r   r   ru   r   r2   r+  r2   r6   re     
   c                   S   rk   )Nz.self tensor does not match matmul output shaper2   r2   r2   r2   r6   re     rm   )r   rA  rV   rh   r   r   )r   r  r  rj  ri  r  r   r2   r+  r6   meta_addbmm  s$   

r.  c                 K   s   |  |  S r?   r   r   )r   r  kwargsr2   r2   r6   meta_randint_like  s   r1  )
grad_scale	found_infc       	            s4   | |||||fD ] t t t fdd qd S )Nc                         dt   S Nz'exponent must be a tensor list but got rv   r2   lr2   r6   re     r  z#meta__fused_adam_.<locals>.<lambda>rV   rh   rn   r   )r   gradsexp_avgsexp_avg_sqsmax_exp_avg_sqsstate_stepslrbeta1beta2weight_decayepsamsgradmaximizer2  r3  r2   r7  r6   meta__fused_adam_  s   
rF  c       	            sZ   | |||||fD ] t t t fdd qdd }|| ||||||||fS )Nc                      r4  r5  r6  r2   r7  r2   r6   re   +  r  z"meta__fused_adam.<locals>.<lambda>c                 S   s   dd | D S )Nc                 S   s   g | ]}t |qS r2   r  )rL   r  r2   r2   r6   rO   /  rP   z=meta__fused_adam.<locals>.empty_like_list.<locals>.<listcomp>r2   )tensor_listr2   r2   r6   empty_like_list.  s   z)meta__fused_adam.<locals>.empty_like_listr9  )r   r:  r;  r<  r=  r>  r?  r@  rA  rB  rC  rD  rE  r2  r3  rH  r2   r7  r6   meta__fused_adam  s   
rI  c                    s   t   dkdd  t  dkdd  t  jt jt jfv  fdd t jt ju fdd t  ddk fd	d  j ddft jd
S )Nr   c                   S   rk   )Nza must be a 2D tensorr2   r2   r2   r2   r6   re   =  rm   zmeta__int_mm.<locals>.<lambda>c                   S   rk   )Nzb must be a 2D tensorr2   r2   r2   r2   r6   re   >  rm   c                      r   )Nz'expected self to be int8 or uint8, got r   r2   )r4   r2   r6   re   A  r   c                      r   )Nzexpected mat2 to be int8, got r   r2   )r5   r2   r6   re   E  r   r1   r   c                
      r,  )Nz'Incompatible matrix sizes for _int_mm (r   r;   r1   r   ru   r   r2   r3   r2   r6   re   I  r-  r   )	rV   rh   r   r^   rW  re  r   r   rX  r3   r2   r3   r6   meta__int_mm:  s   


 rJ  c                    st   t   dkdd  t  jt ju  fdd  d} dd } j|d ||d  d	|d ft jd
S )Nr   c                   S   rk   Nzw must be a 2D tensorr2   r2   r2   r2   r6   re   S  rm   z2meta__convert_weight_to_int4pack.<locals>.<lambda>c                      r   r  r   r2   r  r2   r6   re   V  r   r   r1             r   )rV   rh   r   r^   re  r   r   rX  rh  inner_k_tilesr  r-  r2   r  r6    meta__convert_weight_to_int4packQ  s   



rQ  c                    s`   t   dkdd  t  jt ju  fdd  d} d} j||d ft jdS )Nr   c                   S   rk   rK  r2   r2   r2   r2   r6   re   g  rm   z:meta__convert_weight_to_int4pack_for_cpu.<locals>.<lambda>c                      r   Nzexpected w to be int32, got r   r2   r  r2   r6   re   j  r   r   r1   r   )rV   rh   r   r^   rX  r   r   re  rO  r2   r  r6   (meta__convert_weight_to_int4pack_for_cpue  s   




rS  c                    s   t  dkdd  jjdkrdnd t   k fdd t jt jt jt jfv fdd t jt j	u fdd jjdkrP
d	n
d	d
 }j
d	|jdS )Nr   c                   S   rk   Nzx must be a 2D tensorr2   r2   r2   r2   r6   re   v  rm   z*meta__weight_int4pack_mm.<locals>.<lambda>r   r  c                      r>  )Nzw must be a zD tensorr2   r2   )expected_dimr2   r6   re   x  r   c                      r   r  r   r2   r   r2   r6   re   {  r   c                      r   rR  r   r2   r  r2   r6   re     r   r   rL  r   )rV   rh   r   r&  rv   r^   rr  rs  rt  rX  r   r   )r;   rh  r  r  dim_nr2   )rU  rh  r;   r6   meta__weight_int4pack_mmt  s   


$rW  c                       t  dkdd  t   dkdd  t jt jt jt jfv fdd t  jt ju  fdd j	d 	djdS )	Nr   c                   S   rk   rT  r2   r2   r2   r2   r6   re     rm   z2meta__weight_int4pack_mm_for_cpu.<locals>.<lambda>c                   S   rk   rK  r2   r2   r2   r2   r6   re     rm   c                      r   r  r   r2   r   r2   r6   re     r   c                      r   r  r   r2   r  r2   r6   re     r   r   r   )
rV   rh   r   r^   rr  rs  rt  re  r   r   r  r2   rh  r;   r6    meta__weight_int4pack_mm_for_cpu     


rZ  c                    rX  )	Nr   c                   S   rk   rT  r2   r2   r2   r2   r6   re     rm   z;_weight_int4pack_mm_with_scales_and_zeros.<locals>.<lambda>c                   S   rk   rK  r2   r2   r2   r2   r6   re     rm   c                      r   r  r   r2   r   r2   r6   re     r   c                      r   rR  r   r2   r  r2   r6   re     r   r   r   )
rV   rh   r   r^   rr  rs  rt  rX  r   r   )r;   rh  r  qScaleqZerosr2   rY  r6   )_weight_int4pack_mm_with_scales_and_zeros  r[  r^  r4   r5   c                 C   r9   r0   r2   r3   r2   r2   r6   kai_roundup  s   r_  c           	         s   | dkrv||kr/d}d}d}d
dddd 
fddfd	d
}||||||S |d dkrx|| dkrzd}d}d}d
ddd  fdd} 	
fdddd  fdd fdd	|||||||S d S d S d S )Nr  rL  rM  r   c                 S   s   t || d}t | |S )Nr  r_  )r-  krsrkr_sr_roundedup4r2   r2   r6   kai_k_roundedup  s   
z3get_kai_packed_weight_size.<locals>.kai_k_roundedupc                    s>    | ||}|d dkrt d| ||d     S )Nr   r   zk_internal must be even, got rq  )r-  nrra  rb  
k_internal)rd  kai_num_bytes_biaskai_num_bytes_multiplier_rhskai_num_bytes_sum_rhsr2   r6   9kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0  s   z]get_kai_packed_weight_size.<locals>.kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0c                    s    t | || }| |||| S r?   r`  )r  r-  re  ra  rb  num_rows)rj  r2   r6   7kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0  s   z[get_kai_packed_weight_size.<locals>.kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0rN  r   c                    s   || dkrt d| d| d| dkr"t d| d d|  dkr3t d| d  dt| || }|||||| S Nr   bl (z) must be divisible by kr (ru   znr (z+) must be divisible by kai_nr_multiple_of (+) must be divisible by kai_bl_multiple_of (r   r_  )r  r-  re  ra  rb  blrk  )kai_bl_multiple_of;kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0kai_nr_multiple_ofr2   r6   9kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0  s"   
z]get_kai_packed_weight_size.<locals>.kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0c                    s   || dkrt d| d| d| dkr"t d| d d|  dkr3t d| d  d }| |}||}|||    S rm  rq  )r-  re  ra  rb  rq  num_bytes_multiplier_rhsnum_blocks_per_rownum_bytes_per_block)rr  #kai_get_bf16_datatype_size_in_bytesrt  kai_num_blocks_per_rowrg  kai_num_bytes_per_blockri  r2   r6   rs    s,   
z_get_kai_packed_weight_size.<locals>.kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0c                   S   rk   )Nr   r2   r2   r2   r2   r6   ry    r8  zGget_kai_packed_weight_size.<locals>.kai_get_bf16_datatype_size_in_bytesc                    s0   |  dkrt d| d  dt| || S )Nr   rn  ro  ru   rp  )r-  rq  rr  r2   r6   rz    s
   z:get_kai_packed_weight_size.<locals>.kai_num_blocks_per_rowc                    s.   |   dkrt d|  d  d| d | S )Nr   rn  ro  ru   r   rq  )rq  rv  r|  r2   r6   r{    s
   z;get_kai_packed_weight_size.<locals>.kai_num_bytes_per_blockr2   )	n_bitsr  K	groupsizekai_nrkai_krkai_srrl  ru  r2   )rr  ry  rs  rj  rd  rt  rz  rg  rh  r{  ri  r6   get_kai_packed_weight_size  s@   
/r  c                    s   t  jt ju  fdd t jj rE||kr|jt jks4||k rE|d dkrE|| dkrE|jt jkrEt	d|||} j
t|t jdS   |  }|d urW|| 7 } j
|t jdS )Nc                      r   r  r   r2   weightsr2   r6   re   0  r   z2meta__dyn_quant_pack_4bit_weight.<locals>.<lambda>rN  r   r  r   )rV   rh   r^   re  r   kleidiair   rZ   rt  r  r   r   r   )r  scales_zerosrQ  
block_sizein_featuresout_featurespacked_weight_sizer2   r  r6    meta__dyn_quant_pack_4bit_weight*  s$   

r  c                    sh   t  dkdd  t jt jkpjt jko k fdd d}j||jdS )Nr   c                   S   rk   )Nzinput must be a 2D tensorr2   r2   r2   r2   r6   re   M  rm   z-meta__dyn_quant_matmul_4bit.<locals>.<lambda>c                      s   dj  d  d S )NzPexpected input to be f32 or bf16 (bf16 requires block_size == in_features), got z with block_size=z and in_features=r   r2   r  r  inpr2   r6   re   Q  s   r   r   )rV   rh   r   r^   rr  rt  r   r   )r  packed_weightsr  r  r  rx  r2   r  r6   meta__dyn_quant_matmul_4bitE  s   
r  c                    s   t  dkdd  t jt jt jt jfv fdd t   dkdd  t  jt ju  fdd j	d 	djdS )	Nr   c                   S   rk   rT  r2   r2   r2   r2   r6   re   \  rm   z*meta__weight_int8pack_mm.<locals>.<lambda>c                      r   r  r   r2   r   r2   r6   re   _  r   c                   S   rk   rK  r2   r2   r2   r2   r6   re   a  rm   c                      r   )Nzexpected w to be int8, got r   r2   r  r2   r6   re   d  r   r   r   )
rV   rh   r   r^   rr  rs  rt  rW  r   r   )r;   rh  q_scalesr2   rY  r6   meta__weight_int8pack_mmZ  s   


r  c           	         s  t  dkfdd t  dkfdd t ddkfdd t tjfdd t tjfdd t |d	kd
d  t  dv  fdd d}d}jd d }jd d }tt 	||}|
||g |S )Nr   c                      r  )Nz1cdist only supports at least 2D tensors, X1 got: ra  r   r2   x1r2   r6   re   m  rf   z$meta_cdist_forward.<locals>.<lambda>c                      r  )Nz1cdist only supports at least 2D tensors, X2 got: ra  r   r2   x2r2   r6   re   q  rf   r   c                      r*  )Nz4X1 and X2 must have the same number of columns. X1: r   z X2: r   r2   )r  r  r2   r6   re   u  r  c                      r   )Nz3cdist only supports floating-point dtypes, X1 got: r   r2   r  r2   r6   re   y  r   c                      r   )Nz3cdist only supports floating-point dtypes, X2 got: r   r2   r  r2   r6   re   }  r   r   c                   S   rk   )Nz)cdist only supports non-negative p valuesr2   r2   r2   r2   r6   re     rm   )Nr   r1   r   c                      r  )Nz(possible modes: None, 0, 1, 2, but was: r2   r2   )compute_moder2   r6   re     r  r  )rV   rh   r   r   rQ   is_float_dtyper^   r   r   broadcast_shapesextendr   )	r  r  r  r  r1r2batch_tensor1batch_tensor2ry  r2   )r  r  r  r6   meta_cdist_forwardi  s@   











r  c                 C   s   |j d }|j d }|j d }|j d d }|j d d }	tt||	}
|
 }|||g t|
}|dksE|dksE|dksE|dkrJt|S |t|j krV|	|}tj
|tjdS )Nr   r  r   r   )r   r   rV   r  copyr  mathprod
zeros_likerA  r   r   )r  r  r  r  cdistc1r  r  r  r  rX  tensor1_expand_sizebatch_productr2   r2   r6   meta_cdist_backward  s   



 

r  c	                    s  t  jt jt jfv  fdd t jt jt jfv fdd t tjfdd d}	|rEt |	dkdd  |	d8 }	|	d}
d urzt |t	kdd  t j
dkfd	d t    k fd
d fdddd fdd}tdkr  d}  }|tkr |	d}nR d}nL||
|}|ttfv s|s̈ d}nd}|	}jd }|tkr|rt |dkdd  |d8 }|jd }n| }|
|||fS )Nc                      r   )Nz(expected indices to be long or int, got r   r2   )r   r2   r6   re     r   z$meta_embedding_bag.<locals>.<lambda>c                      r   )Nz(expected offsets to be long or int, got r   r2   )r  r2   r6   re     r   c                      r   )Nz/expected weight to be floating point type, got r   r2   )rO  r2   r6   re     r   r   r1   c                   S   rk   Nz1include_last_offset: numBags should be at least 1r2   r2   r2   r2   r6   re     rm   c                   S   rk   )Nz@embedding_bag: per_sample_weights only supported with mode='sum'r2   r2   r2   r2   r6   re     rm   c                      r  )Nz1expected per_sample_weights to be 1D tensor, got ra  r   r2   )per_sample_weightsr2   r6   re     r  c                      s   d   d    dS )Nz%expected per_sample_weights.numel() (z$ to be the same as indices.numel() (ru   r   r2   )r   r  r2   r6   re     s   c                    s    | ||o| ddkS Nr   r1   r   rB  r  r   padding_idx)is_fast_path_index_selectr2   r6   is_fast_path_index_select_scale  s   z;meta_embedding_bag.<locals>.is_fast_path_index_select_scalec                 S   s<   | j tjks| j tjko| ddko|ddko|dk S Nr1   r   )r^   rV   rZ   rX   r   )rB  r   r  r2   r2   r6   r    s   z5meta_embedding_bag.<locals>.is_fast_path_index_selectc                    s"   |d ur| |||S  | ||S r?   r2   r  )r  r  r2   r6   is_fast_path  s   z(meta_embedding_bag.<locals>.is_fast_pathr   c                   S   rk   r  r2   r2   r2   r2   r6   re     rm   )rV   rh   r^   r   r   rQ   r  r   r   MODE_SUMr   r   r   MODE_MAX	MODE_MEANr   )rO  r   r  scale_grad_by_freqr:  sparser  include_last_offsetr  num_bagsr   r  
offset2bagbag_sizemax_indicesfast_path_sumnumBagsr2   )r   r  r  r  r  rO  r6   meta_embedding_bag  st   








r  c                 G   sB   t | ||g|R  \}}}}t|dkr|| }||||fS )Nr   )r  r   r   r   )rO  r   r  rS   r   r  r  r  r2   r2   r6   meta_embedding_bag_forward_only  s   r  c                 C   s.   |r|S | j js| j jr| j S |rtjS | j S r?   )r^   r   r   rV   r   )r   r^   promote_int_to_longr2   r2   r6   _get_reduction_dtype  s   r  r   c                C   s6   t | |dd}t| j|}t| ||}| j||dS )NT)r  r   )r  rQ   r  r   r  r   )r   r!  r  r^   rm  ry  r2   r2   r6   meta_nansum  s   r  c                 C   s$   t | jtt|  }| |S r?   )rQ   r"  r   rg   r   r   r   )r   ry  r2   r2   r6   meta_median(  s   
r  c                 C   sL   t | dkrtd t| j|f}t| ||}| || j|tjdfS )Nr   zmedian CUDA with indices outputr   )	r   rQ   alert_not_deterministicr  r   r  r   rV   r   )r   r   r  ry  r2   r2   r6   meta_median_mode_dim0  s   
r  c                 C   r  r?   r2   r   r2   r2   r6   meta_logical_not_F  r  r  c                    s   t t|  kdd  tD ]\ t dk fdd qt|   }d| t| j fddttD }| |S )Nc                   S   rk   )NzZNumber of dimensions of repeat dims can not be smaller than number of dimensions of tensorr2   r2   r2   r2   r6   re   O  rm   zmeta_repeat.<locals>.<lambda>r   c                      ra   )Nz"Repeats cannot be negative, found r  r2   r2   )r   repr2   r6   re   T  rf   r  c                    s   g | ]
} | |  qS r2   r2   r  )padded_sizer  r2   r6   rO   [  r  zmeta_repeat.<locals>.<listcomp>)	rV   rh   r   r   r  rg   r   r   r   )r   r  num_new_dimensionstarget_sizer2   )r   r  r  r  r6   meta_repeatK  s   
r  c                 C   r  r?   r2   r   r2   r2   r6   
meta_zero__  r  r  c                 C   s   t |tjrt| j|j | S r?   )rn   rV   r
   rj   r   r   r   r2   r2   r6   meta_binop_inplaced  s   r  c                 C   sf   dd }dd }dd }|| r||rt d|| r$||s$t dt|tjr1t| j|j | S )	a*  
    Some checks for inplace ops.
    Checks for promotion rules for some dtypes.
    int.add/sub_(float) and bool.add/sub_(others) are rejected.
    Promoting in these in-place operations would require reallocating
    and copying over elements, hence not allowed.
    Checks for alpha param.
    c                 S       t | trt| jS t | tS r?   )rn   r   rQ   r%  r^   r   rp   r2   r2   r6   is_integeric     

z.meta_binop_inplace_alpha.<locals>.is_integericc                 S   r  r?   )rn   r   rQ   r  r^   r   r  r2   r2   r6   
is_floatic  r  z,meta_binop_inplace_alpha.<locals>.is_floaticc                 S   r  r?   )rn   r   rQ   is_boolean_dtyper^   r   r  r2   r2   r6   is_booleanic  r  z.meta_binop_inplace_alpha.<locals>.is_booleanicz]Promotion of int.add/sub_(float) in in-place ops are not possible due to element size change.z_Promotion of book.add/sub_(others) in in-place ops are not possible due to element size change.)r>  rn   rV   r
   rj   r   )r   r   ri  r  r  r  r2   r2   r6   meta_binop_inplace_alphau  s   r  c                 C      t | |tjdS Nr  rU   r   rR   r   r   ri  r2   r2   r6   meta_binop_alpha  s   r  c                 K      t | tjdS r  r  )r   r0  r2   r2   r6   
meta_round  s   r  c                    sl   t tj fdd tt jr&t tj fdd d S t tt fdd d S )Nc                      r  )Nz7: Expected input tensor to have an integral dtype. Got r   r2   )r  r   r2   r6   re     rf   z#shift_dtype_check.<locals>.<lambda>c                      r  )Nz6: Expected shift value to have an integral dtype. Got r   r2   r  r  r2   r6   re     rf   c                      s     d S )Nz): Expected shift value to be an int. Got r2   r2   r  r2   r6   re     r  )rV   rh   rQ   r%  r^   rn   r
   r   r  r   r  r2   r  r6   shift_dtype_check  s   

r  c                 C      t d| | t| |tjdS )Nrshiftr  r  rU   r   rR   r  r2   r2   r6   meta_rshifts     r  c                 C   r  )Nlshiftr  r  r  r2   r2   r6   meta_lshifts  r  r  c                 C   s   |  | jS r?   r^  r   r2   r2   r6   	meta_zero  s   r  c                 C   r  r?   r2   r   r  r2   r2   r6   
meta_fill_  r  r  c                 C   r
  r?   r  r  r2   r2   r6   	meta_fill     
r  c                 C   r  r?   r2   r   r2   r2   r6   
meta_relu_  r  r  c                 C   r  r  r  r  r2   r2   r6   meta__add_relu     r        ?UUUUUU?c                 C   r
  r?   r  r   noiselowerr  rF  r   r2   r2   r6   meta_rrelu_with_noise  s   
r  c                 C   s   t | t |fS r?   r  r  r2   r2   r6    meta_rrelu_with_noise_functional  s   r  c                 C   r  r?   r2   )r   r  r  rF  r   r2   r2   r6   meta_rrelu_with_noise_	  s   r  c                 C   r
  r?   r  r   r   r   
accumulater2   r2   r6   meta_index_put  r  r  c                 C   s   t | j|j | S r?   rj   r   )r   r  valuer2   r2   r6   meta_masked_fill_  s   r  c                 C   s    |  |  jt| d}|S r   )r   r   r?  rQ   r   )r   r  r  masked_scaler2   r2   r6   meta__masked_scale  s   r  c                    s@   t |jt jt jfv dd  t  jjk fdd  S )Nc                   S   rk   )NzMask must be bool or uint8r2   r2   r2   r2   r6   re   &  rm   z&meta_masked_scatter_.<locals>.<lambda>c                      r  )NzEmasked_scatter: expected self and source to have same dtypes but got r   r   r2   r   r|  r2   r6   re   *  s
    )rV   rh   r^   r  re  )r   r  r|  r2   r  r6   meta_masked_scatter_#  s   
r  c                 C   s*   t | |\} }tj| tjd}t|||S r   )r%   rV   r   r   r  )r   r  r|  r   r2   r2   r6   meta_masked_scatter0  s   r  c                 C   s
   |  |S r?   r  )r   r  r,  r2   r2   r6   meta_masked_scatter_backward8  r  r  c                 C   r  r?   r2   r  r2   r2   r6   meta_index_put_=  r  r  c                    sn  ddl m}m} t  dkdd  t dkdd  t jjk fdd   } |d |d |d	 }d }	||	ft||d |d	 fd
d |r jtjkpr jtj	kow|tj
k}
t| jkp|
dd  |}n}|sd urt dkdd  t| fdd |S )Nr   )sym_andr  r/   c                   S   rk   r  r2   r2   r2   r2   r6   re   E  rm   z)common_meta_baddbmm_bmm.<locals>.<lambda>c                   S   rk   r  r2   r2   r2   r2   r6   re   F  rm   c                      r  )Nzexpected scalar type z but found r   r2   r+  r2   r6   re   I  r$  r   r1   c                	      r  r  r2   r2   r  r2   r6   re   W  s    c                   S   rk   )Nzfout_dtype only supported for torch.float32 output with float16/bfloat16 inputs or same as input dtypesr2   r2   r2   r2   r6   re   `  rm   c                   S   rk   )Nzself must be a 3D tensorr2   r2   r2   r2   r6   re   h  rm   c                      s   d  d   S )Nz*Expected an input tensor shape with shape z but got shape: r   r2   )r  self_baddbmmr2   r6   re   k  r$  )r<  r  r  rV   rh   r   r^   r   rs  rt  rr  r   r?  )r  r  is_bmmr  rS  r  r  r  res_rowsres_colssupported_out_dtyper   r2   )r  r  r  r  r  r  r  r6   common_meta_baddbmm_bmmB  sH   


r  c                 C   s   t | |dS )NTr  )r   r_  r2   r2   r6   meta_bmmq  s   r  c                 C   s   t | |d|dS )NT)rS  r  )r   r_  rS  r2   r2   r6   meta_bmm_dtypew  r  r  c                 C   s<   | | }| | }|dkrt |dk t |dk kr|d8 }|S r  )r  )r;   r<   qr=  r2   r2   r6   div_rtn}  s
    r  c                 C   sZ   t | | | ||d   d |r|d nd |d }|r+|d | | | kr+|d8 }|S r  )r  )	inputSize
kernelSizer  r  r   r)  r  
outputSizer2   r2   r6   pooling_output_shape_pad_lr  s*   

	r  c                    sl   t |dkdd  t dkfdd t d   d d k fdd t| | |S )Nr   c                   S   rk   )Nzstride should not be zeror2   r2   r2   r2   r6   re     rm   z&pooling_output_shape.<locals>.<lambda>c                      r  )Nz'pad must be non-negative, but got pad: r2   r2   padr2   r6   re     r  r1   r   c                      s   d d d  S )NzApad should be at most half of effective kernel size, but got pad=z, kernel_size=z and dilation=r2   r2   r)  r  r  r2   r6   re     s
   )rV   rh   r  )r  r  r  r   r)  r  r2   r  r6   r    s   r  c              	      sd    }	
tdkodkfdd tdko  dk fdd tdko1dkfdd ddkoFddk}|tjkrbt|dkoZ|oZd	dkfd
d n"t|d	krqddkrq|p}|dko}|o}d	dkfdd td kod kfdd tdkodk	
fdd d S )Nr   c                      rr   )Nz5kernel size should be greater than zero, but got kH: , kW: r2   r2   )r  r  r2   r6   re     rf   z$pool2d_shape_check.<locals>.<lambda>c                      rr   )Nz0stride should be greater than zero, but got dH: , dW: r2   r2   )r  r  r2   r6   re     rf   c                      rr   )Nz9dilation should be greater than zero, but got dilationH: , dilationW: r2   r2   )	dilationH	dilationWr2   r6   re     rf   r1   r   r  r/   c                         d    S )NzExpected 4D (batch mode) tensor expected for input with channels_last layout with optional 0 dim batch size for input, but got: r   r2   r  r2   r6   re     s    c                      r  )NzYExpected 3D or 4D (batch mode) tensor with optional 0 dim batch size for input, but got: r   r2   r  r2   r6   re     r  c                      s   d d d d  S )NzKpad should be smaller than or equal to half of kernel size, but got padW = z	, padH = z, kW = z, kH = r2   r2   )r  r  r  r  r2   r6   re     s    c                      s*   d d  d d d d dS NzGiven input size: (r;   z). Calculated output size: (z). Output size is too smallr2   r2   )r  r  r  r  r  r  r2   r6   re     s    )r   rV   rh   r   r?  )r   r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   
valid_dimsr2   )r  r  r  r  r   r  r  r  r  r  r  r  r  r  r  r6   r    sB   


r  r  r  r  r  r  r  r  pTpHpW	dilationTr  r  r  r  r  r  r  r  r  c              
      s  	j }tdkodkodkfdd tdko&dko& dk fdd tdko<dko<dkfdd t|dv 	fdd t|D ]|dkradkraqVt	dk	fd	d qV|rt
kokok
fd
d td kod kod kfdd tdkodkodk
fdd d S )Nr   c                         d d  d S )Nz5kernel size should be greater than zero, but got kT: z, kH: r  r2   r2   )r  r  r  r2   r6   re        z$pool3d_shape_check.<locals>.<lambda>c                      r&  )Nz0stride should be greater than zero, but got dT: z, dH: r  r2   r2   )r  r  r  r2   r6   re     s   c                      r&  )Nz9dilation should be greater than zero, but got dilationT: z, dilationH: r  r2   r2   )r  r%  r  r2   r6   re     r'  r  c                      r  )Nz/: Expected 4D or 5D tensor for input, but got: r  r2   )r  r   r2   r6   re      rf   r  c                      s     dj  d dS )NzZ: Expected input's non-batch dimensions to have positive length, but input has a shape of z and non-batch dimension z has length zero!)r   r   r2   )r  r   r   r2   r6   re   )  s
   c                      s*   d d  d d d d dS )Nzinput image (T: r  r  z ) smaller than kernel size (kT:  kH:  kW: ru   r2   r2   )r  r  r  r  r  r  r2   r6   re   3  s   r   c                      s(   d d d  d d d S )NzHpad should be smaller than or equal to half of kernel size, but got kT: r)  r(  z padT: z padW: z padH: r2   r2   )r  r  r  r#  r"  r$  r2   r6   re   ;  s   r1   c                      s6   d d d  d d d d d dS r   r2   r2   )r  r  r  r  r  r  r  r2   r6   re   C  s   )r   rV   rh   r   r   )r   r  r  r  r  r  r  r  r"  r#  r$  r%  r  r  r  r  r  r  r  r  r  r  r   r2   )r  r  r  r  r%  r  r  r   r  r   r  r  r  r  r  r  r  r  r  r#  r"  r$  r6   r    sJ   	"r  c                 C   s   | j }t| |||||||	|
|||||||||||| t|||d | t|||d | t|||d | t|||d | t|||d | t|||d | t|||d | t|||d | d S )Nr  r/   r   r1   r   r  r  )r   r  r   r  r  r  r  r  r  r  r"  r#  r$  r%  r  r  r  r  r  r  r  r  r  r   r2   r2   r6   max_pool3d_backward_shape_checkK  s@   r+  c                 C   s   | j }t| ||||||||	|
|ddd|||||||d t|||d | t|||d | t|||d | t|||d | d S )Nr1   Tr  r/   r   r*  )r   r  r  r  r  r  r  r  r  r"  r#  r$  r  r  r  r  r  r  r  r   r2   r2   r6   r    s:   r  c                 C   sB  dd }|d|\}}t t|dv dd  t|dkr#||}	}
n|d|\}	}
|d	|\}}|d
|\}}| d}| d}| d}t| }|t jkr^t |  dkdd  n|t jkrpt |  dv dd  nt ddd  t	||||	||}t	||||
||}t
| |||	|
|||||||||| |||fS )Nc                    r  )Nr  c                      r>  )Nzmax_pool2d: r  r2   r2   r  r2   r6   re     r   zEmax_pool2d_checks_and_compute_shape.<locals>.unpack.<locals>.<lambda>r   r1   r  r  r2   r  r6   r    r  z3max_pool2d_checks_and_compute_shape.<locals>.unpackr7  r  c                   S   rk   )NzOmax_pool2d: stride must either be omitted, a single int, or a tuple of two intsr2   r2   r2   r2   r6   re     rm   z5max_pool2d_checks_and_compute_shape.<locals>.<lambda>r   r   r  r)  r  r  r   r  c                   S   rk   )NzMnon-empty 4D (batch mode) tensor expected for input with channels_last layoutr2   r2   r2   r2   r6   re     rm   r  c                   S   rk   )Nz9non-empty 3D or 4D (batch mode) tensor expected for inputr2   r2   r2   r2   r6   re     rm   Fc                   S   rk   )NzAUnsupported memory format. Supports only ChannelsLast, Contiguousr2   r2   r2   r2   r6   re     rm   )rV   rh   r   r   rQ   r   r?  r   r   r  r  )r   r7  r   r  r)  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r2   r2   r6   r    sb   		









r  c                    s   t |||||\}tj jk fdd |jfdd}	|	  |	| t}
tjjjj	|
dS )Nc                      r  )NzExpected dtype z  for `gradOutput` but got dtype r   r2   r  r2   r6   re   !  r$  z7meta_max_pool2d_with_indices_backward.<locals>.<lambda>c                    s:   t | d   t | d  t | d  d S )Nr/   r   r1   )r  )r  )r  r   r  r  r2   r6   _check_dim_size'  s   z>meta_max_pool2d_with_indices_backward.<locals>._check_dim_sizer  )
r  rV   rh   r^   r   rQ   r   r   r   r~   )r  r   r7  r   r  r)  r  r   r  r,  r   r2   )r  r  r   r  r  r   r6   %meta_max_pool2d_with_indices_backward  s.   

r-  c                 C   s   t | |||||\}}}|  dkr| dnd}	t| }
|  dkr*|||g}n|	|||g}tj|| j| j|
dtj|tj	| j|
dfS r  )
r  r   r   rQ   r   rV   r   r^   r~   r   r  r2   r2   r6   meta_max_pool2d_with_indices8  s2   
r.  c           
         s  t jdv fdd j}t|d |D ] t  dk fdd qt tdkdd  t t|dkd	d  d
}dd|dkr[d}nd}t jjkdd  t jdkfdd d}d}d t ||kdd  t ||kdd  t  dk fdd t |d d  d kfdd t |d d  d kfdd  dkr|||d |d g}	n	||d |d g}	t j|	jj	dt j|	t j
j	dfS )Nr  c                      r   )Nz:fractional_max_pool2d: Expected 3D or 4D tensor, but got: r   r2   r   r2   r6   re   c  r   z,meta_fractional_max_pool2d.<locals>.<lambda>r/   r   c                      s   d   d  dS )Nz_fractional_max_pool2d: Expected input to have non-zero  size for non-batch dimensions, but got r  z emptyr   r2   )r   r   r2   r6   re   j  s
    r   c                   S   rk   )NzNfractional_max_pool2d: kernel_size musteither be a single int or tuple of Intsr2   r2   r2   r2   r6   re   q  rm   c                   S   rk   )NzOfractional_max_pool2d: output_size must either be a single int or tuple of Intsr2   r2   r2   r2   r6   re   v  rm   r  r  r   r  r1   c                   S   rk   )Nz6Expect _random_samples to have the same dtype as inputr2   r2   r2   r2   r6   re     rm   c                      r   )Nz1Expect _random samples to have 3 dimensions got, r   r2   )random_samplesr2   r6   re     r   c                   S   rk   )Nz=Expect _random_samples.size(0) no less then input batch size.r2   r2   r2   r2   r6   re     rm   c                   S   rk   )Nz<Expect _random_samples.size(1) equals to input channel size.r2   r2   r2   r2   r6   re     rm   c                      r>  )Nz/Expect _random_samples.size(2) equals to 2 got .r2   r2   )r   r2   r6   re     r   c                         dd  d  S )Nz%fractional_max_pool2d: kernel height r   z' is too large relative to input height r2   r2   )input_heightr7  r2   r6   re     r$  c                      r1  )Nz$fractional_max_pool2d: kernel width r1   z& is too large relative to input width r2   r2   )input_widthr7  r2   r6   re     r$  r   )rV   rh   r   r   r   r   r^   r   r   r~   r   )
r   r7  r  r/  r   input_channelsinput_batchr  cr   r2   )r   r2  r3  r7  r/  r   r6   meta_fractional_max_pool2d_  s   










r7  c                 C   s  t t|dv dd  |d }t|dkr|n|d }t|dkr$|n|d }t | p2t|dv dd  |s;|n|d }	|sC|nt|dkrK|	n|d }
|sS|nt|dkr[|	n|d }t t|dv dd  |d }t|dkrw|n|d }t|dkr|n|d }t t|dv d	d  |d }t|dkr|n|d }t|dkr|n|d }t | jd
v dd  | jdkr| dnd}| d}| d}| d}| d}t||||	||}t||||
||}t||||||}t| |||||	|
|||||||||||||d | jdkot| t j	k}| jdkr(||||f}n|||||f}| 
|}| j
|t jd}|rM|jt j	d}|jt j	d}||fS )Nr  c                   S   rk   NzMmax_pool3d: kernel_size must either be a single int, or a tuple of three intsr2   r2   r2   r2   r6   re     rm   z.meta_max_pool3d_with_indices.<locals>.<lambda>r   r1   r   c                   S   rk   NzQmax_pool3d: stride must either be omitted, a single int, or a tuple of three intsr2   r2   r2   r2   r6   re     rm   c                   S   rk   NzImax_pool3d: padding must either be a single int, or a tuple of three intsr2   r2   r2   r2   r6   re     rm   c                   S   rk   NzJmax_pool3d: dilation must be either a single int, or a tuple of three intsr2   r2   r2   r2   r6   re     rm   r  c                   S   rk   r  r2   r2   r2   r2   r6   re     rm   r  r  r  r  r   zmax_pool3d_with_indices()r  r   r   )rV   rh   r   r   r   r  r  rQ   r   rW  r   r   r?  )r   r7  r   r  r)  r  r  r  r  r  r  r  r"  r#  r$  r%  r  r  r  r  r  r  r  r  r  r  r?  r   r   r   r2   r2   r6   meta_max_pool3d_with_indices  s   

  






r=  c                 C   s.  t t|dv dd  |d }t|dkr|n|d }	t|dkr$|n|d }
t | p2t|dv dd  |s;|n|d }|sC|	nt|dkrK|n|d }|sS|
nt|dkr[|n|d }t t|dv dd  |d }t|dkrw|n|d }t|dkr|n|d }t t|dv d	d  |d }t|dkr|n|d }t|dkr|n|d }t |jd
v dd  |d}|d}|d}|d}| d}| d}| d}t|| ||||	|
|||||||||||||||d |jdkot|t jk}|	|j
}|r|jt jd}|S )Nr  c                   S   rk   r8  r2   r2   r2   r2   r6   re   &  rm   z7meta_max_pool3d_with_indices_backward.<locals>.<lambda>r   r1   r   c                   S   rk   r9  r2   r2   r2   r2   r6   re   .  rm   c                   S   rk   r:  r2   r2   r2   r2   r6   re   6  rm   c                   S   rk   r;  r2   r2   r2   r2   r6   re   >  rm   r  c                   S   rk   r  r2   r2   r2   r2   r6   re   F  rm   r  r  r  r   z"max_pool3d_with_indices_backward()r  r   )rV   rh   r   r   r   r+  rQ   r   rW  r   r   r?  )r  r   r7  r   r  r)  r  r   r  r  r  r  r  r  r"  r#  r$  r%  r  r  r  r  r  r  r  r  r  r?  r  r2   r2   r6   %meta_max_pool3d_with_indices_backward  s   
  








r>  gridc                    s   t j jk fdd t jt jko jt jk fdd t jd  jd k fdd t  jd jd k fdd tdjD ]t j dkfd	d qPd S )
Nc                      r  )NzNgrid_sampler(): expected input and grid to be on same device, but input is on z and grid is on r  r2   r?  r   r2   r6   re   |  r  z+check_grid_sampler_common.<locals>.<lambda>c                      r  )NzTgrid_sampler(): expected input and grid to have torch.strided layout, but input has z and grid has )r}   r2   r@  r2   r6   re     r  r   c                      r  )NzZgrid_sampler(): expected grid and input to have same batch size, but got input with sizes  and grid with sizes r  r2   r@  r2   r6   re     r  r   r   c                      s   dj d  d j S )Nz+grid_sampler(): expected grid to have size r   z, in last dimension, but got grid with sizes )r   r   r2   r@  r2   r6   re     s   c                      r  )NzYgrid_sampler(): expected input to have non-empty spatial dimensions, but input has sizes r  r  r  r2   r  r2   r6   re     r  )rV   rh   r~   r}   rz  r   r   r   )r   r?  r2   )r?  r   r   r6   check_grid_sampler_commony  s,   
rB  c                   @   s   e Zd ZdZdZdZdS )GridSamplerInterpolationr   r1   r   N)rw   
__module____qualname__BILINEARNEARESTBICUBICr2   r2   r2   r6   rC    s    rC  interpolation_modec                    sP   t jdkoj jk fdd t jdko |tjjk dd  d S )Nr  c                      r  )Nzdgrid_sampler(): expected 5D input and grid with same number of dimensions, but got input with sizes rA  r  r2   r@  r2   r6   re     s
   z'check_grid_sampler_3d.<locals>.<lambda>c                   S   rk   )Nz<grid_sampler(): bicubic interpolation only supports 4D inputr2   r2   r2   r2   r6   re     rm   )rV   rh   r   rC  rH  r  )r   r?  rI  r2   r@  r6   check_grid_sampler_3d  s   

rJ  c           
      C   s:   |d }|rt j|t jd}nd }t j|t jd}	||	fS Nr   r   )rV   r  r   r   
r  r   r?  rI  padding_modealign_cornersr%  input_requires_gradr  	grad_gridr2   r2   r6   grid_sampler_2d_backward_meta  s   
rQ  c           
      C   s\   t | | t| || | jd }| jd }|jd }|jd }|jd }	| |||||	fS )Nr   r1   r   r/   )rB  rJ  r   r   )
r   r?  rI  rM  rN  r  Cout_Dout_Hout_Wr2   r2   r6   grid_sampler_3d  s   
	




rV  rP  c           
      C   sP   t || t||| |d }|rtj|tjd}nd }tj|tjd}	||	fS rK  )rB  rJ  rV   r  r  r   rL  r2   r2   r6   grid_sampler_3d_backward  s   
rW  c                 O   s8   | d}|st|}||d< tj| g|R i |S )Nr^   )r]   rQ   	get_dtyperV   r   )r   r  rS   r0  r^   r2   r2   r6   full  s
   

rY  c                 C   s   |t jkrJt |d u dd  t jd|d u r| jn|||d u r"| jn||d}| jr8||  | 	 | 
  n||  |  d |d |S tjj| |||||d}|d |S )Nc                   S   rk   )Nz9memory format option is only supported by strided tensorsr2   r2   r2   r2   r6   re     rm   zzeros_like.<locals>.<lambda>r   r  Tr  )rV   
sparse_coorh   r   r^   r~   	is_sparsesparse_resize_and_clear_r   
sparse_dim	dense_dimr   _coalesced_r,   r   defaultfill_)r   r^   r}   r~   r   r   rn  r2   r2   r6   r    s:   
	

	r  r|   c                C   B   |d u rt  }|d u rt  }|d u rt j}t j| ||||dS r  rV   r   get_default_devicerz  r   r   r^   r}   r~   r   r   r2   r2   r6   	meta_ones/     
rf  c                C   rb  r  rc  re  r2   r2   r6   
meta_zerosE  rg  rh  c                 C   r  r?   _scatter_meta_output)r   rB  r   r   r2   r2   r6   meta_select_scatter[     rk  c                 C   r  r?   ri  )r   rB  r   ry   rx   stepr2   r2   r6   meta_slice_scatter`  rl  rn  c                 C   s4   ddl m} || st| dkr|  S t| S )Nr   r:  r1   )r<  r;  rV   r=  r   rQ   clone_preserve_strides)r   r;  r2   r2   r6   rj  e  s   
rj  dim_post_exprwrap_scalarc                 C   sn   |dkr|st d| dd}| }|d }| |k s| |kr-t d|  d| d| d| dk r5| |7 } | S )	Nr   zdim_post_expr=z <= 0 but wrap_scalar is Falser1   zdim z out of bounds (rt   ru   rq  )r   rp  rq  r   r  r2   r2   r6   r   q  s   
r   c                 C   s   |   dkrdS | j| S r  r  )r  r   r2   r2   r6   ensure_nonempty_size  s   rr  c                    st   t  d}t  d}t||kdd  t|D ] kr7tttk fdd qd S )Nr1   c                   S   rk   )NzDIndex tensor must have the same number of dimensions as input tensorr2   r2   r2   r2   r6   re     rm   z$gather_shape_check.<locals>.<lambda>c                      s$   d dj  dj  d   S )Nz!Size does not match at dimension z expected index  to be no larger than self  apart from dimension r  r2   r   r   r   r   r2   r6   re     s    )r  r   rV   rh   r   rr  )r   r   r   	self_dims
index_dimsr2   ru  r6   gather_shape_check  s   rx  c                    sn   ddl m} t||  }|  dk}|s1t jtjkp$ jtj	k fdd t
| |  |  jS )Nr   rP  c                      r   )Nz8gather(): Expected dtype int32/int64 for index, but got r   r2   r   r2   r6   re     r   zmeta_gather.<locals>.<lambda>)r<  rQ  r   r   r   rV   rh   r^   r   r   rx  r   r   )r   r   r   sparse_gradrQ  wrapped_dimis_index_emptyr2   r   r6   meta_gather  s   
r|  c                 C   s   |r*| dkrdS | dkrdS | dkrdS | dkrdS | d	kr d
S t ddd  d S | dkr0dS | dkr6dS t ddd  d S )Nrp  
REDUCE_ADDr  REDUCE_MULTIPLYr0  REDUCE_MEANamaxREDUCE_MAXIMUMaminREDUCE_MINIMUMFc                   S   rk   )Nz=reduce argument must be either sum, prod, mean, amax or amin.r2   r2   r2   r2   r6   re     rm   z#get_operator_enum.<locals>.<lambda>addmultiplyc                   S   rk   )Nz/reduce argument must be either add or multiply.r2   r2   r2   r2   r6   re     rm   r@  )reduce_use_new_optionsr2   r2   r6   get_operator_enum  s,   r  c                    sp   ddl m} || dkr"t|jtjkp|jtjk fdd |d ur6t|j|jk fdd d S d S )Nr   )r  c                      
     dS )Nz((): Expected dtype int32/int64 for indexr2   r2   method_namer2   r6   re     r  z,scatter_gather_dtype_check.<locals>.<lambda>c                      r  )Nz0(): Expected self.dtype to be equal to src.dtyper2   r2   r  r2   r6   re     r  )r<  r  r   rV   rh   r^   r   r   )r  r   r   src_optr  r2   r  r6   scatter_gather_dtype_check  s   


r  c                 C   s
   t | dS r0   )r  r   r2   r2   r6   ensure_nonempty_dim     
r  c           
         s  ddl m} | dkrd S tt t kdd  t }t|D ]}| kr3q,t|}t|}t||k fdd q,d urtt t kdd  t|D ]}t|}t|}	t||	kfdd qfd S d S )Nr   rP  c                   S   rk   )NzCIndex tensor must have the same number of dimensions as self tensorr2   r2   r2   r2   r6   re     rm   z%scatter_shape_check.<locals>.<lambda>c                      s   dj  dj  d   S )NExpected index rs  rt  r  r2   )r   r   r   r2   r6   re     s    c                   S   rk   )NzBIndex tensor must have the same number of dimensions as src tensorr2   r2   r2   r2   r6   re     rm   c                      r  )Nr  z to be no larger than src r  r2   )r   r  r2   r6   re     r$  )	r<  rQ  r   rV   rh   r  r   r   rr  )
r   r   r   r  rQ  rv  r   index_d_sizeself_d_size
src_d_sizer2   )r   r   r   r  r6   scatter_shape_check  s>   



r  c                 C   sD   t ||  }td| || t| ||| |d ur t|| d S d S )Nscatter)r   r   r  r  r  )r   r   r   rB  r  r  rz  r2   r2   r6   scatter_meta_impl  s   r  c                 C   s   t | |||d | | jS Nr  r  r   r   r   r   r   rB  r2   r2   r6   meta_scatter_add  s   r  c                 C   s   t | |||d | S r  r  r  r2   r2   r6   meta_scatter_add_  r  r  c                 C   s0   t |tjr|nd }t| |||| | | jS r?   )rn   rV   r
   r  r   r   r   r   r   src_or_valuer}  rB  r2   r2   r6   meta_scatter  s   
r  c                 C   s(   t |tjr|nd }t| |||| | S r?   )rn   rV   r
   r  r  r2   r2   r6   meta_scatter_#  s   	r  queryr   r  	dropout_p	is_causalreturn_debug_maskc              	   C   sB  |  d}|  d}|  d}	|  d}
| d}t| }tj|||	ftj| jd}|rX|
dkr3dnd}t|	| }|dkrCd}n|dkrId}tj|||	|f| j| jd}n
tjd| j| jd}tj	j
rktj sqt| d	krtjd
tjdd}tjd
tjdd}ntjdtjdd}tjd
tjdd}||d d |	||||f	S )Nr   r1   r   r/   r   @         r   r2   r{   )r   rV   r   r   rZ   r~   r  ceilr^   r5  r6  r   r   r   r   r  )r  r   r  r  r  r  r  r   	num_headsmax_seqlen_batch_qhead_dimmax_seqlen_batch_k	attention	logsumexpblocksize_cmax_seqlen_k
debug_maskseedoffsetr2   r2   r6   (meta__scaled_dot_product_flash_attention1  sN   







r  	q_descale	k_descale	v_descalec
           
      C   s,   | j tjkr| tj} t| ||||||	S r?   )r^   rV   ru  r?  rt  r  )
r  r   r  r  r  r  r  r  r  r  r2   r2   r6   2meta__scaled_dot_product_flash_attention_quantizedq  s   r  	res_shape.c                    s   t jkrt}|S tg dfdddd fdd D } fddtt D }tj|jj	d	
|}|S )
N)r   r1   r   r/   c                    s      |  S r?   r  )idx)r  r2   r6   re     r   z,alloc_with_matching_layout.<locals>.<lambda>Tr   c                    s   g | ]} | qS r2   r2   )rL   r  )r  r2   r6   rO     r$  z.alloc_with_matching_layout.<locals>.<listcomp>c                    s   g | ]}  |qS r2   r   r  )	dim_orderr2   r6   rO     rP   r   )rg   r   rV   r   r   r   r   r   r^   r~   r   )r  r  rn  permuted_shapefinal_permuter2   )r  r  r  r6   alloc_with_matching_layout  s   

r  	attn_biascompute_log_sumexpc	              	   C   s   |  d}	|  d}
|  d}| d}| d}|	|
||f}t| |}tj|	|
|dftj| jd}tjdtjdd}tjdtjdd}||d d ||||d f	S )Nr   r1   r   r   r   r2   r{   )r   r  rV   r   rZ   r~   r   )r  r   r  r  r  r  r  r  r  r  r  S_QS_KVD_Vr  rn  
logsum_expr  r  r2   r2   r6   (meta__scaled_dot_product_cudnn_attention  s0   






r  c              	   C   s   d\}}	}
|   dkr|  \}}	}
}n|   dkr$|  \}	}
}d}ntd|d}|d}t| j}||d< t| t|}tj||	|
ftj	| j
d}tjd	tjd
d}tjd	tjd
d}||d d |
|||d f	S )N)r   r   r   r  r/   r1   zquery must be 3D or 4Dr  r   r   r2   r{   )r   r   r>  r   r   r  rg   rV   r   rZ   r~   r   )r  r   r  r  r  r  r  r  r  H_Qr  rT   r  r  r   rn  r  r  r  r2   r2   r6   5meta__scaled_dot_product_fused_attention_overrideable  s:   



r  )r  r  grad_input_maskr  	cum_seq_q	cum_seq_kmax_qmax_kphilox_seedphilox_offsetc                C   s@   t |}t |}t |}|d urt |nd }||||fS r?   r  )r  r  r   r  r  r  r   r  r  r  r  r  r  r  r  r  r  grad_qgrad_kr  grad_attn_biasr2   r2   r6   >meta__scaled_dot_product_fused_attention_overrideable_backward  s
   


r  c                 C   s(   t |}t |}t |}|||fS r?   r  )r  r  r   r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r2   r2   r6   'meta__scaled_dot_product_flash_backward  s   



r  	attn_maskc                 C   sR   |  d}|  d}|  d}	t| }
tj||	|ftj| jddd}|
|fS )Nr   r1   r   r   )r   rV   r   r   rZ   r~   r  )r  r   r  r  r  r  r  r   r  r  r  r  r2   r2   r6   0meta__scaled_dot_product_flash_attention_for_cpu7  s"   




r  c
                 C   sX   t j| d|j|jd}
t j| d|j|jd}t j| d|j|jd}|
||fS )Nr   r   r1   r/   r   )rV   empty_permutedr   r^   r~   )r  r  r   r  r   r  r  r  r  r  r  r  r  r2   r2   r6   9meta__scaled_dot_product_flash_attention_for_cpu_backwardY  s&   
r  dropout_mask
enable_gqac	                    sh   dd }	|	\|	|\}
}|	|\}}j \ }|j \}} fdd}| S )Nc                 S   s|   |   dkr| ddfS |   dkr:d}t|   d D ]	}|| j| 9 }q| || d| d| ddfS | d	fS )
Nr/   r   Tr  r1   r  r  r   F)r   rf  r   r   viewr   )r;   r   r   r2   r2   r6   	ensure_4d  s   &zBmeta__scaled_dot_product_attention_math_for_mps.<locals>.ensure_4dc                     s     f}   f}rV dkr(| d} |d}| |fS tjd d t| jdd  }tjd d t|jdd  }| |} ||}| |fS )Nr/   r   r  r1   r  )r   r   squeezer   r   r  )r   attnr   
attn_shaper   max_seq_lengthnum_headq_q_sizer  
unsqueezedvalue_head_sizer2   r6   sdpa_general_mps  s   

$$

zImeta__scaled_dot_product_attention_math_for_mps.<locals>.sdpa_general_mpsr  )r  r   r  r  r  r  r  r  r  r  k_rT   v_r  r2   r  r6   /meta__scaled_dot_product_attention_math_for_mps  s   r  c                 C   s   |  dd} | dd}| dd}| d}| d}	| d}
|d}tj||	|
|| j| jd}tjjrDtj	 rD	 |rA|	nd}n|rOt
|	d d nd}tj||
|ftj| jd}| dd}tjdtjd	d}tjdtjd	d}||||fS )
Nr1   r   r   r  r   r   rN  r2   r{   )r  r   rV   r   r^   r~   r5  r6  r   r   r  r  rZ   r   )r  r   r  r  r  r  r  r  r  rx  r  Kvrn  logsumexp_dimr  r  r  r2   r2   r6   ,meta__scaled_dot_product_efficient_attention  s*   



r  c                 C   s  | d}| d}| d}| d}| d}| d}tj||||fd|j|jd}tj||||fd|j|jd}tj||||fd|j|jd}d }|d ur|
d r| d}|d dkrb|n|d |d  }t|  }||d< tj||j|jd}|d	d |f }||||fS )
Nr   r1   r   r/   r  r   r   rM  .)r   rV   r  r^   r~   r   r   )r  r  r   r  r  r   r  r  r  r  r  r  r  r   r  r  r  
head_dim_vr  r  r  r  	grad_biaslastDimlastDimAligned	new_sizesr2   r2   r6   +meta__scaled_dot_product_efficient_backward  sF   









 
r  c                 C   s(   t |}t |}t |}|||fS r?   r  )r  r  r   r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r2   r2   r6   'meta__scaled_dot_product_cudnn_backward  s   



r  window_size_leftwindow_size_right	seqused_kalibi_slopesblock_tabler  c                 C   s  |d u r	|  dn| d }|d u r|  dn|}|d u r#| dn|}|  d}|  d}t| }|d u rFtj|||ftj| jd}n|  d}tj||ftj| jd}|	r|dkr_dnd}t|| }|dkrod}n|dkrud}tj||||f| j	| jd}n
tjd| j	| jd}d	\}}tj
jrtj rtjd
tjdd}tjd
tjdd}ntjdtjdd}tjd
tjdd}|||||fS )Nr   r1   r  r   r   r  r  r  NNr2   r{   r   )r   r   rV   r   r   rZ   r~   r  r  r^   r5  r6  r   r   r   r  )r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r  total_qr  r  r  r  r  r2   r2   r6   meta__flash_attention_forward9  sR   




r  c                 C   s4   t |||||||||	|
||||||\}}}}}|S r?   )r  )r   r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rT   r  r2   r2   r6   0meta__flash_attention_forward_no_dropout_inplace  s&   r  c                 C   s<   | j tjkr| tj} t| |||||||||	|||||S r?   )r^   rV   ru  r?  rt  r  )r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r2   r2   r6   'meta__flash_attention_forward_quantized  s&   r  c                 C   s(   t |}t |}t |}|||fS r?   r  )r  r  r   r  r   r  r  r  r  r  r  r  r  r  r  r  r  
grad_querygrad_key
grad_valuer2   r2   r6   meta__flash_attention_backward  s   



r  cu_seqlens_qcu_seqlens_kmax_seqlen_qr  custom_mask_typecausal_diagonalseqlen_kwindow_sizec                 C   s   |  d}|  d}| d}|  d}| d}tj||||| j| jd}|d ur1| dd n|}|}|d urC|d u rAtd|}|d urI|n|}|
rVt|d d nd}tj|||ftj| jd}tjdtj	d	d}tjdtj	d	d}||||||fS )
Nr   r1   r  r   r   z;max_seqlen_q must not be None when cu_seqlens_q is providedrN  r2   r{   )
r   rV   r   r^   r~   r   r  r  rZ   r   )r  r   r  rQ  r  r  r  r  r  r  r  r  r  r  r	  r  rx  r  r  r  rn  logsumexp_batch_dimactual_max_seqlen_qactual_max_seqlen_kr  r  r  r  r2   r2   r6   !meta__efficient_attention_forward  s2   




r  bias_requires_gradnum_splits_keyshared_storage_dqdkdvc                 C   sL  |rSt |jd |jd kdd  t |jd |jd kdd  t jg |jdd d|jd |jd R |j|jd	}|d
d}|d
d}|d
d}nt |}t |}t |}|d ur|d}|d dkrs|n|d |d  }t	| }||d< t j||j|jd	}|dd |f }nt jd|jd}||||fS )Nr1   c                   S   rk   )Nz,seqlen must match for `shared_storage_dqdkdvr2   r2   r2   r2   r6   re   R  rm   z4meta__efficient_attention_backward.<locals>.<lambda>r/   c                   S   rk   )Nz3embedding dim must match for `shared_storage_dqdkdvr2   r2   r2   r2   r6   re   V  rm   r   r  r   r   r  r   rM  .r2   r  )
rV   rh   r   r   r^   r~   r  r   r   r   )r  r  r   r  rQ  r  r  r  r  r  r  r  r  r  r  r  r  r  chunkr  r   r  r  r  r  r  r2   r2   r6   "meta__efficient_attention_backward6  s:   *



 r  scale_ascale_bscale_resultuse_fast_accumc                    s  dd }t  dko dkfdd t |jo$|jfdd tdks?tdks?td	kr4d
d }	dd }
dd }td	krt |	 p\|fdd t |
 pn|fdd t dd dkfdd t dd dkodd dkfdd j\ djt jkojt jkpjt j	kojt j	k}
 dkr
 dkrt jt jkoۈjt jkdd  nR|rOjt j	krd} d  nd}jt jkr d  d}t |}t|dd }|t| | |t| | 
 krA
 krAt  dd  t  dd  nt dfdd nt jt jko]jt jkdd  t  dkop dkfd d dkrddkrddkrdkrt  o d!d  ndkrӈdd  krt dkrn ndtdkrnadkrdd  krt dkrn n	dkrn9dtdkr&dd  krt dkr&n n	dkr&nt d fd"d |d ur;|nj}t jdd|jd#S )$Nc                 S      | t jt jt jt jt jfv S r?   rV   ru  float8_e5m2float8_e4m3fnuzfloat8_e5m2fnuzfloat4_e2m1fn_x2r   r2   r2   r6   is_fp8_or_fp4_type|     z2_check_scaled_mm_sizes.<locals>.is_fp8_or_fp4_typer   c                         d   d    S Nz%Inputs must be 2D but got self.dim()=z and mat2.dim()=r   r2   r_  r   r2   r6   re     r   z(_check_scaled_mm_sizes.<locals>.<lambda>c                      r  Nz?Expected both inputs to be fp8 or fp4 types but got self.dtype=z and mat2.dtype=r   r2   r!  r2   r6   re     r$  r   r   r   c                 S      | d | d ko| d dkS r  r2   r  r2   r2   r6   is_row_major     z,_check_scaled_mm_sizes.<locals>.is_row_majorc                 S      | d dko| d dkS r  r2   r  r2   r2   r6   is_col_major  r  z,_check_scaled_mm_sizes.<locals>.is_col_majorc                 S      |  ddkp|  ddkS r  r   	tensor_2dr2   r2   r6   has_zero_dim  r%  z,_check_scaled_mm_sizes.<locals>.has_zero_dimc                      r  Nz#self must be row_major, got stride r  r2   r   r2   r6   re     r  c                      r  Nz#mat2 must be col_major, got stride r  r2   r_  r2   r6   re     r  r1   rM  r   c                         d  d S NzBExpected self.size(1) to be divisible by 16, but got self.size(1)=r1   r   r2   r   r2   r6   re     rf   c                      r   Nz?Expected both dimensions of mat2 to be divisible by 16 but got r  r2   r.  r2   r6   re     r   c                   S   rk   )NzNFor tensorwise scaling, both scale_a and scale_b must be float (fp32) tensors.r2   r2   r2   r2   r6   re     rm   rN  r  r  c                   S   rk   )Nzscale_a must be contiguousr2   r2   r2   r2   r6   re     rm   c                   S   rk   )Nzscale_b must be contiguousr2   r2   r2   r2   r6   re     rm   Fc                	      s&   d  d   d d   d	S )NzTInvalid blockwise scaling configuration. For blockwise scaling, scale_a should have  elements, got z, scale_b should have r0  r  r2   )expected_a_sizeexpected_b_sizer  r  r2   r6   re     s   c                   S   rk   )NzKFor rowwise scaling, both scale_a and scale_b must be float (fp32) tensors.r2   r2   r2   r2   r6   re     rm   c                      s   d   d  S )NzLFor non-tensorwise scaling, scale tensors must be 2D, but got scale_a.dim()=z and scale_b.dim()=r   r2   r  r  r2   r6   re     r   c                   S   rk   )Nz@Both scale_a and scale_b must be contiguous for rowwise scaling.r2   r2   r2   r2   r6   re     rm   c                      s   d d d dt  d d	dt  d dt d d dt  d d	 dt  d d d	t d dt  d d	 dt  d d d
d dd dd dd d S )N}Invalid scaling configuration. For tensorwise scaling, both scales should be scalar. For rowwise scaling, scale_a should be (, 1), scale_b should be (1, >). For (BlockWise1x128, BlockWise128x128), scale_a should be (rt   r  ), scale_b should be (<). For (BlockWise1x128, BlockWise1x128), scale_a should be (z>). For (BlockWise128x128, BlockWise1x128), scale_a should be (). Got scale_a.size()=(r   r1   ) and scale_b.size()=(ru   r7   r   r2   )_krx  r  r  r  r2   r6   re   #  s>   r   )rV   rh   r   r^   r   r   r   r   float8_e8m0fnuru  r   rr  r  r7   rv  r   r~   )r   r_  r  r  rQ  r  rS  r  r  r$  r'  r+  is_blockwise_scalingblock_size_kblock_size_mnnum_k_blockspadded_num_k_blocks
_out_dtyper2   )	r?  r3  r4  rx  r_  r  r  r  r   r6   _check_scaled_mm_sizesr  s   
	


"






	... rG  c              	   C   s   t | |||||||S r?   )rG  )r   r_  r  r  rQ  r  rS  r  r2   r2   r6   meta_scaled_mm6  s   rH  scale_recipe_ascale_recipe_b	swizzle_a	swizzle_bc              
      sr
  dd }dd }t  dko dkfdd t |jo(|jfdd jd	 jd
  jd
 |jrO|jrOd} |9  dd |D }dd |D }rgdd D ntjgrudd D ntjgtdkstdkstdkr%dd }dd }dd }tdkrt | p|fdd t | p|fdd t 	d
d d	kfdd t 	d	d d	ko	d
d d	kfdd dt
t dt
t fd d!}dt
t dt
t fd"d#}dt
t dt
t fd$d%}dt
t dt
t fd&d'}dt
t dt
t fd(d)}dt
t dt
t fd*d+}dt
t dt
t fd,d-}dt
t dt
t fd.d/}|||rt 
d	  d
kod	  d
ko
d	 jt jkod	 jt jkd0d  n|||rt 
d	 jd	 ko
d	  ko
d	 jt jkod	  kod	 jt jk
fd1d n_|||r]
d	 jt jko܈d	 jt jk}
d	 jd	 kojd
  d2 kod	d
kod
kpjd
 d
kod
d
k}d	 		jd	 koF	jd
  d2 koF	d	d
koF	d
kpF	jd
 d
koF	d
d
k}t |oP|oP| 	fd3d n|||r
d	 jt jkosd	 jt jk}t d2 d4
d	 jd	 kojd
 d2 kod	d
kod
kpjd
 d
kod
d
k}d	 		jd	 ko	jd
  d2 ko	d	d
ko	d
kp	jd
 d
ko	d
d
k}t |o|o| 	fd5d n)|||r
d	 jt jkod	 jt jk}t d2 d4
d	 jd	 koNjd
  d2 koNd	d
koNd
kpNjd
 d
koNd
d
k}d	 		jd	 ko	jd
 d2 ko	d	d
ko	d
kp	jd
 d
ko	d
d
k}t |o|o| 	fd6d n|||r-t jjrtjd	 d7jd
  tjd
 d7jd	  tjn)tjd	 d2ttjd
 d7d4 tjd
 d2ttjd
 d7d4 tjt 
d	  ko
d	 jt jkod	  kod	 jt jkod	 kod	 k
fd8d n|||rtd2tt dd4 td2tt dd4 tjt 
d	  ko
d	 jt jkod	  kod	 jt jkod	 kod	 k
fd9d n|||rtd2tt dd4 td2tt dd4 tjt 
d	  ko	
d	 jt jko	
d
  d
ko	
d
 jt jko	d	  ko	d	 jt jko	d
  d
ko	d
 jt jko	d	 ko	d	 k
fd:d nt d; 
fd<d |d ur,|nj}t j|jd=S )>Nc                 S   r  r?   r  r   r2   r2   r6   r  S  r  z5_check_scaled_mm_sizes_v2.<locals>.is_fp8_or_fp4_typec                 S   s
   | t jkS r?   )rV   r  r   r2   r2   r6   is_fp4_type\  r  z._check_scaled_mm_sizes_v2.<locals>.is_fp4_typer   c                      r  r   r   r2   r!  r2   r6   re   a  r   z+_check_scaled_mm_sizes_v2.<locals>.<lambda>c                      r  r"  r   r2   r!  r2   r6   re   e  r$  r   r1   c                 S      g | ]}t |qS r2   r'   rL   sir2   r2   r6   rO   t  r$  z-_check_scaled_mm_sizes_v2.<locals>.<listcomp>c                 S   rN  r2   rO  rP  r2   r2   r6   rO   u  r$  c                 S   rN  r2   r(   rP  r2   r2   r6   rO   x  r$  c                 S   rN  r2   rR  rP  r2   r2   r6   rO   ~  r$  r   r   r   c                 S   r#  r  r2   r  r2   r2   r6   r$    r%  z/_check_scaled_mm_sizes_v2.<locals>.is_row_majorc                 S   r&  r  r2   r  r2   r2   r6   r'    r  z/_check_scaled_mm_sizes_v2.<locals>.is_col_majorc                 S   r(  r  r   r)  r2   r2   r6   r+    r%  z/_check_scaled_mm_sizes_v2.<locals>.has_zero_dimc                      r  r,  r  r2   r   r2   r6   re     r  c                      r  r-  r  r2   r.  r2   r6   re     r  rM  c                      r/  r0  r   r2   r   r2   r6   re     rf   c                      r   r1  r  r2   r.  r2   r6   re     r   recipe_arecipe_bc                 S   4   t | dkot |dko| d tjko|d tjkS r  )r   r'   
TensorWiserS  rT  r2   r2   r6   is_tensorwise     
z0_check_scaled_mm_sizes_v2.<locals>.is_tensorwisec                 S   rU  r  )r   r'   RowWiserW  r2   r2   r6   
is_rowwise  rY  z-_check_scaled_mm_sizes_v2.<locals>.is_rowwisec                 S   rU  r  )r   r'   BlockWise1x32rW  r2   r2   r6   is_mx  rY  z(_check_scaled_mm_sizes_v2.<locals>.is_mxc                 S   rU  r  )r   r'   BlockWise1x16rW  r2   r2   r6   is_nv_single_level  s   
z5_check_scaled_mm_sizes_v2.<locals>.is_nv_single_levelc                 S   sP   t | dko't |dko'| d tjko'| d tjko'|d tjko'|d tjkS )Nr   r   r1   )r   r'   r^  rV  rW  r2   r2   r6   is_nv  s   
z(_check_scaled_mm_sizes_v2.<locals>.is_nvc                 S   rU  r  )r   r'   BlockWise1x128rW  r2   r2   r6   is_1x128_1x128  rY  z1_check_scaled_mm_sizes_v2.<locals>.is_1x128_1x128c                 S   4   t | dkot |dko| d tjko|d tjkS r  )r   r'   ra  BlockWise128x128rW  r2   r2   r6   is_1x128_128x128  rY  z3_check_scaled_mm_sizes_v2.<locals>.is_1x128_128x128c                 S   rc  r  )r   r'   rd  ra  rW  r2   r2   r6   is_128x128_1x128  rY  z3_check_scaled_mm_sizes_v2.<locals>.is_128x128_1x128c                   S   rk   )Nz\For Tensorwise scaling, both scale_a and scale_b must be single element float (fp32) tensorsr2   r2   r2   r2   r6   re     rm   c                	      s:   dj d  dd   d j d  dd   d	S )Nz'For Rowwise scaling, scale_a must have r   z elements (got: z), and scale_b must have r1   ru   )r   r   r2   )r_  r  r  r   r2   r6   re     s
   
r  c                      sR   d d d  dj  d dj d d d  dj  d dj dS )Nz>For 1x128 x 1x128 blockwise scaling, scale a must have shape [rt   r  ] (got: ) and stride [1, )scale b must have shape [ru   r   r   r2   )r~  rx  r  sasbr2   r6   re     *   r  c                      sR   d dd  dj  d dj d d d  dj  d dj dS )Nz]For 128x128 x 1x128 blockwise scaling, L4 = {round_up(K / 128, 4)}, scale a must have shape [rt   r  rg  rh  ri  ru   rj  r2   r~  L4rx  r  rk  rl  r2   r6   re   7  rm  c                      sR   d d d  dj  d dj d dd  dj  d dj dS )Nz]For 1x128 x 128x128 blockwise scaling, L4 = {round_up(K / 128, 4)}, scale a must have shape [rt   r  rg  rh  ri  ru   rj  r2   rn  r2   r6   re   V  rm  rN  c                      sh   d  dd    d dd    dtj dd j dd j d d	d  dd  d
S )Nz!for MX scaling scale_a must have  (got: r   ) and scale_b must have z). Scales must have types z (for self: 	, mat_b: z) Must have swizzle type  (got self: ru   )r   rV   r@  r^   r2   expected_scale_a_elemsexpected_scale_b_elemsexpected_swizzler  r  rK  rL  r2   r6   re   q  s"   
c                      H   d  dd    d dd    d dd  dd  dS )	Nz.for single-level NV scaling scale_a must have rp  r   rq  ). Must have swizzle type rs  rr  ru   r  r2   rt  r2   r6   re        
c                      rx  )	Nz!for NV scaling scale_a must have rp  r   rq  ry  rs  rr  ru   r  r2   rt  r2   r6   re     rz  Fc                      s   d d d dt  d d	dt  d dt d d dt  d d	 dt  d d d	d
 d
 dd
 d dd
 d
 dd
 d d S )Nr6  r7  r8  rt   r  r9  r:  r;  r<  r   r1   r=  ru   r>  r2   )r~  rx  r  r  r  r2   r6   re     s2   r   )rV   rh   r   r^   r   r(   
NO_SWIZZLEr   r   r   r   r'   r   rr  r=   r5  r6  r7   SWIZZLE_32_4_4r@  ru  r   r~   )r   r_  r  rI  r  rJ  rQ  rS  rK  rL  r  r  rM  K_packed_multiplierr$  r'  r+  rX  r[  r]  r_  r`  rb  re  rf  types_ok
scale_a_ok
scale_b_okrF  r2   )r~  ro  rx  r  ru  rv  rw  r_  rk  rl  r  r  r   rK  rL  r6   _check_scaled_mm_sizes_v2F  s  	





"




 ,, ,, ,,






	r  rm  contraction_dimsc                 C   s   t | |||||||	|||dS )N)rQ  rS  rK  rL  r  )r  )r   r_  r  rI  rK  r  rJ  rL  rQ  rm  r  r  r2   r2   r6   meta_scaled_mm_v2  s   r  c                 C   s    t | ||||dd | | jS NT)r  r  r   r   r   rB  r}  r{  r2   r2   r6   meta_scatter_reduce_two  s   r  c                 C   s   t | ||||dd | S r  r  r  r2   r2   r6   meta_scatter_reduce__two  s   r  c                   sh   t d    k odkn   fdd   dkr&t j|t j jdS t j d|t j jdS )Nr   r   c                      r  )NzAThe probability distributions dimensions must be 1 or 2, but got r   r2   r  r2   r6   re     r  z"meta_multinomial.<locals>.<lambda>r1   r   )rV   rh   r   r   r   r~   r   )r   num_samplesreplacementr   r2   r  r6   meta_multinomial  s   
r  c                 C   s   d}| D ]}||9 }q|S r0   r2   )vsr=  vr2   r2   r6   multiply_integers  s   
r  c                    s   t tkfdd d  t t k fdd t tdd dd  D o9tdd D fdd d d \}}||gR S )Nc                         d  dt  S )Nz%It is expected output_size equals to , but got size r  r2   )num_spatial_dimsr  r2   r6   re     r$  z'upsample_common_check.<locals>.<lambda>r   c                      r  )Nz$It is expected input_size equals to r  r  r2   )expected_input_dimsr  r2   r6   re     r$  c                 s       | ]}|d kV  qdS r   Nr2   )rL   r  r2   r2   r6   rq     rb  z(upsample_common_check.<locals>.<genexpr>c                      rr   )NzDInput and output sizes should be greater than 0, but got input size z and output size r2   r2   )r  r  r2   r6   re     s
    )rV   rh   r   r(  )r  r  r  r  channelsr2   )r  r  r  r  r6   upsample_common_check  s   

*r  c                    sZ   t   dkpt  dd   fdd t  |dd} |jt	 dS )Nr   r1   c                      r  )Nz>Non-empty 3D data tensor expected but got a tensor with sizes r   r2   r  r2   r6   re     r  z$upsample_nearest1d.<locals>.<lambda>r  r   
rV   rh   r   r  r   r  r   r?  rQ   r   )r   r  scalesfull_output_sizer2   r  r6   upsample_nearest1d     


r  c           	         s   t   dkpt  dd   fdd t  |dd} |}t } j	\}}}} j
jdkr?|dk r?t j}|j|d	}|S )
Nr   r1   c                      r  Nz>Non-empty 4D data tensor expected but got a tensor with sizes r   r2   r  r2   r6   re     r  z$upsample_nearest2d.<locals>.<lambda>r   r  r   r  r   )rV   rh   r   r  r   r  r   rQ   r   r   r~   rv   r   
contiguous)	r   r  scales_hscales_wr  r   r   rT   
n_channelsr2   r  r6   upsample_nearest2d  s   



r  r  r  r  r  c                    st   t ||dd tjdkfdd tdD ]t  k fdd q|jt	dS )Nr   r  r  c                      r   NzFExpected grad_output to be a tensor of dimension 4 but got: dimension r   r2   r  r2   r6   re   C  r   z-upsample_nearest2d_backward.<locals>.<lambda>c                
      &   d d   d d  S )NzCExpected grad_output to have the same shape as output; output.size() = z but got grad_output.size(r   r2   r  r  r   r2   r6   re   H  s   r   )
r  rV   rh   r   r   r   r   r?  rQ   r   )r  r  r  r  r  r2   r  r6   upsample_nearest2d_backward1  s   

	r  c                    sZ   t   dkpt  dd   fdd t  |dd} |jt	 dS )Nr   r1   c                      r  )Nz>Non-empty 5D data tensor expected but got a tensor with sizes r   r2   r  r2   r6   re   Z  r  z$upsample_nearest3d.<locals>.<lambda>r/   r  r   r  )r   r  scales_dr  r  r  r2   r  r6   upsample_nearest3dT  r  r  c           
      C   s   t | t j| t jd}}|d ur_|d ur_t|ts$tdt| t|ts2tdt| |j}| }	t	||}t	||}|
||	 |
||	 t||d t||d ||fS ||fS )Nr   zvalues must be TensorLike, got z indices must be TensorLike, got )rc  rd  )rV   r   r   rn   r   r   rv   r   r   r    r   r"   )
r   stabler   
descendingr   r   r  r   r   
out_strider2   r2   r6   	meta_sortd  s    	



r  c                    s  t jdkfdd t jjkfdd dd urPt jdkfdd t  kfdd t jjkfdd t jdkfd	d d
   t   k fdd t tfddfD dd  d S )Nr   c                          j  dS Nz != 2r   r2   input_gatesr2   r6   re     r   z%rnn_cell_checkSizes.<locals>.<lambda>c                         j  d j  S Nr  r  r2   )hidden_gatesr  r2   r6   re     r  r1   c                      r  )Nz != 1r   r2   )
input_biasr2   r6   re     r   c                      s      d  S r  r  r2   )
gates_sizer  r2   r6   re     r  c                      r  r  r  r2   )hidden_biasr  r2   r6   re     r  c                      r  r  r   r2   )prev_hiddenr2   r6   re     r   r   c                
      s,      dd d d d  d
S )Nr  r   z * z // z (aka ru   )r   r   r2   )expected_prev_hidden_numelfactorr  r  r  r2   r6   re     s   , c                 3   s    | ]	}|j  j kV  qd S r?   r  rK   r  r2   r6   rq     s
    

z&rnn_cell_checkSizes.<locals>.<genexpr>c                   S   rk   )Nz%expected all inputs to be same devicer2   r2   r2   r2   r6   re     rm   )rV   rh   r   r   r   r   r(  )r  r  r  r  r  r  r2   )r  r  r  r  r  r  r  r  r6   rnn_cell_checkSizes  s8   





r  c                 C   sL   t | |||d| tj| tjd}tj|tjd}tj|tjd}|||fS )Nr  r   )r  rV   r   r   )r  r  cxr  r  	workspacehycyr2   r2   r6   _thnn_fused_lstm_cell_meta  s
   
r  c                 C   s(  t |dk}|rt |}|d }| jd }n|
r| jd n| jd }|
r)| jd n| jd }d}|r4dnd}|dkr<|n|}|rG||| g}n|
rP|||| gn|||| g}| |}|	| ||g}|d u rptjd| jd}n||}||	| ||g}|rdnd}| j|tjd}|||||fS )Nr   r1   r   r   r  r   )r   r   r   rV   r   r~   re  )r   rO  weight_stride0
weight_bufhxr  r:  hidden_size	proj_size
num_layersbatch_firstdropouttrainbidirectionalr  dropout_stateis_input_packed
seq_length
mini_batchbatch_sizes_sumnum_directionsout_sizer   r   
cell_shaper  r  reserve_shapereserver2   r2   r6   
_cudnn_rnn  s2   

r  c                 C   sX   d}|D ]}|  dkr||  7 }q| |f}t| |||||||d|||	|
|||S r  )r   r   r  )r   rO  r  r  r  r:  r  r  r  r  r  r  r  r  total_weight_elemsrh  r  r2   r2   r6   
miopen_rnn  s0   r  c                 C   s   |r| j d n| j d }|r| j d n| j d }|
}|r!|||gn|||g}| |}|d u r8tjd| jd}n||j }|d u rKtjd| jd}n||j }tjd| jtjd}||||fS )Nr1   r   r  r   )r   r   rV   r   r~   re  )r   w0w1w2w3hx_cx_r   r  r:  r  r  
has_biasesr  r  r  r  r  output_chanelsr   r   r  r  r  r2   r2   r6   mkldnn_rnn_layer  s    
r  c                    sT   | j dkrt dkp dk fdd d S t|  dk fdd d S )Nr   r   c                      r  )Nz4: Expected reduction dim -1 or 0 for scalar but got r2   r2   r   r  r2   r6   re   H  r  z'zero_numel_check_dims.<locals>.<lambda>c                      r  )Nz: Expected reduction dim z to have non-zero size.r2   r2   r  r2   r6   re   M  rf   )r   rV   r   r   )r   r   r  r2   r  r6   zero_numel_check_dimsD  s   
r  c                    sF   |d urt || }t||  d S t| dk fdd d S )Nr   c                      r  )Nz@: Expected reduction dim to be specified for input.numel() == 0.r2   r2   r  r2   r6   re   Y  r  z%check_argmax_argmin.<locals>.<lambda>)r   r   r  rV   rh   r   )r  r   r   r2   r  r6   check_argmax_argminR  s   

r  c                 C   sD   t d| | t| j|d ur|fnd }t| ||}| j|tjdS )Nargmaxr   )r  rQ   r  r   r  r   rV   r   )r   r   r  r!  r   r2   r2   r6   argmax_argmin_meta]  s   r  c                 C   s$   |t jkrt j}t jd||||dS )Nr2   r  )rV   jaggedrz  r   )r  r^   r}   r~   r   r2   r2   r6   scalar_tensore  s
   

r  c                 C   s   t ||  dd}|  dkrdn| |}t|dk t||kdd  t| j}t|dkr6|||< | || j|tj	dfS )NTrq  r   r1   c                   S   rk   )Nzk not in range for dimensionr2   r2   r2   r2   r6   re   w  rm   ztopk_meta.<locals>.<lambda>r   )
r   r   r   rV   rh   r   r   r   r   r   )r   r-  r   largestr   	sliceSizetopKSizer2   r2   r6   	topk_metaq  s   
r  c           
      C   s@   |d u r|d u rt d| }|  }	tj||	j|	j|	jdS )Nz;segment_reduce(): Either lengths or offsets must be defined)r^   r~   r}   )r   r  rV   r   r^   r~   r}   )
r  r   r  r}  r  r  r  r  data_contiggrad_contigr2   r2   r6   meta__segment_reduce_backward  s   r  c                    s   ddl m} t |  dd |  dkr|  nd}t||dk||k fdd t| jd   | j d d   }|rM|  dkrM|	 d | 
|| j
|tjdfS )	Nr   )r  Tr  r1   c                      r  )Nz9kthvalue(): selected number k out of range for dimension r2   r2   r   r2   r6   re     r  zkthvalue_meta.<locals>.<lambda>r   )r<  r  r   r   r   rV   rh   r   r   rF  r   r   )r   r-  r   r  r  dimSizer   r2   r   r6   kthvalue_meta  s   
$r  c                 C   s   | d ur| n|}t | dkdd  | }| d ur(t |  |kdd  |d ur8t | |kdd  t | |kdd  t | |kdd  t | dkdd  t | |d	 |d
  d kdd  d S )Nr   c                   S   rk   N r2   r2   r2   r2   r6   re     rm   z(checkLSTMBackwardSizes.<locals>.<lambda>c                   S   rk   r  r2   r2   r2   r2   r6   re     rm   c                   S   rk   r  r2   r2   r2   r2   r6   re     rm   c                   S   rk   r  r2   r2   r2   r2   r6   re     rm   c                   S   rk   r  r2   r2   r2   r2   r6   re     rm   c                   S   rk   r  r2   r2   r2   r2   r6   re     rm   r   r1   r  c                   S   rk   r  r2   r2   r2   r2   r6   re     rm   )rV   rh   r   r   r   )grad_hygrad_cyr  r  r  defined_gradexp_sizer2   r2   r6   checkLSTMBackwardSizes  s   ,r  c           	      C   s`   | d u r
|d u r
dS t | |||| tj|td}tj|td}|r)|jdddnd }|||fS )NNNNr   r   F)r  )r  rV   r   legacy_contiguous_memory_formatrp  )	r  r  r  r  r  has_bias
grad_gatesgrad_cxr  r2   r2   r6   #_thnn_fused_lstm_cell_backward_impl  s   
r  c                 C   sf   d }d }d }|d r| |  }|d s|d r.| |d| df}| |d}|||fS )Nr   r1   r   r   r/  )r!  r   r"  r%  r  grad_weightr  r2   r2   r6   linear_backward  s   
r  c                    s   t jdkrjd ||  dkstdj d| dd   fdd	}jd ||  }jd
 | }jd | }g jd d |||R }|}|j| d}|S )Nr   r  r   z'Invalid input shape for pixel_shuffle: z with upscale_factor = c                 S   r<  r?   r=  r@  r2   r2   r6   rB    rC  z,meta_pixel_shuffle.<locals>.is_channels_lastc                      sL    rt dkrtjS tjS jtjdrtjS jtjdr$tjS d S )Nr   r   )r   rV   r   r?  rv  preserve_formatr2   rB  r   r2   r6   rJ    s   z.meta_pixel_shuffle.<locals>.pick_memory_formatr  r   r   )r   r   r   r   r?  )r   upscale_factorrJ  rR  HrWrr   r   r2   r  r6   meta_pixel_shuffle  s   $ 
r  c                 C   sZ   |  | j}| |j}| |j}| |j}| |j}| |j}|||||||fS r?   r^  )r   weight0weight1weight2weight3r  cx_tmpr   hy_cy_grad_output_r_optgrad_hy_r_optgrad_cy_r_optr   r:  r  r  r  r  r  r  r  r  diff_xdiff_hxdiff_cxdiff_w1diff_w2diff_br2   r2   r6   mkldnn_rnn_layer_backward  s   r  )	out_int32rightc                C   s   t j| |rt jnt jt jdS )Nr^   r   )rV   r   rX  r   r   r   
boundariesr  r  r2   r2   r6   meta_bucketize  s
   r  r  r  r  c                C   s   |j d|r
tjdS tjdS )Nr2   r   )r   rV   rX  r   r  r2   r2   r6   meta_bucketize_scalar   s   r  d   c                    s   dt dkrt fdd t dkr# r#td tt t fdd t dk fd	d tttfd
d tttfdd tkfdd tj	 j
jdS )Nzhistc()r   c                      r  )Nz%"histogram_cpu" not implemented for 'r  r   r2   r  r2   r6   re   5  r  zmeta_histc.<locals>.<lambda>r   z%_histc_cuda with floating point inputc                      s    dt   S )Nz#: argument 'bins' must be int, not r6  r2   binsr  r2   r6   re   ;  r  r   c                      r  )Nz: bins must be > 0, but got r2   r2   r  r2   r6   re   =  r  c                           dt  S )Nz%: argument 'min' must be Number, not r6  r2   )r  r   r2   r6   re   @  r  c                      r!  )Nz%: argument 'max' must be Number, not r6  r2   )r  r  r2   r6   re   D  r  c                      r  )Nz: max must be larger than minr2   r2   )r  r2   r6   re   F  r  r   )r   rV   rh   r   rQ   r  rn   r   r   r   r~   r^   )r   r   r   r  r2   )r   r  r   r  r   r6   
meta_histc.  s.   

r"  c                    sd   t   |dd}t  dkptdd   dd  D  fdd  |jt	 d	S )
Nr   r  r   c                 s   r  r  r2   )rL   r   r2   r2   r6   rq   \  rb  z,meta_upsample_bimode2d_aa.<locals>.<genexpr>r1   c                      r  r  r   r2   r  r2   r6   re   ]  r  z+meta_upsample_bimode2d_aa.<locals>.<lambda>r   )
r  r   rV   rh   r   r(  r   r?  rQ   r   )r   r  rN  r  r  r  r2   r  r6   meta_upsample_bimode2d_aaJ  s   
(

r#  c                    st   t ||dd tjdkfdd tdD ]tj   k fdd q|jt	dS )Nr   r  r  c                      r   r  r   r2   r  r2   r6   re   w  r   z4meta_upsample_bimode2d_aa_backward.<locals>.<lambda>c                
      r  )NzD
Expected grad_output to have the same shape as output; output.size(r  z
but got grad_output_size(r   r2   r  r2   r6   re   |  s    r   )
r  rV   rh   r   r   r   r   r?  rQ   r   )r  r  r  rN  r  r  r2   r  r6   "meta_upsample_bimode2d_aa_backwardd  s   

r$  c                 C   s\   t | dkdd  t | dkdd  t |jjdd  t |jjdd  d S )Nr1   c                   S   rk   )Nz%found_inf must be a 1-element tensor.r2   r2   r2   r2   r6   re     rm   z<_amp_foreach_non_finite_check_and_unscale_.<locals>.<lambda>c                   S   rk   )Nz%inv_scale must be a 1-element tensor.r2   r2   r2   r2   r6   re     rm   c                   S   rk   )Nz!found_inf must be a float tensor.r2   r2   r2   r2   r6   re     rm   c                   S   rk   )Nz!inv_scale must be a float tensor.r2   r2   r2   r2   r6   re     rm   )rV   rh   r   r^   r   )r   r3  	inv_scaler2   r2   r6   *_amp_foreach_non_finite_check_and_unscale_  s   r&  c                 C   r
  r?   r  )r   nanposinfneginfr2   r2   r6   
nan_to_num  r  r*  c                 C   s   | j tjtjtjtjhv rtd| j  d| j}t||}t||}||kr)| S t	| 
 }t	|  }|| || ||< ||< || || ||< ||< | || | S )Nz>torch.transpose_: in-place transposition is not supported for z layout)r}   rV   r{  
sparse_cscr|  
sparse_bscr   r   r   r   r   r   r   )r   dim0r  ndimsr   r   r2   r2   r6   rs    s(   

rs  c                 C   sz   | j }| jr"|  }|  }|dkr|dks!td| d| dn|  dkr0td| dt| d|dk r:dS dS )	Nr   r   zEt_ expects a tensor with <= 2 sparse and 0 dense dimensions, but got z sparse and z dense dimensionsz6t_ expects a tensor with <= 2 dimensions, but self is ra  r1   )r   r[  r]  r^  r   r   rs  )r   r.  r]  r^  r2   r2   r6   t_  s$   
r/  )r  r  sidesorterc                   s   t tjdkpjd d  jd d k fdd t d u p)jjkfdd t |dkp9| dd  |rCt jnt j}t t jrUt j |t j	dS t j
d	|jd
S )Nr1   r   c                      s   dt j dt  j S )Nztorch.searchsorted(): boundaries tensor should be 1 dimension or the first N-1 dimensions of boundaries tensor and input value tensor must match, but we got boundaries tensor z and input value tensor r&  r2   )r   sorted_sequencer2   r6   re     s
   z#meta_searchsorted.<locals>.<lambda>c                      s,   dt  j dd urt j S g  S )Nz[torch.searchsorted(): boundary and sorter must have the same size, but got boundary tensor z and got sorter tensor r&  r2   )r2  r1  r2   r6   re     s   r   c                   S   rk   )Nzetorch.searchsorted(): side and right can't be set to opposites, got side of left while right was Truer2   r2   r2   r2   r6   re     rm   r  r2   r   )rV   rh   r   r   rX  r   rn   r
   r   r   r   r~   )r2  r   r  r  r0  r1  r^   r2   )r   r2  r1  r6   meta_searchsorted  s&   
r3  c                    s(   t  t jt jt jfv fdd d S )Nc                      r  )Nz/Unsupported input type encountered for isin(): r2   r2   r   r2   r6   re      r  z3_check_for_unsupported_isin_dtype.<locals>.<lambda>)rV   rh   r  
complex128	complex64r   r2   r   r6   !_check_for_unsupported_isin_dtype   s   
r6  c                 C   s   |  || df}|S )Nr   r/  )r  r   num_weightsr  r  r  r2   r2   r6   meta_embedding_dense_backward   s   r8  c                 C   s:   |	rt | ||||||||
|
S t| ||||||||
|
S r?   )r,   _embedding_bag_sparse_backward!meta_embedding_bag_dense_backward)r  r   r  r  r  maximum_indicesr7  r  r:  r  r  r  r2   r2   r6   meta_embedding_bag_backward   s2   r<  c
                    sX   t  jt jt jt jt jfv  fdd |tkr t |d u  | 	df}
|
S )Nc                      r   )Nz$Unsupported input type encountered: r   r2   r  r2   r6   re   Q   r   z3meta_embedding_bag_dense_backward.<locals>.<lambda>r1   )
rV   rh   r^   rs  rt  rr  float64r  r   r   )r  r   r  r  r;  r7  r  r:  r  r  index_grad_weightr2   r=  r6   r:  B   s   
r:  c           
      C   s   |  d}t|tkdd  t|  dk t| dk | d}t| dk t| d|k | |f}	|	S )Nr1   c                   S   rk   )NzHembedding_bag_backward: per_sample_weights only supported for mode='sum'r2   r2   r2   r2   r6   re   f   rm   z@meta_embedding_bag_per_sample_weights_backward.<locals>.<lambda>r   r   )r   rV   rh   r  r   r   )
r  rO  r   r  r  r:  r  embedding_featuresr  r   r2   r2   r6   .meta_embedding_bag_per_sample_weights_backwardY   s   


rA  )assume_uniqueinvertc                C   sx   t t| tpt|tdd  t| tst j| |jd} t|ts*t j|| jd}t| j t|j t j| t j	dS )Nc                   S   rk   )Nz<At least one of elements and test_elements must be a Tensor.r2   r2   r2   r2   r6   re   v   rm   zmeta_isin.<locals>.<lambda>r  r   )
rV   rh   rn   r
   rG  r~   r6  r^   r   r  )elementstest_elementsrB  rC  r2   r2   r6   	meta_isinq   s   



rF  r  c                 C   s4   t | dkdd  t|tjd\}}t j||dS )Nr   c                   S   rk   )Nz,polygamma(n, x) does not support negative n.r2   r2   r2   r2   r6   re      rm   z meta_polygamma.<locals>.<lambda>r  r   )rV   rh   r   r   r  r   )r  r   rT   rN   r2   r2   r6   meta_polygamma   s   
rG  c                 C   s   t d)Nz.Tensor.item() cannot be called on meta tensors)r>  r   r2   r2   r6   meta_local_scalar_dense   rl  rH  c                 C   r
  r?   r  r   r2   r2   r6   silu   r  rI  c                 C   s    t | tjd\}}tj| |dS r  )r   r   r  rV   r   )r   rT   rN   r2   r2   r6   sigmoid   s
   
rJ  c                 C   sp  |   dk}|  dk}|r:|r|d| d|dg}nVt|d|dkdd  | d|dg}n;|rWt|d| dkdd  | d|dg}nt| d|dkdd  | d| d|dg}|py| j}tjjrd	|j }|d | d | | }||kr|d | |dg}	n|dg}	tj||	|| j	d
}
|
S tj
||| j	d
}
|
S )Nr   r   r1   c                   S   rk   Nz matrix batch sizes have to matchr2   r2   r2   r2   r6   re      rm   z2_create_grouped_mm_output_tensor.<locals>.<lambda>r   c                   S   rk   rK  r2   r2   r2   r2   r6   re      rm   c                   S   rk   )Nzbatched dimension has to matchr2   r2   r2   r2   r6   re      rm   rM  r   )r   r   rV   rh   r^   r5  r   itemsizer  r~   r   )r]  r_  offsrS  
mat1_is_2d
mat2_is_2dr  	alignmentsize_paddedr  r   r2   r2   r6    _create_grouped_mm_output_tensor   s>   


rR  mat_amat_brM  c	                    s  t d u d u kdd  d uod u}	|	rDt j}
t jjr0t j r0dt jdjv r0t j	}
t j
|
ko;j
|
kfdd nt j
t jkoQj
t jkfdd t  dv of dv fdd  d	k} d	k}|r~|st d
dkdd  |	rdd }dd }t |fdd t |fdd dd }|d |d d ur(d ur(t j
t jkrֈj
t jkpj
t jkoj
t jkfdd j
t jkoj
t jk d$ fdd	}d ur|r|rjd nd}|dd| |dd| t |d u dd  |s.|r]t d ufdd d ur\t  dkfdd t j
t jkfd d n
t d u d!d  t |d u d"d  t |d u p||t jkd#d  t|S )%Nc                   S   rk   )Nz,Either both scale factors are given, or noner2   r2   r2   r2   r6   re      rm   z)_meta_grouped_mm_common.<locals>.<lambda>gfx94r   c                      r`  )Nz5Expected inputs of E4M3 FP8 type but got mat_a.dtype= and mat_b.dtype=r0  r   r2   rS  rT  r2   r6   re      rP   c                      r`  )Nz1Expected inputs of BF16 type but got mat_a.dtype=rV  r0  r   r2   rW  r2   r6   re      rP   )r   r/   c                      s   d    d   S )Nz3Multiplicands must be 2D or 3D but got mat_a.dim()=z and mat_b.dim()=r   r2   rW  r2   r6   re      r   r   r   r  c                   S   rk   )Nz3contraction dimension of mat_a and mat_b must matchr2   r2   r2   r2   r6   re      rm   c                 S   s    |   }|d dko|d dkS Nr  r1   r   r  mat
mat_strider2   r2   r6   r$  !     z-_meta_grouped_mm_common.<locals>.is_row_majorc                 S   s    |   }|d dko|d dkS rX  r  rY  r2   r2   r6   r'  !  r\  z-_meta_grouped_mm_common.<locals>.is_col_majorc                         d   dd   S )NzNExpected mat_a tensor to be row major in the last two dimensions, got strides r  r  r2   )rS  r2   r6   re   !  rP   c                      r]  )NzQExpected mat_b tensor to be column major in the last two dimensions, got strides r  r  r2   )rT  r2   r6   re   !  rP   c                    s     d  d  }  d  dkr:  tdj d  kr:t  | dk fdd d S   dkrd d  tdj  krdt d  | dk fdd d S tdfdd d S )	Nr1   rM  r   c                      s   d d  d   dS )Nr   stride along % dim to be multiple of 16 bytes, got r0  r2   r2   end_dimmat_namer[  r2   r6   re   !  r  zF_meta_grouped_mm_common.<locals>.check_valid_strides.<locals>.<lambda>c                      s$   d d d  d d   dS )Nr  r^  r1   r_  r0  r2   r2   r`  r2   r6   re   %!     $ Fc                      s   d d j  dS )NzInvalid strides/sizes, got z for strides and z for sizes.r  r2   rY  r2   r6   re   *!  r$  )r   element_sizer   r  r   rV   rh   )rb  rZ  rP  r2   )ra  rZ  rb  r[  r6   check_valid_strides!  s*   
z4_meta_grouped_mm_common.<locals>.check_valid_stridesrS  rT  c                      r`  )NzhFor FP8 scales must both be float32, or for MXFP8 both scales must be float8_e8m0fnu. Got scale_a.dtype=z and scale_b.dtype=r0  r   r2   r5  r2   r6   re   7!  rP   r1   c                    s    dkrQt fdd r(t    kfdd d S t  dkfdd tjd j  kfdd d S td	dkfd
d tjd jd kfdd rtjjd kfdd j\ }}d}t|| dt|dtjd  kojd  k fdd d S t  dkfdd tjd jd  kfdd d S )Nr   c                      r>  )Nr  z to be contiguous.r2   r2   
scale_namer2   r6   re   B!  r   z>_meta_grouped_mm_common.<locals>.check_scale.<locals>.<lambda>c                         d d j  dj  S )NzKFor MXFP8, scale must have same number of dimensions as target tensor, but  has mat.ndim= and scale.ndim=r   r2   rZ  r  rg  r2   r6   re   K!      r1   c                         d d    dS )Nr  z to be 1D tensor, but got 	D tensor.r   r2   r  rg  r2   r6   re   P!  rP   r   c                      s(   d d j    dj d  dS )Nr  z	 to have r2  r   z
 elements.r  r2   )rZ  r  scale_multiplierrg  
scaled_dimr2   r6   re   T!     ( r   c                      r>  )Nr  z( to be contiguous in the last dimension.r2   r2   rf  r2   r6   re   Y!  r   c                      s$   d d j d  dj d  dS )Nr  z batch dimension to be r   , got r0  r  r2   rk  r2   r6   re   ]!  rc  c                      rh  )Nz0For MXFP8, 3d tensor should have 2d scales, but ri  rj  r   r2   rk  r2   r6   re   d!  rl  rN  r  r  c                      s$   dj  d  d  dj  S )NzFor MXFP8, expected mat.shape=z to have scale shape of (,z), but got r  r2   )G	blocked_K	blocked_NrZ  r  r2   r6   re   n!  rc  c                      rm  )Nr  z to be 2D tensor, but got rn  r   r2   ro  r2   r6   re   s!  rP   c                      s(   d d j d   dj d  dS )Nr  z non-batch dimension to be r1   rs  r0  r  r2   )rZ  r  rg  rq  r2   r6   re   w!  rr  )r   rV   rh   rv  r   r   r   r=   )rg  r  rZ  rq  rp  r~  r  r  )is_mxfp8)ru  rv  rw  rZ  r  rp  rg  rq  r6   check_scale>!  s^   




z,_meta_grouped_mm_common.<locals>.check_scaler  r  c                   S   rk   )Nz:Scale result tensor provided, but it is not supported yet.r2   r2   r2   r2   r6   re   !  rm   c                      s   d    d   dS )Nz/Offsets tensor not provided, but is needed for zD/zD multiplicand layouts.r   r2   rW  r2   r6   re   !  rl  c                      r  )Nz.Offsets tensor must be 1D, but got offs.dim()=r0  r   r2   rM  r2   r6   re   !  rf   c                      r  )Nz7Offsets tensor must be integer (int32) tensor, but got r0  r   r2   rz  r2   r6   re   !  r  c                   S   rk   )NzJOffsets tensor provided, but is not needed for 3D/3D multiplicand layouts.r2   r2   r2   r2   r6   re   !  rm   c                   S   rk   )Nz2Bias tensor provided, but it is not supported yet.r2   r2   r2   r2   r6   re   !  rm   c                   S   rk   )Nz4If output dtype provided, it must be torch.bfloat16.r2   r2   r2   r2   r6   re   !  rm   r  )rV   rh   ru  r5  r6  r   r   get_device_propertiesgcnArchNamer  r^   rt  r   r   rr  r@  r   rX  rR  )rS  rT  r  r  rM  rQ  r  rS  r  scaled	fp8_dtypemat_a_is_2dmat_b_is_2dr$  r'  re  ry  rp  r2   )rx  rS  rT  rM  r  r  r6   _meta_grouped_mm_common   s   




	
"=




r  c              
   C   s   t | |d d ||d |dS )N)r  r  rM  rQ  r  rS  )r  )rS  rT  rM  rQ  rS  r2   r2   r6   meta_grouped_mm!  s   	r  c	           	      C   s$   |pt j}t| ||||||||d	S )N)r  r  rM  rQ  r  rS  r  )rV   rt  r  )	rS  rT  r  r  rM  rQ  r  rS  r  r2   r2   r6   meta_scaled_grouped_mm!  s   
r  c                 C   sx   t |t dkr| D ]}t| dkdd  q
g }| D ]}|d ur%|n|j}|jr/t|}||jd|d q|S )Ninfr   c                   S   rk   )Nz:_foreach_norm cannot compute infinity norm on empty tensorr2   r2   r2   r2   r6   re   !  rm   z#meta_foreach_norm.<locals>.<lambda>r2   r   )	rZ   rV   rh   r   r^   r   r   r3  r   )tensorsordr^   r  resultsrS  r2   r2   r6   meta_foreach_norm!  s   
r  r;   half_to_floatc                 C   s`   |r| j tjtjfvrtd| j  dtj| tjjd\}}|s#|n|}tj	| |tj
d}|S )Nz%half_to_float is True but x.dtype is z, expected half or bfloat16r  r  )r^   rV   rX   rt  r   rQ   r   r   rR   r   r   )r;   r   r  computation_dtyperN   rn  r2   r2   r6   softmax!  s   
r  c              	      s  t td dkfdd | jttd }| t |kfdd tdd D r|| }tD ]9 d  d   dk r_|   |j    }d  dk rw| d|j  d   }q>| S td  }t|D ]1 t d d       d   }t |dk fd	d |	| qt j
|| j| j| jt| d
S )Nr   r   c                      r4  )Nz1Length of pad must be even but instead it equals r  r2   r  r2   r6   re    "  r  z'_constant_pad_nd_meta.<locals>.<lambda>c                      s   dt  d  dS )Nz`Length of pad should be no more than twice the number of dimensions of the input. Pad length is z while the input has z dimensions.r  r2   )l_inpr  r2   r6   re   
"  s
    c                 s   s$    | ]}t |tjo|d kV  qdS r  )rn   rQ   IntWithoutSymInt)rL   r  r2   r2   r6   rq   "  r%  z(_constant_pad_nd_meta.<locals>.<genexpr>r1   c                	      s6   d    d  dd   d   d	S )NzThe input size z, plus negative padding r   r1   zG resulted in a negative output size, which is invalid. Check dimension z of your input.r2   r2   )r   r   l_diffr  pad_idxr2   r6   re   #"  s    
)r^   r~   r   r   )rV   rh   r   r   r(  r   narrowr   r   r3  r   r^   r~   r   r   )r   r  r  l_padc_input	new_shapenew_dimr2   )r   r   r  r  r  r  r6   _constant_pad_nd_meta!  sP   
  r  r  r  r  c           	      C   s   |   dkrtd|    d| j}|j}|jdkr!|d f}n|jdkr/|d |d f}n	g ||d R }| j}| j||dS )Nr   z'weight' must be 2-D, got z-Dr   r1   r   )r   r   r   r   r^   r   )	rO  r   r  r  r  weight_shapeindices_shaper   rS  r2   r2   r6   	embedding2"  s   	

r  max_lengthspadding_valuec                 C   s|   t |dkrtdt | dt |dkr tdt | d|d jd d }|d }||g| jdd  R }| |S )Nr1   z&Only one jagged dim is supported, got z offsetsz max_lengthsr   )r   r   r   r   )r   r  r  r  r  rR  ry  r2   r2   r6   $meta__jagged_to_padded_dense_forwardK"  s   
r  c                 C      t | t dd }|S )Nc                 S   r  r  rU   r   r  r   r2   r2   r6   _fc"  s   z)_create_unary_float_meta_func.<locals>._frH   r#   funcr  r2   r2   r6   _create_unary_float_meta_funcb"     r  c                 C   s   | j s	|j s	|j rtd|  dkr| | j| dfS | d}| d}|d}| |||}|
rO|rC| |||}||fS | ||||}||fS | d}||fS )NzP_native_multi_head_attention fake implementation does not support nested tensorsr   r1   )	is_nestedr  r   r   r   r   )r  r   r  	embed_dimr  
qkv_weightqkv_biasproj_weight	proj_biasr  need_weightsaverage_attn_weights	mask_typer  T
output_dimr   attn_weightsr2   r2   r6    native_multi_head_attention_faken"  s$   



r  c                 C   r  )Nc                 S   r  r  r  r:   r2   r2   r6   r  "  r  z*_create_binary_float_meta_func.<locals>._fr  r  r2   r2   r6   _create_binary_float_meta_func"  r  r  c                    s<   t   fdd} j d}||_ttt||}|S )Nc                    s(    | g|R i |}t | j|j | S r?   r  )r   rS   r0  r   rB   r2   r6   _fn"  s   z#_register_inplace_meta.<locals>._fnrT   )r   rw   rH   getattrr,   )rC   r  inplace_namer2   rB   r6   _register_inplace_meta"  s   r  c                    sr   t j jk fdd  g}ttr1jdkr,t jjkfdd | t|dtj	iS )Nc                      r  )Nr  z for `end`, but got dtype r   r2   )rx   ry   r2   r6   re   "  r$  zlerp.<locals>.<lambda>r   c                      r  )Nr  z for `weight`, but got dtype r   r2   )ry   rO  r2   r6   re   "  r$  rI   )
rV   rh   r^   rn   r   r   r3  rU   r   rR   )ry   rx   rO  rS   r2   )rx   ry   rO  r6   lerp"  s"   




r  )r  c                C   s   t | ||tjdS r  r  r   tensor1tensor2r  r2   r2   r6   addcmul"  s   
r  c                C   s8   t t|jot|j dd  t| ||tjdS )Nc                   S   rk   )N)zFInteger division with addcdiv is no longer supported, and in a future zErelease addcdiv will perform a true division of tensor1 and tensor2. z4The historic addcdiv behavior can be implemented as zA(input + value * torch.trunc(tensor1 / tensor2)).to(input.dtype) zfor integer inputs and as z6(input + value * tensor1 / tensor2) for float inputs. z?The future addcdiv behavior is just the latter implementation: z4(input + value * tensor1 / tensor2), for all dtypes.r2   r2   r2   r2   r6   re   "  rm   zaddcdiv.<locals>.<lambda>r  )rV   rh   rQ   r%  r^   rU   r   rR   r  r2   r2   r6   addcdiv"  s   

r  c                  C   sB  i } dD ]}t | }|D ]}|| vr|| | |< qq|  D ]\}}t|tjjr*qt|ts8tdt| |	tj
jj| tj
| drY|t d v rXt| dq|jr]q| dv rdqd| v rqt|| qd| v r~t|| qd	| v rt|| qd
| v rt|| qt|| qd S )N)r{   post_autogradpre_autogradz$op_overload must be OpOverload, got CompositeImplicitAutogradr{   z is a CompositeImplicitAutograd op, we shouldn't register meta function for it. Instead, we should let the decomposition run and write meta kernels for the base operators.>   aten::cloneaten::copy_aten::rot90aten::_to_copyaten::empty_stridedaten::constant_pad_ndaten::as_strided_scatterzmkldnn::zmkl::zonednn::zquantized::)r   itemsrn   rV   _opsHigherOrderOperatorr   r   rv   py_impl_CDispatchKeyr.   %_dispatch_has_kernel_for_dispatch_keyr  r>  is_view2_meta_lib_dont_use_me_use_register_meta_for_mkldnnimpl/_meta_lib_dont_use_me_use_register_meta_for_mkl2_meta_lib_dont_use_me_use_register_meta_for_onednn5_meta_lib_dont_use_me_use_register_meta_for_quantized'_meta_lib_dont_use_me_use_register_meta)activate_meta_tabletypregistryopoop_overloadrC   r2   r2   r6   activate_meta#  sT   
r  r   )r+  r,  r  r?   )NNNFr   r1   r   r  )Tr  )r  )r  T)FF)TT)r;  )FTN)TFF)TF)r   )r  N)r>   r#  )r2   r   r  F)r2   r   FTN)Fr   FNFr   )NF)r   F)r  r  FN)NNNNN)r   NNr1   )NNF)r+  FFN)Nr+  FFN)r+  FNN)Nr+  FNNF)r+  FN)FN)NNNNNNN)FNNNN)NNNF)NNNNF)Nr   FNN)NNNN)r   TT)NNr   N)r  r   r   )r   )r   N)r   FF)r+  )NTTN(  r  collections.abcr   r   enumr   	functoolsr   typingr   typing_extensionsr   rV   torch._prims_commonr>  rQ   r   r	   r
   torch._decompr   r   r   r   
torch._opsr   torch._primsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   torch._prims_common.wrappersr   r    r!   r"   r#   r  r$   r%   torch.fx.experimentalr&   r  torch.nn.functionalr'   r(   torch.utilsr)   rE   r*   r+   opsr,   libraryLibraryr  r   r  r  r  r7   r=   rH   rU   r`   rj   linspacelogspacerz  r   taker`  r   r   r   r   cummaxcumminr   r   r   r   r   r  r   _fft_c2cr   r   r   _fft_r2cr   randpermgenerator_outr  r   r  randintr  r  low_outr  randr  _philox_key_splitr  _philox_key_fold_inr  r*  r.  r2  r4  r5  _fft_c2rr9  r  rE  rK  
unsqueeze_rN  _sparse_semi_structured_linearr#  r^   r\  _sparse_semi_structured_mmrh  _sparse_semi_structured_addmmrk  _cslt_sparse_mmrz  index_reducer  index_reduce_r  index_selectr  segment_reducer  r  	unary_outr  r   r  r   r  r  r  r  r  _assert_asyncr  msgr  _printr  _make_dep_tokenr  r  _functional_sym_constrain_ranger  r  (_functional_sym_constrain_range_for_sizer  _functional_assert_asyncr  r   r  r   r  r  r  r  _linalg_eighr  r  _linalg_eigvalslinalg_eigvalsr  
linalg_eigr  r  r  r  r  r  r  r  linalg_inv_exr  linalg_ldl_factor_exrg   r   linalg_ldl_solver)  	linalg_lur.  linalg_lu_factor_exr0  linalg_lu_solver4  	lu_unpackr9  rB  	linalg_qrrH  rK  rI  _linalg_svdrS  r&  r  r_  rp  linalg_solve_triangularrv  ry  r  _linalg_detr  r  r  r  reflection_pad1dr  replication_pad1dr  r  reflection_pad1d_backwardr  replication_pad1d_backwardr  r  reflection_pad2dr  replication_pad2dr  _weight_norm_interface_backwardr  reflection_pad2d_backwardr  replication_pad2d_backwardr  r  reflection_pad3dr  replication_pad3dr  reflection_pad3d_backwardreplication_pad3d_backwardr  _pdist_forwardrZ   r  _pdist_backwardr  baddbmmr  	bernoullir  
bernoulli_r  r  r	  poissonr  _fused_moving_avg_obs_fq_helperr  mmr  r  r   r;  rB  miopen_batch_normrO  convolutionrU  r  _has_mkldnnr  rV  _convolution_pointwiser\  _linear_pointwiser_  has_mklr  r   _mkl_linearrb  r  rc  qconv2d_pointwiseqconv_pointwiserG  ro  binarybinary_tensorry  qlinear_pointwiser}  r  linear_dynamic_fp16linear_relu_dynamic_fp16r  r  r  
max_pool2dr  int4mm_packed_weight_cpur  r  quantize_per_tensorr  
avg_pool2dr  r  avg_pool2d_backwardr  
avg_pool3dr  avg_pool3d_backwardr  _adaptive_avg_pool2dr  _adaptive_avg_pool3dr  _adaptive_avg_pool2d_backwardr  _adaptive_avg_pool3d_backwardr  r  adaptive_max_pool2dr  r  r  adaptive_max_pool3dr  r  r  repeat_interleaver  ro   r  r  r  r   _unsafe_indexr  convolution_backwardr)  addbmmr.  randint_liker1  _fused_adam__fused_adamw_rF  _fused_adamrI  _int_mmrJ  _convert_weight_to_int4packrQ  #_convert_weight_to_int4pack_for_cpurS  _weight_int4pack_mmrW  _weight_int4pack_mm_for_cpurZ  r^  r_  r  _dyn_quant_pack_4bit_weightr  _dyn_quant_matmul_4bitr  _weight_int8pack_mmr  _cdist_forwardr  _cdist_backwardr  _embedding_bagr  _embedding_bag_forward_onlyr  r  nansumr  median	nanmedianr  
dim_valuesr:  r   r  logical_not_r  repeatr  zero_r  mul_Scalardiv_logical_and_logical_or_logical_xor_r  add_sub_r  r  subr  rounddecimalsr  r  
__rshift__r  
__lshift__r  zeror  ra  r  fillr  relu_r  	_add_relur  rrelu_with_noiser  rrelu_with_noise_functionalr  rrelu_with_noise_r  	index_put_unsafe_index_putr  masked_fill_r  _masked_scaler  masked_scatter_r  masked_scatterr  masked_scatter_backwardr  
index_put_r  r  bmmr  	dtype_outr  r  r  r  r  r  r+  r  r   max_pool2d_with_indices_backwardr-  max_pool2d_with_indicesr.  fractional_max_pool2dr7  max_pool3d_with_indicesr=   max_pool3d_with_indices_backwardr>  rB  rC  rJ  grid_sampler_2d_backwardrQ  rV  rW  rY  r  onesrf  zerosrh  select_scatterrk  slice_scatterrn  rj  r   rr  rx  gatherr|  r  r  r  r  r  scatter_addr  scatter_add_r  r  rB  r  r}  value_reducer  scatter_r  #_scaled_dot_product_flash_attentionr  r  r  #_scaled_dot_product_cudnn_attentionr  0_scaled_dot_product_fused_attention_overrideabler  9_scaled_dot_product_fused_attention_overrideable_backwardr  ,_scaled_dot_product_flash_attention_backwardr  +_scaled_dot_product_flash_attention_for_cpur  4_scaled_dot_product_flash_attention_for_cpu_backwardr  *_scaled_dot_product_attention_math_for_mpsr  '_scaled_dot_product_efficient_attentionr  0_scaled_dot_product_efficient_attention_backwardr  ,_scaled_dot_product_cudnn_attention_backwardr  _flash_attention_forwardr  +_flash_attention_forward_no_dropout_inplacer  r  _flash_attention_backwardr  _efficient_attention_forwardr  _efficient_attention_backwardSymIntr  rG  
_scaled_mmrH  r  _scaled_mm_v2r  scatter_reducetwotwo_outr  scatter_reduce_r  multinomialr  r  r  r  _upsample_nearest_exact1dr  _upsample_nearest_exact2dr  "_upsample_nearest_exact2d_backwardr  _upsample_nearest_exact3dr   r  values_stabler  r  _thnn_fused_lstm_cellr  r  r  r  r  r  r  argminr  r  topkr  _segment_reduce_backwardr  kthvaluer  r   r  r  r  r  pixel_shuffler  r  	bucketize
Tensor_outr  
Scalar_outr  histcr"  _upsample_bilinear2d_aa_upsample_bicubic2d_aa_upsample_lanczos2d_aar#   _upsample_bilinear2d_aa_backward_upsample_lanczos2d_aa_backwardr$  r&  r*  rs  r/  searchsortedr3  r6  embedding_dense_backwardr8  _embedding_bag_backwardr<  _embedding_bag_dense_backwardr:  *_embedding_bag_per_sample_weights_backwardrA  isinrF  	polygammarG  _local_scalar_denserH  rI  rJ  rR  r  _grouped_mmr  _scaled_grouped_mmr  _foreach_normr  _softmaxr  constant_pad_ndr  r  _jagged_to_padded_dense_forwardr  r  _native_multi_head_attentionr  r  special_airy_aispecial_bessel_y0special_bessel_y1special_modified_bessel_i0special_modified_bessel_i1special_modified_bessel_k0special_modified_bessel_k1!special_scaled_modified_bessel_k0!special_scaled_modified_bessel_k1special_chebyshev_polynomial_tspecial_chebyshev_polynomial_uspecial_chebyshev_polynomial_vspecial_chebyshev_polynomial_w&special_shifted_chebyshev_polynomial_t&special_shifted_chebyshev_polynomial_u&special_shifted_chebyshev_polynomial_v&special_shifted_chebyshev_polynomial_wspecial_hermite_polynomial_hspecial_hermite_polynomial_hespecial_laguerre_polynomial_lspecial_legendre_polynomial_pr  r  r  r  lerp_addcmul_addcdiv_torch._refs.nn.functionaltorch._refs.specialr  r2   r2   r2   r6   <module>   s  <(
	:	1%
C





(
"
-

,,	
>



#
	

	











	




'



"

2
*
*
#7
 (,&
%
	
:

/Z&5?
+(
0



"


 %	&6&
Q,
H
XN



.


*" 1
$  
#c	







-




/


T	
\>	
6
L+&
T

b_( 


,$)	








?	
	+
3
	
	
		
$	

0,	
7	
	
M	
)	
*	
	
2	
< E
	

  n
	





'7,'

 "
0


*


"
	


/	 
Z	
(

60




H