o
    m9:jؠ                     @   s  d dl mZ d dlZd dlmZ d dlmZmZ d dlm	Z	m
Z
mZ ddlmZ ddlmZ dd	lmZ g d
ZG dd deZG dd deZG dd deeZG dd deZG dd deeZG dd deZG dd deeZG dd deZG dd deeZG dd deZdS )    )AnyN)Tensor)
functionalinit)	ParameterUninitializedBufferUninitializedParameter   )SyncBatchNorm)LazyModuleMixin)Module)BatchNorm1dLazyBatchNorm1dBatchNorm2dLazyBatchNorm2dBatchNorm3dLazyBatchNorm3dr
   c                       s   e Zd ZU dZdZg dZeed< eed< edB ed< e	ed< e	ed	< 	
					ddddedededB de	d	e	de	ddf fddZ
dddZdddZdd Zdd Z		d fddZ  ZS )	_NormBasez,Common base of _InstanceNorm and _BatchNorm.   )track_running_statsmomentumepsnum_featuresaffiner   r   Nr   r   r   h㈵>皙?Tbiasr   returnc          
   	      s:  ||d}	t    || _|| _|| _|| _|| _| jr>ttj	|fi |	| _
|r7ttj	|fi |	| _n| dd  n| dd  | dd  | jr| dtj|fi |	 | dtj|fi |	 |  |  | dtj	ddtjid	d
 |	 D  |  n| dd  | dd  | dd  |   d S )Ndevicedtyper   weightrunning_meanrunning_varnum_batches_trackedr   r!   c                 S      i | ]\}}|d kr||qS r!    .0kvr(   r(   a/home/nk/hobo-godmode/plappi-mvp/.venv/lib/python3.10/site-packages/torch/nn/modules/batchnorm.py
<dictcomp>Q       z&_NormBase.__init__.<locals>.<dictcomp>r   )super__init__r   r   r   r   r   r   torchemptyr"   r   register_parameterregister_bufferzerosonestensorlongitemsreset_parameters
selfr   r   r   r   r   r    r!   r   factory_kwargs	__class__r(   r-   r2   &   sL   

	z_NormBase.__init__c                 C   s.   | j r| j  | jd | j  d S d S )Nr	   )r   r#   zero_r$   fill_r%   r>   r(   r(   r-   reset_running_stats[   s
   
z_NormBase.reset_running_statsc                 C   s<   |    | jrt| j | jd urt| j d S d S d S N)rE   r   r   ones_r"   r   zeros_rD   r(   r(   r-   r<   c   s   
z_NormBase.reset_parametersc                 C   s   t rF   )NotImplementedErrorr>   inputr(   r(   r-   _check_input_dimj   s   z_NormBase._check_input_dimc                 C   s    dj di | jd| jd uiS )Nz{{num_features}, eps={eps}, momentum={momentum}, affine={affine}, bias={use_bias}, track_running_stats={track_running_stats}use_biasr(   )format__dict__r   rD   r(   r(   r-   
extra_reprm   s   z_NormBase.extra_reprc           
   	      s   | dd }|d u s|dk r4| jr4|d }	|	|vr4| jd ur*| jjtdkr*| jntjdtjd||	< t ||||||| d S )Nversionr   r%   metar   r'   )	getr   r%   r    r3   r9   r:   r1   _load_from_state_dict)
r>   
state_dictprefixlocal_metadatastrictmissing_keysunexpected_keys
error_msgsrQ   num_batches_tracked_keyr@   r(   r-   rT   u   s$   

z_NormBase._load_from_state_dictr   r   TTNNr   N)__name__
__module____qualname____doc___version__constants__int__annotations__floatboolr2   rE   r<   rL   rP   rT   __classcell__r(   r(   r@   r-   r      sN   
 


5
	r   c                       sd   e Zd Z						ddddedededB d	ed
ededdf fddZdedefddZ  Z	S )
_BatchNormr   r   TNr   r   r   r   r   r   r   r   c          
         s2   ||d}	t  j|||||fi |	d|i d S Nr   r   )r1   r2   r=   r@   r(   r-   r2      s   
z_BatchNorm.__init__rK   c              
   C   s   |  | | jd u rd}n| j}| jr1| jr1| jd ur1| jd | jd u r.dt| j }n| j}	 | jr8d}n
| jd u oA| jd u }	 t	
|| jrL| jrO| jnd | jrV| jrY| jnd | j| j||| jS )N        r	         ?T)rL   r   trainingr   r%   add_rg   r#   r$   F
batch_normr"   r   r   )r>   rK   exponential_average_factorbn_trainingr(   r(   r-   forward   s:   



z_BatchNorm.forwardr]   )
r_   r`   ra   re   rg   rh   r2   r   rt   ri   r(   r(   r@   r-   rj      s2    

rj   c                       s^   e Zd ZU eed< eed< 						ddd	d fd	d
Zd fddZdddZ  ZS )_LazyNormBaser"   r   r   r   TNr   r   c          	         s   ||d}t  jd||ddfi |ddi || _|| _| jr2tdi || _|r2tdi || _| jr[tdi || _tdi || _	t
j	d	dt
jidd | D | _d S d S )
Nr   r   Fr   r!   c                 S   r&   r'   r(   r)   r(   r(   r-   r.     r/   z*_LazyNormBase.__init__.<locals>.<dictcomp>r(   r0   )r1   r2   r   r   r   r"   r   r   r#   r$   r3   r9   r:   r;   r%   )	r>   r   r   r   r   r    r!   r   r?   r@   r(   r-   r2      s:   
	z_LazyNormBase.__init__c                    s(   |   s| jdkrt   d S d S d S )Nr   )has_uninitialized_paramsr   r1   r<   rD   r@   r(   r-   r<     s   z_LazyNormBase.reset_parametersc                 C   s   |   rO|jd | _| jr6t| jtstd| j| jf | j	d ur6t| j	ts.td| j	| jf | j
rI| j| jf | j| jf |   d S d S )Nr	   z-self.weight must be an UninitializedParameterz+self.bias must be an UninitializedParameter)rv   shaper   r   
isinstancer"   r   AssertionErrormaterializer   r   r#   r$   r<   rJ   r(   r(   r-   initialize_parameters  s.   
z#_LazyNormBase.initialize_parametersr]   r^   )	r_   r`   ra   r   rf   r2   r<   r{   ri   r(   r(   r@   r-   ru      s    
 	
,ru   c                   @      e Zd ZdZdddZdS )r   a  Applies Batch Normalization over a 2D or 3D input.

    Method described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the number of features or channels of the input). By default, the
    elements of :math:`\gamma` are set to 1 and the elements of :math:`\beta` are set to 0.
    At train time in the forward pass, the variance is calculated via the biased estimator,
    equivalent to ``torch.var(input, correction=0)``. However, the value stored in the
    moving average of the variance is calculated via the unbiased  estimator, equivalent to
    ``torch.var(input, correction=1)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, L)` slices, it's common terminology to call this Temporal Batch Normalization.

    Args:
        num_features: number of features or channels :math:`C` of the input
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``
        bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
            :attr:`affine` is ``True``). Default: ``True``

    Shape:
        - Input: :math:`(N, C)` or :math:`(N, C, L)`, where :math:`N` is the batch size,
          :math:`C` is the number of features or channels, and :math:`L` is the sequence length
        - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm1d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm1d(100, affine=False)
        >>> input = torch.randn(20, 100)
        >>> output = m(input)
    r   Nc                 C   4   |  dkr|  dkrtd|   dd S d S Nr      zexpected 2D or 3D input (got D input)dim
ValueErrorrJ   r(   r(   r-   rL   {     zBatchNorm1d._check_input_dimr^   r_   r`   ra   rb   rL   r(   r(   r(   r-   r   2  s    Hr   c                   @      e Zd ZdZeZdddZdS )r   a  A :class:`torch.nn.BatchNorm1d` module with lazy initialization.

    Lazy initialization based on the ``num_features`` argument of the :class:`BatchNorm1d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``
        bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
            :attr:`affine` is ``True``). Default: ``True``
    r   Nc                 C   r}   r~   r   rJ   r(   r(   r-   rL     r   z LazyBatchNorm1d._check_input_dimr^   )r_   r`   ra   rb   r   cls_to_becomerL   r(   r(   r(   r-   r         r   c                   @   r|   )r   a3  Applies Batch Normalization over a 4D input.

    4D is a mini-batch of 2D inputs
    with additional channel dimension. Method described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
    to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
    standard-deviation is calculated via the biased estimator, equivalent to
    ``torch.var(input, correction=0)``. However, the value stored in the moving average of the
    standard-deviation is calculated via the unbiased  estimator, equivalent to
    ``torch.var(input, correction=1)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, H, W)` slices, it's common terminology to call this Spatial Batch Normalization.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, H, W)`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``
        bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
            :attr:`affine` is ``True``). Default: ``True``

    Shape:
        - Input: :math:`(N, C, H, W)`
        - Output: :math:`(N, C, H, W)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm2d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm2d(100, affine=False)
        >>> input = torch.randn(20, 100, 35, 45)
        >>> output = m(input)
    r   Nc                 C   $   |  dkrtd|   dd S N   zexpected 4D input (got r   r   rJ   r(   r(   r-   rL        zBatchNorm2d._check_input_dimr^   r   r(   r(   r(   r-   r         Ir   c                   @   r   )r   a  A :class:`torch.nn.BatchNorm2d` module with lazy initialization.

    Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm2d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``
        bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
            :attr:`affine` is ``True``). Default: ``True``
    r   Nc                 C   r   r   r   rJ   r(   r(   r-   rL     r   z LazyBatchNorm2d._check_input_dimr^   )r_   r`   ra   rb   r   r   rL   r(   r(   r(   r-   r     r   r   c                   @   r|   )r   ah  Applies Batch Normalization over a 5D input.

    5D is a mini-batch of 3D inputs with additional channel dimension as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
    to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
    standard-deviation is calculated via the biased estimator, equivalent to
    ``torch.var(input, correction=0)``. However, the value stored in the moving average of the
    standard-deviation is calculated via the unbiased  estimator, equivalent to
    ``torch.var(input, correction=1)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, D, H, W)` slices, it's common terminology to call this Volumetric Batch Normalization
    or Spatio-temporal Batch Normalization.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, D, H, W)`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``
        bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
            :attr:`affine` is ``True``). Default: ``True``

    Shape:
        - Input: :math:`(N, C, D, H, W)`
        - Output: :math:`(N, C, D, H, W)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm3d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm3d(100, affine=False)
        >>> input = torch.randn(20, 100, 35, 45, 10)
        >>> output = m(input)
    r   Nc                 C   r   N   zexpected 5D input (got r   r   rJ   r(   r(   r-   rL   a  r   zBatchNorm3d._check_input_dimr^   r   r(   r(   r(   r-   r     r   r   c                   @   r   )r   a  A :class:`torch.nn.BatchNorm3d` module with lazy initialization.

    Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm3d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``
        bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
            :attr:`affine` is ``True``). Default: ``True``
    r   Nc                 C   r   r   r   rJ   r(   r(   r-   rL     r   z LazyBatchNorm3d._check_input_dimr^   )r_   r`   ra   rb   r   r   rL   r(   r(   r(   r-   r   f  r   r   c                       s   e Zd ZdZ							ddddeded	edB d
edededB deddf fddZdddZ	dddZ
dedefddZedddZ  ZS )r
   a  Applies Batch Normalization over a N-Dimensional input.

    The N-D input is a mini-batch of [N-2]D inputs with additional channel dimension) as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over all
    mini-batches of the same process groups. :math:`\gamma` and :math:`\beta`
    are learnable parameter vectors of size `C` (where `C` is the input size).
    By default, the elements of :math:`\gamma` are sampled from
    :math:`\mathcal{U}(0, 1)` and the elements of :math:`\beta` are set to 0.
    The standard-deviation is calculated via the biased estimator, equivalent to
    `torch.var(input, correction=0)`.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done for each channel in the ``C`` dimension, computing
    statistics on ``(N, +)`` slices, it's common terminology to call this Volumetric Batch
    Normalization or Spatio-temporal Batch Normalization.

    Currently :class:`SyncBatchNorm` only supports
    :class:`~torch.nn.DistributedDataParallel` (DDP) with single GPU per process. Use
    :meth:`torch.nn.SyncBatchNorm.convert_sync_batchnorm()` to convert
    :attr:`BatchNorm*D` layer to :class:`SyncBatchNorm` before wrapping
    Network with DDP.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, +)`
        eps: a value added to the denominator for numerical stability.
            Default: ``1e-5``
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``
        process_group: synchronization of stats happen within each process group
            individually. Default behavior is synchronization across the whole
            world
        bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
            :attr:`affine` is ``True``). Default: ``True``

    Shape:
        - Input: :math:`(N, C, +)`
        - Output: :math:`(N, C, +)` (same shape as input)

    .. note::
        Synchronization of batchnorm statistics occurs only while training, i.e.
        synchronization is disabled when ``model.eval()`` is set or if
        ``self.training`` is otherwise ``False``.

    Examples::

        >>> # xdoctest: +SKIP
        >>> # With Learnable Parameters
        >>> m = nn.SyncBatchNorm(100)
        >>> # creating process group (optional)
        >>> # ranks is a list of int identifying rank ids.
        >>> ranks = list(range(8))
        >>> r1, r2 = ranks[:4], ranks[4:]
        >>> # Note: every rank calls into new_group for every
        >>> # process group created, even if that rank is not
        >>> # part of the group.
        >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
        >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm3d(100, affine=False, process_group=process_group)
        >>> input = torch.randn(20, 100, 35, 45, 10)
        >>> output = m(input)

        >>> # network is nn.BatchNorm layer
        >>> sync_bn_network = nn.SyncBatchNorm.convert_sync_batchnorm(network, process_group)
        >>> # only single gpu per process is currently supported
        >>> ddp_sync_bn_network = torch.nn.parallel.DistributedDataParallel(
        >>>                         sync_bn_network,
        >>>                         device_ids=[args.local_rank],
        >>>                         output_device=args.local_rank)
    r   r   TNr   r   r   r   r   r   process_groupr   r   c	                   s8   ||d}
t  j|||||fi |
d|	i || _d S rk   )r1   r2   r   )r>   r   r   r   r   r   r   r    r!   r   r?   r@   r(   r-   r2     s   

	zSyncBatchNorm.__init__c                 C   s$   |  dk rtd|   dd S )Nr   z expected at least 2D input (got r   r   rJ   r(   r(   r-   rL     r   zSyncBatchNorm._check_input_dimc                 C   s   | ddkrtdd S )Nr	   r   z9SyncBatchNorm number of input channels should be non-zero)sizer   rJ   r(   r(   r-   _check_non_zero_input_channels  s
   z,SyncBatchNorm._check_non_zero_input_channelsrK   c           	      C   s  |  | | | | jdu rd}n| j}| jr:| jr:| jdu r$td| jd | jdu r7d| j  }n| j}	 | jrAd}n
| j	du oJ| j
du }	 | jrR| jrU| j	nd}| jr]| jr`| j
nd}|op| joptj optj }|r|jjddd	tj fvrtd
tj  tjjj}| jr| j}tj|}|dk}|st|||| j| j||| jS |stdt|| j| j||| j|||	S )z(
        Runs the forward pass.
        Nrl   z$num_batches_tracked must not be Noner	   rm   Tcudahpuxpuz;SyncBatchNorm expected input tensor to be on GPU or XPU or zbn_training must be True)rL   r   r   rn   r   r%   ry   ro   itemr#   r$   r3   distributedis_availableis_initializedr    type_C_get_privateuse1_backend_namer   groupWORLDr   get_world_sizerp   rq   r"   r   r   sync_batch_normapply)	r>   rK   rr   rs   r#   r$   	need_syncr   
world_sizer(   r(   r-   rt     s   





zSyncBatchNorm.forwardc              	   C   s   |}t |tjjjjrXtjj|j|j|j	|j
|j||jdud}|j
r?t  |j|_|j|_W d   n1 s:w   Y  |j|_|j|_|j|_|j|_t|drX|j|_| D ]\}}||| || q\~|S )aa  Converts all :attr:`BatchNorm*D` layers in the model to :class:`torch.nn.SyncBatchNorm` layers.

        Args:
            module (nn.Module): module containing one or more :attr:`BatchNorm*D` layers
            process_group (optional): process group to scope synchronization,
                default is the whole world

        Returns:
            The original :attr:`module` with the converted :class:`torch.nn.SyncBatchNorm`
            layers. If the original :attr:`module` is a :attr:`BatchNorm*D` layer,
            a new :class:`torch.nn.SyncBatchNorm` layer object will be returned
            instead.

        Example::

            >>> # Network with nn.BatchNorm layer
            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
            >>> module = torch.nn.Sequential(
            >>>            torch.nn.Linear(20, 100),
            >>>            torch.nn.BatchNorm1d(100),
            >>>          ).cuda()
            >>> # creating process group (optional)
            >>> # ranks is a list of int identifying rank ids.
            >>> ranks = list(range(8))
            >>> r1, r2 = ranks[:4], ranks[4:]
            >>> # Note: every rank calls into new_group for every
            >>> # process group created, even if that rank is not
            >>> # part of the group.
            >>> # xdoctest: +SKIP("distributed")
            >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
            >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
            >>> sync_bn_module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module, process_group)

        Nr   qconfig)rx   r3   nnmodules	batchnormrj   r
   r   r   r   r   r   r   no_gradr"   r#   r$   r%   rn   hasattrr   named_children
add_moduleconvert_sync_batchnorm)clsmoduler   module_outputnamechildr(   r(   r-   r   y  s8   $	


z$SyncBatchNorm.convert_sync_batchnorm)r   r   TTNNNr^   rF   )r_   r`   ra   rb   re   rg   rh   r   r2   rL   r   r   rt   classmethodr   ri   r(   r(   r@   r-   r
     sB    k

cr
   )typingr   r3   r   torch.nnr   rp   r   torch.nn.parameterr   r   r   
_functionsr
   r   lazyr   r   r   __all__r   rj   ru   r   r   r   r   r   r   r(   r(   r(   r-   <module>   s&   KON$O$O$