o
    m9:jY                  #   @   sD  d dl mZ d dlZd dlmZ ddlmZmZmZmZm	Z	m
Z
mZmZmZmZmZmZmZmZ ddgZG dd deZd	d
e de de de d	 e_							d)dee dee dee dee dedB dedB dedB dededB dedededededededdf"d dZd!d" Zdee dee dee dee dedB dedB dededededededededdfd#d$Zdee dee dee dee dedB dedB dededededededededdfd%d&Zdee dee dee dee dedB dedB deeB dedededededededdfd'd(ZdS )*    )castN)Tensor   )_default_to_fused_or_foreach_device_dtype_check_for_fused_differentiable_doc_foreach_doc_get_scalar_dtype
_get_value_maximize_doc_params_doc
_to_scalar_use_grad_for_differentiable_view_as_real
DeviceDict	OptimizerParamsTAdagradadagradc                       s   e Zd Z						ddddddedeeB d	ed
ededededB dedededB ddf fddZ fddZdddZ	dd Z
edddZ  ZS )r   {Gz?r   绽|=NF)maximizedifferentiablefusedparamslrlr_decayweight_decayinitial_accumulator_valueepsforeachr   r   r   returnc             
      sh  t |tr| dkrtdd|kstd| d|ks%td| d|ks0td| d|ks;td| d|ksFtd| ||||||||	|
d		}t || |
rm|	ratd
|rgtdd| _d| _| j	D ]A}|d D ]:}| j
| }|d rtjdt|d d|jdntjdt d|d< t|rt||n|}tj||tjd|d< qvqpd S )Nr   zTensor lr must be 1-element        zInvalid learning rate: zInvalid lr_decay value: zInvalid weight_decay value: z)Invalid initial_accumulator_value value: zInvalid epsilon value: )	r   r   r   r   r   r    r   r   r   z)`fused` does not support `differentiable`z0`fused` and `foreach` cannot be `True` together.Tr   r    is_fuseddtypedevicer'   stepmemory_formatsum)
isinstancer   numel
ValueErrorsuper__init__RuntimeError"_need_device_dtype_check_for_fused_step_supports_amp_scalingparam_groupsstatetorchzerosr	   r(   tensor
is_complexcomplex	full_likepreserve_format)selfr   r   r   r   r   r   r    r   r   r   defaultsgrouppr7   
init_value	__class__r#   Z/home/nk/hobo-godmode/plappi-mvp/.venv/lib/python3.10/site-packages/torch/optim/adagrad.pyr2      sh   

zAdagrad.__init__c           
         s$  t  | d }| jD ]U}|dd  |dd |dd |dd }|d D ]6}| j|g }t|dkr_t|d s_t	|d }|d rUtj
|t|d	|jd
ntj
|t d|d< q)qt| j }t|dkovt|d d }|s|D ]}	tj
t	|	d t|d	d|	d< q{d S d S )Nr    r   Fr   r   r   r   r*   r$   r&   r)   )r1   __setstate__r6   
setdefaultr7   getlenr8   	is_tensorfloatr:   r	   r(   listvalues)
r?   r7   r   rA   rB   p_statestep_valstate_valuesstep_is_tensorsrD   r#   rF   rG   d   s@   

zAdagrad.__setstate__c                 C   s4   | j D ]}|d D ]}| j| }|d   q	qdS )z6Calls tensor.share_memory_() on the state sum tensors.r   r-   N)r6   r7   share_memory_)r?   rA   rB   r7   r#   r#   rF   share_memory   s   

zAdagrad.share_memoryc                 C   s,  d\}}|d D ]}|j d ur|d r t| ddr t| d| _||j jO }|t|O }|| ||j  | j| }	t	|	dkr|d rKt| |d r]tj
dt|d d	|jd
ntjdt d|	d< | jd }
t|rvt|
|
n|
}tj||tjd|	d< ||	d  ||	d  q||fS )N)FFr   r   r4   TFr   r#   r$   r&   r"   r)   r*   r   r+   r-   )gradgetattrr   r4   	is_sparser8   r;   appendr7   rJ   r9   r	   r(   r:   r@   r<   r=   r>   )r?   rA   params_with_gradgrads
state_sumsstate_stepshas_sparse_gradhas_complexrB   r7   r   rC   r#   r#   rF   _init_group   sP   





zAdagrad._init_groupc           
      C   s   d}|durt   | }W d   n1 sw   Y  | jD ]A}g }g }g }g }| |||||\}}	t|||||d |d |d |d ||d |d |d |	|d	 t| d
dt| ddd q |S )zPerform a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr   r   r   r   r    r   r   r   
grad_scale	found_inf)r   r   r   r   r^   r    r   r   r_   r   ra   rb   )r8   enable_gradr6   r`   r   rW   )
r?   closurelossrA   rZ   r[   r\   r]   r^   r_   r#   r#   rF   r*      s@   




zAdagrad.step)r   r   r   r   r   N)r!   NN)__name__
__module____qualname__r   rL   r   boolr2   rG   rU   r`   r   r*   __classcell__r#   r#   rD   rF   r      sN    

G
#.a[  Implements Adagrad algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma \text{ (lr)}, \: \theta_0 \text{ (params)}, \: f(\theta)
                \text{ (objective)}, \: \lambda \text{ (weight decay)},                          \\
            &\hspace{12mm}    \tau \text{ (initial accumulator value)}, \: \eta\text{ (lr decay)}\\
            &\textbf{initialize} :  state\_sum_0 \leftarrow \tau                          \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
            &\hspace{5mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})           \\
            &\hspace{5mm} \tilde{\gamma}    \leftarrow \gamma / (1 +(t-1) \eta)                  \\
            &\hspace{5mm} \textbf{if} \: \lambda \neq 0                                          \\
            &\hspace{10mm} g_t \leftarrow g_t + \lambda \theta_{t-1}                             \\
            &\hspace{5mm}state\_sum_t  \leftarrow  state\_sum_{t-1} + g^2_t                      \\
            &\hspace{5mm}\theta_t \leftarrow
                \theta_{t-1}- \tilde{\gamma} \frac{g_t}{\sqrt{state\_sum_t}+\epsilon}            \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `Adaptive Subgradient Methods for Online Learning
    and Stochastic Optimization`_.
    z
    Args:
        a  
        lr (float, Tensor, optional): learning rate (default: 1e-2)
        lr_decay (float, optional): learning rate decay (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        initial_accumulator_value (float, optional): initial value of the
            sum of squares of gradients (default: 0)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-10)
        z	
        a  
        fused (bool, optional): whether the fused implementation (CPU and CUDA only) is used.
            Currently, `torch.float64`, `torch.float32`, `torch.float16`, and `torch.bfloat16`
            are supported. (default: None). Please note that the fused implementations does not
            support sparse or complex gradients.
    .. _Adaptive Subgradient Methods for Online Learning and Stochastic
        Optimization: http://jmlr.org/papers/v12/duchi11a.html

    Fr   r[   r\   r]   r   ra   rb   r^   r    r   r_   r   r   r   r   r   r!   c                C   s   t dd |D std|du r|du rt| |	dd\}}|du r$d}|du r*d}|r5tj r5td|r@tj r@td|rJtj sJt}n|rTtj sTt}nt}|| ||||||||||	|
||d	 dS )
ztFunctional API that performs Adagrad algorithm computation.

    See :class:`~torch.optim.Adagrad` for details.
    c                 s   s    | ]	}t |tjV  qd S rf   )r.   r8   r   ).0tr#   r#   rF   	<genexpr>6  s    zadagrad.<locals>.<genexpr>zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsNF)	use_fusedz6torch.jit.script not supported with foreach optimizersz4torch.jit.script not supported with fused optimizers
r   r   r   r   r^   r   r   r_   ra   rb   )	allr3   r   r8   jitis_scripting_fused_adagrad_multi_tensor_adagrad_single_tensor_adagrad)r   r[   r\   r]   r   ra   rb   r^   r    r   r_   r   r   r   r   r   _funcr#   r#   rF   r     sJ   

c                 C   s   |   }t|||S rf   )sizer8   sparse_coo_tensor)rV   grad_indicesrN   ry   r#   r#   rF   _make_sparseg  s   r|   c             	   C   s  |d us|d urt dtj st|}t| |||ddD ]\}}}}|d7 }t|}|s0|n| }|dkrE|jr>td|j	||d}|d|d |   }|jr|
 }| }| }|t|||d ||}|  |	}|jt|||| | d qt|}|rt|}t|}t|}|j||dd	 |r| |	 }n| |	}|j||| d	 |rt|}t|}qd S )
N,Expected grad_scale and found_inf to be NoneT)strictr   r   z;weight_decay option is not compatible with sparse gradientsalpha   value)AssertionErrorr8   rr   rs   r   zipr
   rX   r3   addcoalesce_indices_valuesadd_r|   powsparse_masksqrt_r;   view_as_realaddcmul_sqrtaddcdiv_view_as_complex)r   r[   r\   r]   ra   rb   r   r   r   r   r^   r   r   r_   paramrV   	state_sumstep_tr*   clrr{   grad_valuesstd
std_valuesr;   r#   r#   rF   rv   l  sT   








rv   c                   s  |rt d|d us|d urt dt| dkrd S t  t| |||g}| D ]\\}}}}}ttt |}ttt |}ttt |}ttt |}|
oYt	dd |D }|rot
|||| ||	d|||||d q+|rwt||| |r~t|}tj s|d jrtj|tjdd	d
dd nt|d |dkr|rtj|||d ntj|||d} fdd|D }tj|||dd t|}t||	 |dks|rt|| |}nt||}t||| q+d S )Nz#_foreach ops don't support autogradr}   r   c                 s   s    | ]}|j V  qd S rf   )rX   )rl   rV   r#   r#   rF   rn     s    
z(_multi_tensor_adagrad.<locals>.<genexpr>Trp   g      ?cpu)r(   r   r   c                    s&   g | ]}  d t |d     qS )r   )r
   )rl   r*   r   r   r#   rF   
<listcomp>  s    z)_multi_tensor_adagrad.<locals>.<listcomp>r   )r   rJ   r   r   "_group_tensors_by_device_and_dtyperN   r   rM   r   anyrv   r   r8   _foreach_negcompileris_compilingis_cpu_foreach_add_r:   _foreach_add_foreach_addcmul__foreach_sqrt_foreach_mul__foreach_mul_foreach_addcdiv_)r   r[   r\   r]   ra   rb   r   r   r   r   r^   r   r   r_   grouped_tensorlistsdevice_params_device_grads_device_state_sums_device_state_steps_rw   device_paramsdevice_gradsdevice_state_sumsdevice_state_stepsdevice_has_sparse_grad	minus_clrr   	numeratorr#   r   rF   ru     s   


ru   c                C   s  | sd S |
s|rt d|rt d|d ur|j|ini }|d ur&|j|ini }t|tr9t|jdkr9|j|ind }t| |||g}| D ]\\}}\\}}}}}tt	t |}tt	t |}tt	t |}tt	t |}d\}}|d ur|
||j|dd}|d ur|
||j|dd}|d ur||vr|j|dd||< || }t|d tj||||||||	|||d	 |d urt||gt|  qHd S )
Nz5`fused` does not support sparse grad or complex paramz<adagrad with fused=True does not support differentiable=Truer   )NNT)non_blocking)r(   r   r   )r   r   r   r   r   ra   rb   )r3   r(   r.   r   strr   r   itemsr   rM   rH   tor8   r   _fused_adagrad__foreach_sub_rJ   )r   r[   r\   r]   ra   rb   r   r   r   r   r^   r   r   r_   grad_scale_dictfound_inf_dictlr_dictgrouped_tensorsr(   rw   r   r   r   r   r   r   r   r   device_grad_scaledevice_found_infr#   r#   rF   rt     sz   $
rt   )NNNFNFF)typingr   r8   r   	optimizerr   r   r   r   r	   r
   r   r   r   r   r   r   r   r   __all__r   __doc__rM   rj   rL   r   r|   rv   ru   rt   r#   r#   r#   rF   <module>   s6  @ P
8

J	

D	

o	
