o
    j9:j"?                  
   @   s  d dl Z d dlZd dlZd dlmZ d dlmZmZ d dlm	Z	 d dl
Z
d dlm  m  m  mZ d dlmZmZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZmZ d d
lm Z m!Z!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, g dZ-e*e.Z/eG dd dZ0G dd dZ1dee2B dB de3e	 de2fddZ4de&de5e2dB e6dB f fddZ7de0dee2B dB de3e	 de8e6e	f fddZ9dS )     N)Callable)	dataclassfield)Any)get_default_numa_optionsjustknobs_check)eventsmetrics)
WorkerSpec)create_healthcheck_server)_AliveCallbackProxyLocalElasticAgentTORCHELASTIC_HEALTH_CHECK_PORT)DefaultLogsSpecs	LogsSpecsSignalException)ChildFailedError)RendezvousParameters)parse_rendezvous_endpoint)
get_logger)NumaOptions)LaunchConfigelastic_launchlaunch_agentc                   @   sb  e Zd ZU dZeed< eed< eed< dZedB ed< dZe	ed< d	Z
e	ed
< dZe	ed< dZe	ed< eedZee	ef ed< dZeed< dZeed< dZeed< dZe	ed< dZe	dB ed< eedZee	e	f ed< dZe	dB ed< dZe	ed< dZedB ed< dZe	ed< dZee	 dB ed < dZee	 dB ed!< d"Z e!ed#< dZ"edB ed$< d%d& Z#dS )'r   a  
    Creates a rendezvous config.

    Args:
        min_nodes: Minimum amount of nodes that the user function will
                        be launched on. Elastic agent ensures that the user
                        function start only when the min_nodes amount enters
                        the rendezvous.
        max_nodes: Maximum amount of nodes that the user function
                        will be launched on.
        nproc_per_node: On each node the elastic agent will launch
                            this amount of workers that will execute user
                            defined function.
        rdzv_backend: rdzv_backend to use in the rendezvous (zeus-adapter, etcd).
        rdzv_endpoint: The endpoint of the rdzv sync. storage.
        rdzv_configs: Key, value pair that specifies rendezvous specific configuration.
        rdzv_timeout: Legacy argument that specifies timeout for the rendezvous. It is going
            to be removed in future versions, see the note below. The default timeout is 900 seconds.
        run_id: The unique run id of the job (if not passed a unique one will be
                deduced from run environment - flow workflow id in flow - or auto generated).
        role: User defined role of the worker (defaults to "trainer").
        max_restarts: The maximum amount of restarts that elastic agent will conduct
                    on workers before failure.
        monitor_interval: The interval in seconds that is used by the elastic_agent
                        as a period of monitoring workers.
        start_method: The method is used by the elastic agent to start the
                    workers (spawn, fork, forkserver).
        metrics_cfg: configuration to initialize metrics.
        local_addr: address of the local node if any. If not set, a lookup on the local
                machine's FQDN will be performed.
        local_ranks_filter: ranks for which to show logs in console. If not set, show from all.
        event_log_handler: name of the event logging handler as registered in
          `elastic/events/handlers.py <https://docs.pytorch.org/docs/stable/elastic/events.html>`_.
        duplicate_stdout_filters: If non-empty, duplicates stdout to a file containing only lines
                                that match _any_ of the filter strings.
        duplicate_stderr_filters: If non-empty, duplicates stderr to a file containing only lines
                                that match _any_ of the filter strings.
        virtual_local_rank: Enable virtual local rank mode for workers (defaults to False).
                           When enabled, LOCAL_RANK is set to 0 for all workers and
                           CUDA_VISIBLE_DEVICES is adjusted so each worker accesses its
                           assigned GPU at device index 0.
        shutdown_timeout: Time in seconds to wait for graceful shutdown of workers before
                        sending SIGKILL. Can also be set via TORCH_ELASTIC_SHUTDOWN_TIMEOUT
                        environment variable. Defaults to 30 seconds.


    .. note::
        `rdzv_timeout` is a legacy argument that will be removed in future.
        Set the timeout via `rdzv_configs['timeout']`

    	min_nodes	max_nodesnproc_per_nodeN
logs_specs run_iddefault_rolerolerdzv_endpointetcdrdzv_backend)default_factoryrdzv_configsrdzv_timeout   max_restartsg?monitor_intervalspawnstart_methodlog_line_prefix_templatemetrics_cfg
local_addrnullevent_log_handlernuma_optionszSIGTERM,SIGINT,SIGHUP,SIGQUITsignals_to_handleduplicate_stdout_filtersduplicate_stderr_filtersFvirtual_local_rankshutdown_timeoutc                 C   s   d}| j dkr| j | jd< n
d| jvr|| jd< | jd u r!t | _| jd u r>tj r>tj | j	kr>t
 | _td| j | jd u rOttjdd| _d S | jdk r\td| j d S )	Ni  r'   timeoutzUsing default numa options = %rTORCH_ELASTIC_SHUTDOWN_TIMEOUT30r   z+shutdown_timeout must be non-negative, got )r(   r&   r   r   r3   torchcudais_availabledevice_countr   r   loggerinfor8   intosenvironget
ValueError)selfdefault_timeout rI   e/home/nk/hobo-godmode/plappi-mvp/.venv/lib/python3.10/site-packages/torch/distributed/launcher/api.py__post_init__|   s,   








zLaunchConfig.__post_init__)$__name__
__module____qualname____doc__rB   __annotations__r   r   r   strr!   r"   r$   r   dictr&   r   r(   r*   r+   floatr-   r.   r/   r0   r2   r3   r   r4   r5   listr6   r7   boolr8   rK   rI   rI   rI   rJ   r   .   s4   
 4r   c                   @   s2   e Zd ZdZdedeeB dB fddZdd ZdS )	r   a  
    Launches an torchelastic agent on the container that invoked the entrypoint.

        1. Pass the ``entrypoint`` arguments as non ``kwargs`` (e.g. no named parameters)/
           ``entrypoint`` can be a function or a command.
        2. The return value is a map of each worker's output mapped
           by their respective global rank.

    Usage

    ::

    def worker_fn(foo):
        # ...

    def main():
        # entrypoint is a function.
        outputs = elastic_launch(LaunchConfig, worker_fn)(foo)
        # return rank 0's output
        return outputs[0]

        # entrypoint is a command and ``script.py`` is the python module.
        outputs = elastic_launch(LaunchConfig, "script.py")(args)
        outputs = elastic_launch(LaunchConfig, "python")("script.py")
    config
entrypointNc                 C   s   || _ || _d S N)_config_entrypoint)rG   rV   rW   rI   rI   rJ   __init__   s   
zelastic_launch.__init__c                 G   s   t | j| jt|S rX   )r   rY   rZ   rT   )rG   argsrI   rI   rJ   __call__   s   zelastic_launch.__call__)	rL   rM   rN   rO   r   r   rQ   r[   r]   rI   rI   rI   rJ   r      s    

r   rW   r\   returnc                 C   s@   t | tr| jS t | tr| tjkrtdd |D dS | S dS )a  Retrieve entrypoint name with the rule:
    1. If entrypoint is a function, use ``entrypoint.__qualname__``.
    2. If entrypoint is a string, check its value:
        2.1 if entrypoint equals to ``sys.executable`` (like "python"), use the first element from ``args``
            which does not start with hifen letter (for example, "-u" will be skipped).
        2.2 otherwise, use ``entrypoint`` value.
    3. Otherwise, return empty string.
    c                 s   s     | ]}|d  dkr|V  qdS )r   -NrI   ).0argrI   rI   rJ   	<genexpr>   s    z'_get_entrypoint_name.<locals>.<genexpr>r   )
isinstancer   rL   rQ   sys
executablenext)rW   r\   rI   rI   rJ   _get_entrypoint_name   s   
	

rg   rdzv_parametersc                 C   sX   | j dkrdS | j}| }|stdt|dd\}}|dkr(td| d||fS )Nstatic)NNzKEndpoint is missing in endpoint. Try to add --master-addr and --master-portr'   )default_portzport is missing in endpoint: z. Try to specify --master-port)backendendpointstriprF   r   )rh   rl   master_addrmaster_portrI   rI   rJ   _get_addr_and_port   s   

rp   rV   c                 C   s  | j stt j}td| || _ t||}tdi d|d| j	d| j
d| jd| j d| jd	| jd
| jd| jd| jd| jjd| jd| jd| jd| jd| jd| j td"| j| j| j | j	| j
| jd| j}t|\}}| jtjd< d }d }	tt}
|
d urt dddrzt! }	t"|	t|
dd}|#  td|
 W n t$y   tjddd d }d }	Y nw t%| j&| j|t'|t()|| j| j||| j| j| j| j| j| j*d}t+|| j| j,| j-| j.|d }|	d ur|	/|j0 d}z^z0t12t13| j |4 }t56|7 | j |8 r#t9||j:d!|j;W W |r0|j<=  S S  t9y9     t>yL   d}t56|? | j   t$y]   t56|? | j  w |rg|j<=  w w )#Nz3config has no run_id, generated a random run_id: %saR  Starting elastic_operator with launch configs:
  entrypoint               : %(entrypoint)s
  min_nodes                : %(min_nodes)s
  max_nodes                : %(max_nodes)s
  nproc_per_node           : %(nproc_per_node)s
  run_id                   : %(run_id)s
  rdzv_backend             : %(rdzv_backend)s
  rdzv_endpoint            : %(rdzv_endpoint)s
  rdzv_configs             : %(rdzv_configs)s
  max_restarts             : %(max_restarts)s
  monitor_interval         : %(monitor_interval)s
  log_dir                  : %(log_dir)s
  metrics_cfg              : %(metrics_cfg)s
  event_log_handler        : %(event_log_handler)s
  numa_options             : %(numa_options)s
  signals_to_handle        : %(signals_to_handle)s
  duplicate_stdout_filters : %(duplicate_stdout_filters)s
  duplicate_stderr_filters : %(duplicate_stderr_filters)s
rW   r   r   r   r   r$   r"   r&   r*   r+   log_dirr/   r2   r3   r4   r5   r6   )rk   rl   r   r   r   r0   TORCHELASTIC_SIGNALS_TO_HANDLEzNai_infra/pytorch_distributed:torchelastic_enable_healthcheck_before_rendezvousF)default<   )alive_callbackportr9   z>Started early health check server on port %s before rendezvousz)Failed to start early health check serverT)exc_info)r!   local_world_sizerW   r\   rdzv_handlerr*   r+   rn   ro   r0   r2   r3   r5   r6   r7   )specr   r-   r.   r8   health_check_server)namefailuresrI   )@r   rQ   uuiduuid4rB   r@   warningrg   rA   r   r   r   r$   r"   r&   r*   r+   r   root_log_dirr/   r2   r3   r4   r5   r6   r   r0   rp   rC   rD   getenvr   r   r   r   start	Exceptionr
   r!   tuplerdzv_registryget_rendezvous_handlerr7   r   r-   r.   r8   set_delegate_get_alive_timer	   initialize_metricsMetricsConfigrunr   recordget_event_succeeded	is_failedr   r}   return_valuesry   shutdownr   get_event_failed)rV   rW   r\   r   entrypoint_namerh   rn   ro   r{   alive_callback_proxyhealthcheck_portrz   agentshutdown_rdzvresultrI   rI   rJ   r      s
  
	
(


	
r   ):rC   rd   r~   collections.abcr   dataclassesr   r   typingr   r<   -torch.distributed.elastic.rendezvous.registrydistributedelastic
rendezvousregistryr   torch._utils_internalr   r   torch.distributed.elasticr   r	   *torch.distributed.elastic.agent.server.apir
   :torch.distributed.elastic.agent.server.health_check_serverr   :torch.distributed.elastic.agent.server.local_elastic_agentr   r   r   )torch.distributed.elastic.multiprocessingr   r   r   0torch.distributed.elastic.multiprocessing.errorsr   $torch.distributed.elastic.rendezvousr   *torch.distributed.elastic.rendezvous.utilsr   'torch.distributed.elastic.utils.loggingr   torch.numa.bindingr   __all__rL   r@   r   r   rQ   rT   rg   r   rB   rp   rR   r   rI   rI   rI   rJ   <module>   sN   l"'


