o
    SQ8j                     @   s6   d dl Zd dlZd dlZd dlmZ G dd dZdS )    N)dequec                   @   sd   e Zd ZdZejejeje	ddddfde
defdd	Zdd
dZdddZdddZdS )VADz
    A model class for a voice activity detection (VAD) based on Silero's model:

    https://github.com/snakers4/silero-vad
    	resourcesmodelszsilero_vad.onnx   
model_path	n_threadsc                 C   sT   t  }||_||_t j||dgd| _tdd| _t	d
tj| _|   dS )zInitialize the VAD model object.

            Args:
                model_path (str): The path to the Silero VAD ONNX model.
                n_threads (int): The number of threads to use for the VAD model.
        CPUExecutionProvider)sess_options	providers}   )maxleni>  N)ortSessionOptionsinter_op_num_threadsintra_op_num_threadsInferenceSessionmodelr   prediction_buffernparrayastypeint64sample_ratereset_states)selfr   r   sessionOptions r   W/home/nk/hobo-godmode/plappi-mvp/.venv/lib/python3.10/site-packages/openwakeword/vad.py__init__<   s   zVAD.__init__c                 C   s@   t d|dfd| _t d|dfd| _d| _d| _d S )N   @   float32r   )r   zerosr   _h_c_last_sr_last_batch_size)r   
batch_sizer   r   r   r   \   s   
zVAD.reset_states  c           	         s    fddt djd  D }g }|D ]%}|d | j| j| jd}| jd|}|\}| _| _||d d  qt	|S )aI  
        Get the VAD predictions for the input audio frame.

        Args:
            x (np.ndarray): The input audio, must be 16 khz and 16-bit PCM format.
                            If longer than the input frame, will be split into
                            chunks of length `frame_size` and the predictions for
                            each chunk returned. Must be a length that is integer
                            multiples of the `frame_size` argument.
            frame_size (int): The frame size in samples. The reccomended
                              default is 480 samples (30 ms @ 16khz),
                              but smaller and larger values
                              can be used (though performance may decrease).

        Returns
            float: The average predicted score for the audio frame
        c                    s(   g | ]}||   d   tjqS )i  )r   r   r"   ).0i
frame_sizexr   r   
<listcomp>t   s     zVAD.predict.<locals>.<listcomp>r   N)inputhcsrN)
rangeshaper$   r%   r   r   runappendr   mean)	r   r.   r-   chunksframe_predictionschunk
ort_inputsort_outsoutr   r,   r   predictb   s   
zVAD.predict  c                 C   s   | j | || d S r0   )r   r8   r@   )r   r.   r-   r   r   r   __call__   s   zVAD.__call__N)r   )r)   )rA   )__name__
__module____qualname____doc__ospathjoindirnameabspath__file__strintr   r   r@   rB   r   r   r   r   r   6   s"    

 
r   )onnxruntimer   numpyr   rG   collectionsr   r   r   r   r   r   <module>   s
   /