Source code for naplib.features.auditory_spectrogram

from os.path import dirname, join
import math
import numpy as np
from scipy.signal import resample, lfilter
from hdf5storage import loadmat

from naplib import logger


# read in cochlear filter from file
COCHBA = loadmat(
    join(dirname(__file__), 'cochba.mat'), variable_names=['COCHBA']
)['COCHBA']


def _sigmoid(x, factor):
    '''
    Nonlinear sigmoid for cochlear model which simulates hair
    cell nonlinearity.
    
    Parameters
    ----------
    x : float
        Data to pass through nonlinear function
    factor: float or string
        Nonlinear factor.
        If a number > 0, transistor-like function. 
        If 'boolean', hard-limiter
        If 'half-wave', half-wave rectifier
        Else, no operation and x is returned unchanged, i.e. linear
    '''
    if isinstance(factor, float):
        return 1 / (1 + np.exp(-x / factor))
    if factor == 'boolean':
        return (x > 0).astype('float')
    elif factor == 'half-wave':
        return np.maximum(x, 0)
    else:
        return x



[docs]
def auditory_spectrogram(x, sfreq, frame_len=8, tc=4, factor='linear'):
    '''
    Compute the auditory spectrogram of a signal using a model of the
    peripheral auditory system [#1yang]_. The model includes a cochlear
    filter bank of 128 constant-Q filters logarithmically-spaced, a hair
    cell model which includes a low-pass filter and a nonlinear compression
    function, and a lateral inhibitory network over the spectral axis. The
    envelope of each frequency band gives the time-frequency representation.
    
    Parameters
    ----------
    x : np.ndarray
        Acoustic signal to convert to time-frequency representation
    sfreq : int
        Sampling rate of the signal. This function is meant to be used on a signal
        with 16KHz sampling rate. If sfreq is different, it resamples the audio to 16KHz.
    frame_len : float, default=8
        Frame length of the output, in ms. Typically 8, 16, or a power of 2.
    tc : float, default=4
        Time constant for leaky integration. Must be >=0. If 0, then leaky
        integration becomes short-term average. Typically 4, 16, or 64 ms.
    factor : float or string, default='linear'
        Nonlinear factor for hair cell model. If a positive float, specifies the
        critical level factor (typically 0.1 for a unit sequence). The smaller the
        value, the more the compression. Or, can be one of
        {'linear', 'boolean', 'half-wave'} describing the compressor.
    
    Returns
    -------
    aud : np.ndarray
        Auditory spectrogram, shape (n_frames, 128).
    
    Notes
    -----
    This is a python re-implementation of the Matlab function `wav2aud` in the
    `NSLtools toolbox <https://github.com/tel/NSLtools>`_.

    For correct performance, x should be a float array.
    
    References
    ----------
    .. [#1yang] X. Yang, K. Wang and S. A. Shamma, "Auditory representations of
        acoustic signals," in IEEE Transactions on Information Theory, vol.
        38, no. 2, pp. 824-839, March 1992, doi: 10.1109/18.119739.
    
    '''
    
    if frame_len <= 0:
        raise ValueError(f'frame_len must be positive but got {frame_len}')
    
    if tc < 0:
        raise ValueError(f'time constant (tc) must be nonnegative but got {tc}')
    
    if isinstance(factor, (float, int)) and factor <= 0:
        raise ValueError(f'If a float, factor must be positive, but got {factor}')
    if isinstance(factor, str) and factor not in ['linear', 'boolean', 'half-wave']:
        raise ValueError(f"If a string, factor must be one of {'linear', 'boolean', 'half-wave'}, but got '{factor}'")
    
    x = x.squeeze()

    # If x is not sampled at 16KHz, resample
    if sfreq != 16_000:
        logger.warning(f"Resampling audio from {round(sfreq)/1000:g}KHz to 16KHz")
        x, sfreq = resample(x, round(len(x) / sfreq * 16_000)), 16_000

    L_x = x.shape[0]
    L_frm = round(frame_len * 2**4) # frame length (points)
    
    if tc > 0:
        alpha = math.exp(-1 / (tc * 2**4)) # decay factor
        alpha_filt = np.array([1, -alpha])
    else:
        alpha = 0 # this is now just short-term average
    
    # hair cell time constant in ms
    haircell_tc = 0.5
    beta = math.exp(-1 / (haircell_tc * 2**4))
    low_pass_filt = np.array([1, -beta])
    
    # allocate memory for output
    N = math.ceil(L_x / L_frm)
    M = COCHBA.shape[1]
    x = np.pad(x, (0, N*L_frm-L_x))
    indexer = np.arange(L_frm-1, N*L_frm, L_frm)
    output = np.zeros((N, M-1), dtype='float32')
    
    # do last channel first (highest frequency)
    
    p = round(np.real(COCHBA[0, M-1]))
    B = np.real(COCHBA[1:p+2, M-1])
    A = np.imag(COCHBA[1:p+2, M-1])
    
    y = lfilter(B, A, x).squeeze()
    y = _sigmoid(y, factor)
    
    # hair cell membrane (low-pass <= 4kHz), ignored for linear ionic channels
    if factor != 'linear':
        y = lfilter(np.array([1.]), low_pass_filt, y).squeeze()
    
    y_h = y
    
    for ch in range(M-2, -1, -1):
        
        # cochlear filterbank
        p = round(np.real(COCHBA[0, ch]))
        B = np.real(COCHBA[1:p+2, ch])
        A = np.imag(COCHBA[1:p+2, ch])
        y = lfilter(B, A, x).squeeze()
        
        # transduction hair cells
        y = _sigmoid(y, factor)
        
        # hair cell membrane (low-pass <= 4kHz), ignored for linear ionic channels
        if factor != 'linear':
            y = lfilter(np.array([1.]), low_pass_filt, y).squeeze()
        
        # reduction: lateral inhibitory network
        # masked by higher (frequency) spatial response
        y, y_h = y - y_h, y
        
        # half-wave rectifier
        y = np.maximum(y, 0)
        
        # temporal integration
        if alpha != 0:
            y = lfilter(np.array([1.]), alpha_filt, y).squeeze()
            output[:, ch] = y[indexer]
        else:
            if L_frm == 1:
                output[:, ch] = y
            else:
                output[:, ch] = np.mean(np.reshape(y, (L_frm, N)), axis=0)
    
    return output