from os.path import dirname, join
import math
import numpy as np
from scipy.signal import resample, lfilter
from hdf5storage import loadmat
from naplib import logger
# read in cochlear filter from file
COCHBA = loadmat(
join(dirname(__file__), 'cochba.mat'), variable_names=['COCHBA']
)['COCHBA']
def _sigmoid(x, factor):
'''
Nonlinear sigmoid for cochlear model which simulates hair
cell nonlinearity.
Parameters
----------
x : float
Data to pass through nonlinear function
factor: float or string
Nonlinear factor.
If a number > 0, transistor-like function.
If 'boolean', hard-limiter
If 'half-wave', half-wave rectifier
Else, no operation and x is returned unchanged, i.e. linear
'''
if isinstance(factor, float):
return 1 / (1 + np.exp(-x / factor))
if factor == 'boolean':
return (x > 0).astype('float')
elif factor == 'half-wave':
return np.maximum(x, 0)
else:
return x
[docs]
def auditory_spectrogram(x, sfreq, frame_len=8, tc=4, factor='linear'):
'''
Compute the auditory spectrogram of a signal using a model of the
peripheral auditory system [#1yang]_. The model includes a cochlear
filter bank of 128 constant-Q filters logarithmically-spaced, a hair
cell model which includes a low-pass filter and a nonlinear compression
function, and a lateral inhibitory network over the spectral axis. The
envelope of each frequency band gives the time-frequency representation.
Parameters
----------
x : np.ndarray
Acoustic signal to convert to time-frequency representation
sfreq : int
Sampling rate of the signal. This function is meant to be used on a signal
with 16KHz sampling rate. If sfreq is different, it resamples the audio to 16KHz.
frame_len : float, default=8
Frame length of the output, in ms. Typically 8, 16, or a power of 2.
tc : float, default=4
Time constant for leaky integration. Must be >=0. If 0, then leaky
integration becomes short-term average. Typically 4, 16, or 64 ms.
factor : float or string, default='linear'
Nonlinear factor for hair cell model. If a positive float, specifies the
critical level factor (typically 0.1 for a unit sequence). The smaller the
value, the more the compression. Or, can be one of
{'linear', 'boolean', 'half-wave'} describing the compressor.
Returns
-------
aud : np.ndarray
Auditory spectrogram, shape (n_frames, 128).
Notes
-----
This is a python re-implementation of the Matlab function `wav2aud` in the
`NSLtools toolbox <https://github.com/tel/NSLtools>`_.
For correct performance, x should be a float array.
References
----------
.. [#1yang] X. Yang, K. Wang and S. A. Shamma, "Auditory representations of
acoustic signals," in IEEE Transactions on Information Theory, vol.
38, no. 2, pp. 824-839, March 1992, doi: 10.1109/18.119739.
'''
if frame_len <= 0:
raise ValueError(f'frame_len must be positive but got {frame_len}')
if tc < 0:
raise ValueError(f'time constant (tc) must be nonnegative but got {tc}')
if isinstance(factor, (float, int)) and factor <= 0:
raise ValueError(f'If a float, factor must be positive, but got {factor}')
if isinstance(factor, str) and factor not in ['linear', 'boolean', 'half-wave']:
raise ValueError(f"If a string, factor must be one of {'linear', 'boolean', 'half-wave'}, but got '{factor}'")
x = x.squeeze()
# If x is not sampled at 16KHz, resample
if sfreq != 16_000:
logger.warning(f"Resampling audio from {round(sfreq)/1000:g}KHz to 16KHz")
x, sfreq = resample(x, round(len(x) / sfreq * 16_000)), 16_000
L_x = x.shape[0]
L_frm = round(frame_len * 2**4) # frame length (points)
if tc > 0:
alpha = math.exp(-1 / (tc * 2**4)) # decay factor
alpha_filt = np.array([1, -alpha])
else:
alpha = 0 # this is now just short-term average
# hair cell time constant in ms
haircell_tc = 0.5
beta = math.exp(-1 / (haircell_tc * 2**4))
low_pass_filt = np.array([1, -beta])
# allocate memory for output
N = math.ceil(L_x / L_frm)
M = COCHBA.shape[1]
x = np.pad(x, (0, N*L_frm-L_x))
indexer = np.arange(L_frm-1, N*L_frm, L_frm)
output = np.zeros((N, M-1), dtype='float32')
# do last channel first (highest frequency)
p = round(np.real(COCHBA[0, M-1]))
B = np.real(COCHBA[1:p+2, M-1])
A = np.imag(COCHBA[1:p+2, M-1])
y = lfilter(B, A, x).squeeze()
y = _sigmoid(y, factor)
# hair cell membrane (low-pass <= 4kHz), ignored for linear ionic channels
if factor != 'linear':
y = lfilter(np.array([1.]), low_pass_filt, y).squeeze()
y_h = y
for ch in range(M-2, -1, -1):
# cochlear filterbank
p = round(np.real(COCHBA[0, ch]))
B = np.real(COCHBA[1:p+2, ch])
A = np.imag(COCHBA[1:p+2, ch])
y = lfilter(B, A, x).squeeze()
# transduction hair cells
y = _sigmoid(y, factor)
# hair cell membrane (low-pass <= 4kHz), ignored for linear ionic channels
if factor != 'linear':
y = lfilter(np.array([1.]), low_pass_filt, y).squeeze()
# reduction: lateral inhibitory network
# masked by higher (frequency) spatial response
y, y_h = y - y_h, y
# half-wave rectifier
y = np.maximum(y, 0)
# temporal integration
if alpha != 0:
y = lfilter(np.array([1.]), alpha_filt, y).squeeze()
output[:, ch] = y[indexer]
else:
if L_frm == 1:
output[:, ch] = y
else:
output[:, ch] = np.mean(np.reshape(y, (L_frm, N)), axis=0)
return output