Source code for naplib.features.aligner

import os
from os.path import join, isdir, dirname
import unicodedata
import string
import shutil
import subprocess

import numpy as np
from scipy.io.wavfile import write as write_wavfile
from scipy.io.wavfile import read as read_wavfile
from scipy.signal import resample as scipy_resample

from naplib import logger
from .alignment_extras import create_wrd_dict, get_phoneme_label_vector, get_word_label_vector
from ..utils import _parse_outstruct_args
from ..data import Data
from .prosodylab_aligner import run_aligner



[docs]
class Aligner():
    '''
    This class performs phoneme and word alignment using audio files
    and matching text files containing scripts. If words in the texts do not
    appear in the dict file, you will need to add them to a dict file and specify
    it as ``dictionary_file``.

    Note
    ----
    Several extra packages are required to perform alignment. Please follow the
    installation instructions for ``HTK`` and ``sox`` for your system before
    using alignment. Additionally, you will need to install
    `pyyaml <https://pypi.org/project/PyYAML/>`_ as well as
    `TextGrid <https://pypi.org/project/TextGrid/>`_ for the Aligner to work. These are
    not required dependencies of naplib-python, so they must be installed separately.
    
    Parameters
    ----------
    output_dir : string, path-like
        Directory to put output files in, such as .phn, .wrd, and .TextGrid files.
    dictionary_file : string, path-like, optional
        Path to a dictionary file (e.g. eng.dict) which contains phonemes for
        all words in corpus. If not provided, will use the default eng.dict.
        For an example file, see
        `ProsodyLab's eng.dict <https://github.com/prosodylab/Prosodylab-Aligner/blob/master/eng.dict>`_ 
    tmp_dir : string, path-like, optional
        Directory to hold temporary files that are created. If not provided, creates
        a folder called `data_/` in the current working directory and uses that.
    '''
    def __init__(self, output_dir, dictionary_file=None, tmp_dir=None):

        os.makedirs(output_dir, exist_ok=True)
        if tmp_dir is None:
            tmp_dir = 'data_/'
            # check if this folder already exists, in which case throw an error
            # so we don't overwrite a folder by default.
            if isdir(tmp_dir):
                raise ValueError(f'No tmp_dir was provided, but could not use '
                    'the default "data_/" because a folder with that name '
                    'already exists in the current path. Please remove that '
                    'directory or explicitly specify the tmp_dir parameter.')
        self.output_dir = output_dir
        self.tmp_dir = tmp_dir
        self.filedir_ = dirname(__file__)
        if dictionary_file is None:
            dictionary_file = join(self.filedir_, 'prosodylab_aligner', 'eng.dict')
        self.dictionary_file = dictionary_file

        try:
            import yaml
        except Exception:
            raise Exception('Missing package pyyaml which is required for alignment. Please '
                'install it with "pip install pyyaml"')
        try:
            import textgrid
        except Exception:
            raise Exception('Missing package TextGrid which is required for alignment. Please '
                'install it with "pip install TextGrid"')


    def _remove_nonword_characters_and_punctuation_and_capitalize(self, s):
        exclude = set(string.punctuation)
        exclude.remove("'")
        s = ''.join(ch for ch in s if ch not in exclude)
        # s = s.translate(str.maketrans('', '', string.punctuation))
        s = s.upper()
        return s

    def _convert_text_to_ascii(self, name, root):
        new_name = name.replace('.txt', '.lab')
        new_folder = self.tmp_dir

        unicode_file = open(os.path.join(root, name))
        unicode_data = unicode_file.read() #.decode(input_codec)
        unicode_data = self._remove_nonword_characters_and_punctuation_and_capitalize(unicode_data)
        ascii_data = unicodedata.normalize('NFKD', unicode_data).encode('ascii','ignore')
        ascii_file = open(os.path.join(new_folder, new_name), 'wb')
        ascii_file.write(ascii_data)


[docs]
    def align(self, data=None, name='name', sound='sound',
              soundf='soundf', transcript='transcript',
              dataf='dataf', length='length'):
        '''
        Perform alignment across a set of paired audio-text files stored
        in fields of a Data object. This function will create a set of
        .TextGrid files, as well as corresponding .phn and .wrd
        files in the output_dir which describe the timing of phonemes and
        words within each audio. These files can be used in conjunction
        with the other functions in `naplib.alignment`, such as
        ``get_phoneme_label_vector`` and ``get_word_label_vector``,
        which take these files as input. This function will automatically
        use ``naplib.alignment.get_phoneme_label_vector`` and
        ``naplib.alignment.get_word_label_vector`` to produce phoneme and
        word label vectors for each stimulus which can be placed into the
        Data object and further analyzed.

        This function is essentially equivalent to storing audio and text
        in directories and using ``Aligner.align_files`` followed by
        ``Aligner.get_label_vecs_from_files``.

        Parameters
        ----------
        data : Data instance
            Data object containing the data to align. It must contain the
            following fields.
        name : string or list of strings, default='name'
            If a string, specifies a field of the Data which contains
            the name for each trial. Otherwise, a list of strings specifies
            the name for each trial.
        sound : string or list of np.ndarrays, default='sound'
            If a string, specifies a field of the Data which contains
            the sound waveform for each trial. Otherwise, a list of np.ndarrays
            specifies the waveform for each trial.
        soundf : string, integer, or list of integers, default='soundf'
            If a string, specifies a field of the Data which contains
            the sampling rate for each trial. Otherwise, a list of integers
            specifies the sampling rate for each trial, or a single integer gives the
            sampling rate for all trials.
        transcript : string or list of strings, default='transcript'
            If a string, specifies a field of the Data which contains
            the transcript text for each trial. Otherwise, a list of strings
            specifies the transcript text for each trial.
        dataf : string, integer, or list of integers, default='dataf'
            If a string, specifies a field of the Data which contains
            the desired sampling rate of the output. Otherwise, a list of integers
            specifies the desired sampling rate of the output for each trial, or
            a single integer gives the desired sampling rate of the output
            for all trials.
        length : string or list of integers, default='length'
            If a string, specifies a field of the Data which contains
            the desired output length (in samples) for each trial. Otherwise,
            a list of integers specifies the desired output length (in samples)
            for each trial.

        Returns
        -------
        alignment_data: Data instance
            Data object containing all alignment information, with all the fields
            described by the return values below.
        phn_labels : list of np.ndarrays
            Phoneme label vector for each trial. alignment_data['phn_labels'][i]
            is a np.ndarray of shape (time,) and sampling rate dataf[i].
        manner_labels : list of np.ndarrays
            Manner of articulation label vector for each trial.
            alignment_data['manner_labels'][i]
            is a np.ndarray of shape (time,) and sampling rate dataf[i].
        wrd_labels : list of np.ndarrays
            Word label vector for each trial. alignment_data['wrd_labels'][i]
            is a np.ndarray of shape (time,) and sampling rate dataf[i].
        phn_label_list : list of lists of strings
            Phoneme label list returned by ``naplib.alignment.get_phoneme_label_vector``,
            so alignment_data['phn_label_list'][i] is a list of phonemes, where the
            index of a given phoneme in the list encodes that phoneme's label in ``phn_labels``.
        manner_label_list : list of lists of strings
            Manner of articulation label list returned by ``naplib.alignment.get_phoneme_label_vector``,
            so alignment_data['manner_label_list'][i] is a list of manners, where the
            index of a given manner in the list encodes that manner's label in ``manner_labels``.
        wrd_dict : dict
            Dictionary of word:int (key:value) pairs for all the words in the corpus
            of files in the directory, created by ``naplib.alignment.create_wrd_dict``
            So, alignment_data['wrd_dict'][i] is a dictionary
            which maps a word to its integer value as it is represented in ``wrd_labels``.

        Raises
        ------
        FileExistsError
            If the scratch folders ``tmp_sounds`` or ``tmp_text`` already exist
            inside ``tmp_dir``.

        Note
        ----
        This function will produce the following files in the output_dir to aid in
        its running.

        | working directory
        | └── output_dir
        | │   └── trial1.phn
        | │   └── trial1.wrd
        | │   └── trial1.TextGrid
        | │   └── trial2.phn
        | │   └── trial2.wrd
        | │   └── trial2.TextGrid
        '''

        names, sounds, soundf, transcripts, dataf, lengths = _parse_outstruct_args(
            data, name, sound, soundf, transcript, dataf, length,
            allow_strings_without_outstruct=False)

        # Scratch folders for the .wav and .txt files handed to align_files.
        # exist_ok=False so that pre-existing folders are never written into
        # (and later deleted) by accident.
        audio_dir = join(self.tmp_dir, 'tmp_sounds')
        text_dir = join(self.tmp_dir, 'tmp_text')
        os.makedirs(audio_dir, exist_ok=False)
        try:
            os.makedirs(text_dir, exist_ok=False)
        except OSError:
            # Don't leave a half-created scratch area behind.
            shutil.rmtree(audio_dir, ignore_errors=True)
            raise

        try:
            # Write each trial's waveform and transcript to disk.
            for trial_name, soundwave, soundf_, script in zip(names, sounds, soundf, transcripts):
                write_wavfile(join(audio_dir, f'{trial_name}.wav'), int(soundf_), soundwave)
                with open(join(text_dir, f'{trial_name}.txt'), "w") as text_file:
                    text_file.write(script)

            # Align text and audio from files
            self.align_files(audio_dir, text_dir, names=names)
        finally:
            # Always remove the scratch folders, even when alignment fails;
            # otherwise the next call would hit FileExistsError above.
            shutil.rmtree(audio_dir, ignore_errors=True)
            shutil.rmtree(text_dir, ignore_errors=True)

        # Get the label vectors from the alignment files. The before/after
        # period is zero because the audio written above is the full trial.
        return self.get_label_vecs_from_files(data=data, name=names,
                                  dataf=dataf, length=lengths,
                                  befaft=np.array([0, 0]))




[docs]
    def align_files(self, audio_dir, text_dir, names=None):
        '''
        Perform alignment across a set of paired audio-text files stored
        in directories. This function will create a set of .TextGrid files,
        as well as corresponding .phn and .wrd
        files in the output_dir which describe the timing of phonemes and
        words within each audio. These files can be used in conjunction
        with the other functions in `naplib.alignment`, such as
        ``get_phoneme_label_vector`` and ``get_word_label_vector``,
        which take these files as input.

        Parameters
        ----------
        audio_dir : string, path-like
            Directory containing audio files (.wav).
        text_dir : string, path-like
            Directory containing text files (.txt) with matching names
            to the files in ``audio_dir``.
        names : list of strings, optional
            List of names (without file-type) which specify a subset of files within
            the audio_dir and text_dir to process.

        Raises
        ------
        TypeError
            If ``names`` is neither None nor a list.

        Note
        ----
        The directory structure containing audios and matching text
        files must be correct in order to properly perform alignment.
        See below for what the directory layout should look like
        before running this function.

        | working directory
        | ├── audio_dir
        | │   ├── file1.wav
        | │   ├── file2.wav
        | └── text_dir
        | │   └── file1.txt
        | │   └── file2.txt

        After running this function, the directory layout will look
        like this:

        | working directory
        | ├── audio_dir
        | │   ├── file1.wav
        | │   ├── file2.wav
        | └── text_dir
        | │   └── file1.txt
        | │   └── file2.txt
        | └── output_dir
        | │   └── file1.phn
        | │   └── file1.wrd
        | │   └── file1.TextGrid
        | │   └── file2.phn
        | │   └── file2.wrd
        | │   └── file2.TextGrid
        '''
        import shlex
        import textgrid

        if names is not None and not isinstance(names, list):
            raise TypeError(f'names argument must be a list, or None, but got {type(names)}')

        # Resampled audio and .lab transcripts are written straight into
        # tmp_dir below, so it has to exist before anything else happens.
        os.makedirs(self.tmp_dir, exist_ok=True)

        logger.info(f'Resampling audio and putting in {self.tmp_dir} directory...')

        resample_path = join(self.filedir_, 'resample.sh')

        # resample the audios to 16000 and put them in the tmp data folder
        # if sox is installed use that, otherwise use scipy
        try:
            # Probe for a working sox by running it on a bundled wav file.
            # NOTE(review): input and output are the same path here -- confirm
            # this is intentional.
            wavefilepath_ = join(self.filedir_, 'test.wav')
            subprocess.run(['sox', wavefilepath_, wavefilepath_], check=True, capture_output=True)
            # Quote every path so directories containing spaces or shell
            # metacharacters do not break (or hijack) the command line.
            resample_cmd = ' '.join([
                shlex.quote(str(resample_path)),
                '-s', '16000',
                '-r', shlex.quote(str(audio_dir)),
                '-w', shlex.quote(str(self.tmp_dir)),
            ])
            status = os.system(resample_cmd)
            if status != 0:
                logger.warning(f'Resampling script exited with non-zero status {status}')
        except (OSError, subprocess.SubprocessError):
            logger.warning('Could not find sox. Using scipy to resample and save .wav files instead')
            # don't have sox, so use scipy instead
            wavfiles = [fname_ for fname_ in os.listdir(audio_dir) if fname_.endswith(".wav")]
            for wavfile_ in wavfiles:
                old_fs, wavdata = read_wavfile(join(audio_dir, wavfile_))
                if old_fs != 16000:
                    resampled = scipy_resample(wavdata, int(len(wavdata) * 16000. / old_fs))
                    # scipy returns floating point samples; convert back to the
                    # input sample format so integer PCM stays integer PCM
                    # instead of being written as an out-of-range float WAV.
                    if np.issubdtype(wavdata.dtype, np.integer):
                        limits = np.iinfo(wavdata.dtype)
                        resampled = np.clip(np.rint(resampled), limits.min, limits.max)
                    wavdata = resampled.astype(wavdata.dtype)
                write_wavfile(join(self.tmp_dir, wavfile_), 16000, wavdata)

        logger.info(f'Converting text files to ascii in {self.tmp_dir} directory...')

        for root, _, files in os.walk(text_dir, topdown=False):
            for name in files:
                if name.endswith('.txt'):
                    self._convert_text_to_ascii(name, root)

        logger.info('Performing alignment...')

        # perform alignment using ProsodyLab-Aligner
        eng_zip_file = join(self.filedir_, 'prosodylab_aligner', 'eng.zip')
        run_aligner(align=self.tmp_dir, dictionary=[self.dictionary_file], read=eng_zip_file)

        logger.info(f'Converting .TextGrid files to .phn and .wrd in {self.output_dir}')

        # Convert textgrid files to .phn and .wrd files in output_dir
        textgrid_suffix = '.TextGrid'
        wanted = None if names is None else set(names)
        for root, _, files in os.walk(self.tmp_dir, topdown=False):
            for name in files:
                if not name.endswith(textgrid_suffix):
                    continue

                stem = name[:-len(textgrid_suffix)]
                if wanted is not None and stem not in wanted:
                    continue

                # copy TextGrid file to output_dir so they are saved
                try:
                    shutil.copy(join(root, name), join(self.output_dir, name))
                except shutil.SameFileError:
                    # The file already lives in output_dir (e.g. output_dir is
                    # tmp_dir or nested inside it); nothing to copy.
                    pass

                tg = textgrid.TextGrid.fromFile(join(root, name))
                # The first tier is treated as phones and the second as words.
                phones = tg[0]
                words = tg[1]

                # write phn file: empty marks are relabelled "sp" and
                # "sil" segments are skipped
                with open(os.path.join(self.output_dir, stem + '.phn'), 'w') as phn_file:
                    for phone_seg in phones:
                        if phone_seg.mark == "":
                            phone_seg.mark = "sp"
                        if phone_seg.mark != "sil":
                            print(f"{phone_seg.minTime} {phone_seg.maxTime} {phone_seg.mark}", file=phn_file)

                # write wrd file: "sil" segments are skipped
                with open(os.path.join(self.output_dir, stem + '.wrd'), 'w') as wrd_file:
                    for word_seg in words:
                        if word_seg.mark != "sil":
                            print(f"{word_seg.minTime} {word_seg.maxTime} {word_seg.mark}", file=wrd_file)

        # Logged once, after every directory has been processed.
        logger.info('Finished creating alignment files.')



[docs]
    def get_label_vecs_from_files(self, data=None, name='name',
                                  dataf='dataf', length='length',
                                  befaft='befaft'):
        '''
        Build phoneme, manner of articulation, and word label vectors for
        each trial from the .phn and .wrd files that already exist in the
        aligner's output_dir.

        Parameters
        ----------
        data : Data instance
            Data object containing the data to align. It must contain the
            following fields.
        name : string or list of strings, default='name'
            If a string, specifies a field of the Data which contains
            the name for each trial. Otherwise, a list of strings specifies
            the name for each trial.
        dataf : string, integer, or list of integers, default='dataf'
            If a string, specifies a field of the Data which contains
            the desired sampling rate of the output. Otherwise, a list of integers
            specifies the desired sampling rate of the output for each trial, or
            a single integer gives the desired sampling rate of the output
            for all trials.
        length : string or list of integers, default='length'
            If a string, specifies a field of the Data which contains
            the desired output length (in samples) for each trial. Otherwise,
            a list of integers specifies the desired output length (in samples)
            for each trial.
        befaft : string or list of np.ndarrays, or a single np.ndarray, default='befaft'
            If a string, specifies a field of the Data which contains
            the before and after time (in sec) for each trial. Otherwise,
            a list should contain the befaft period for each trial, and a single
            np.ndarray of length 2 specifies the befaft period for all trials. For
            example, befaft=np.array([0.5, 0.5]) indicates that for each trial, the
            wav file which was used to produce the alignment is 0.5 seconds shorter
            at the beginning and 0.5 seconds shorter at the end than the desired
            output length.

        Returns
        -------
        alignment_data: Data instance
            Data object containing all alignment information, with all the fields
            described by the return values below.
        phn_labels : list of np.ndarrays
            Phoneme label vector for each trial. alignment_data['phn_labels'][i]
            is a np.ndarray of shape (time,) and sampling rate dataf[i].
        manner_labels : list of np.ndarrays
            Manner of articulation label vector for each trial.
            alignment_data['manner_labels'][i]
            is a np.ndarray of shape (time,) and sampling rate dataf[i].
        wrd_labels : list of np.ndarrays
            Word label vector for each trial. alignment_data['wrd_labels'][i]
            is a np.ndarray of shape (time,) and sampling rate dataf[i].
        phn_label_list : list of lists of strings
            Phoneme label list returned by ``naplib.alignment.get_phoneme_label_vector``,
            so alignment_data['phn_label_list'][i] is a list of phonemes, where the
            index of a given phoneme in the list encodes that phoneme's label in ``phn_labels``.
        manner_label_list : list of lists of strings
            Manner of articulation label list returned by ``naplib.alignment.get_phoneme_label_vector``,
            so alignment_data['manner_label_list'][i] is a list of manners, where the
            index of a given manner in the list encodes that manner's label in ``manner_labels``.
        wrd_dict : dict
            Dictionary of word:int (key:value) pairs for all the words in the corpus
            of files in the directory, created by ``naplib.alignment.create_wrd_dict``
            So, alignment_data['wrd_dict'][i] is a dictionary
            which maps a word to its integer value as it is represented in ``wrd_labels``.

        Raises
        ------
        TypeError
            If any entry of ``length`` is not an integer (Python ``int`` or
            NumPy integer scalar).

        Note
        ----
        This function requires that the following files ALREADY exist in the aligner's
        output_dir.

        | working directory
        | └── output_dir
        | │   └── trial1.phn
        | │   └── trial1.wrd
        | │   └── trial1.TextGrid
        | │   └── trial2.phn
        | │   └── trial2.wrd
        | │   └── trial2.TextGrid
        '''
        names, dataf, lengths, befafts = _parse_outstruct_args(data, name, dataf, length, befaft, allow_strings_without_outstruct=False)

        # Accept NumPy integer scalars as well as Python ints: lengths that
        # come out of arrays are np.integer, not int.
        for len_ in lengths:
            if not isinstance(len_, (int, np.integer)):
                raise TypeError(f'Each length must be an integer but found {type(len_)}')

        # One word->int mapping built from output_dir and shared by all trials.
        wrd_dict = create_wrd_dict(self.output_dir)

        alignment_results = []

        logger.info('Creating label vectors for phonemes, manner of articulation, and words.')

        for n, trial_name in enumerate(names):

            # filenames for the .phn and .wrd files
            filename_phn = join(self.output_dir, f'{trial_name}.phn')
            filename_wrd = join(self.output_dir, f'{trial_name}.wrd')

            # desired length of the output label vector
            trial_length = lengths[n]

            # sampling rate of our data
            fs = dataf[n]

            # before-after period for this trial, as supplied by the caller
            trial_befaft = befafts[n]

            # compute label vectors for phonemes, manner of articulation, and words, for this trial
            label_vec_phn, phn_label_list = get_phoneme_label_vector(
                filename_phn, trial_length, fs, trial_befaft, return_label_lists=True)
            label_vec_manner, manner_label_list = get_phoneme_label_vector(
                filename_phn, trial_length, fs, trial_befaft, mode='manner', return_label_lists=True)
            label_vec_wrd = get_word_label_vector(
                filename_wrd, trial_length, fs, trial_befaft, wrd_dict=wrd_dict)

            alignment_results.append({
                'phn_labels': label_vec_phn,
                'manner_labels': label_vec_manner,
                'wrd_labels': label_vec_wrd,
                'phn_label_list': phn_label_list,
                'manner_label_list': manner_label_list,
                'wrd_dict': wrd_dict,
            })

        # Package the per-trial results into a Data object
        return Data(alignment_results, strict=False)