# Source code for polyglotdb.io.parsers.textgrid

import os

from praatio import textgrid
from praatio.utilities.errors import DuplicateTierName

from polyglotdb.exceptions import TextGridError
from polyglotdb.io.helper import find_wav_path
from polyglotdb.io.parsers.base import BaseParser, DiscourseData
from polyglotdb.io.types.parsing import Orthography, Transcription



# [docs]
class TextgridParser(BaseParser):
    """
    Parser for Praat TextGrid files.

    Parameters
    ----------
    annotation_tiers: list
        Annotation types of the files to parse
    hierarchy : :class:`~polyglotdb.structure.Hierarchy`
        Details of how linguistic types relate to one another
    make_transcription : bool, defaults to True
        If true, create a word attribute for transcription based on segments
        that are contained by the word
    make_label : bool, defaults to False
        Accepted for signature compatibility; this parser always passes
        ``make_label=True`` to the base parser regardless of the value given
    stop_check : callable, optional
        Function to check whether to halt parsing
    call_back : callable, optional
        Function to output progress messages
    """

    _extensions = [".textgrid"]

    def __init__(
        self,
        annotation_tiers,
        hierarchy,
        make_transcription=True,
        make_label=False,
        stop_check=None,
        call_back=None,
    ):
        super(TextgridParser, self).__init__(
            annotation_tiers,
            hierarchy,
            # Forward the caller's choice instead of silently forcing True.
            make_transcription=make_transcription,
            # NOTE(review): make_label is deliberately left forced to True to
            # preserve existing default behaviour; the ``make_label`` argument
            # is not consulted. Confirm whether it should be honoured.
            make_label=True,
            stop_check=stop_check,
            call_back=call_back,
        )

    def load_textgrid(self, path):
        """
        Load a TextGrid file

        Parameters
        ----------
        path : str
            Path to the TextGrid file

        Returns
        -------
        :class:`~praatio.textgrid.TextGrid`
            TextGrid object

        Raises
        ------
        TextGridError
            If praatio raises ``AssertionError``, ``ValueError`` or
            ``DuplicateTierName`` while opening the file
        """
        try:
            tg = textgrid.openTextgrid(path, includeEmptyIntervals=True)
        except (AssertionError, ValueError, DuplicateTierName) as e:
            # Chain explicitly so the underlying praatio error stays attached
            # as the cause of the TextGridError.
            raise TextGridError(
                "The file {} could not be parsed: {}".format(path, str(e))
            ) from e
        return tg

    @staticmethod
    def _has_content(annotation):
        """
        Report whether a parsed annotation carries a non-empty value.

        Only :class:`Orthography` (non-empty ``label``) and
        :class:`Transcription` (non-empty ``_list``) objects can count as
        content; any other object is treated as empty.
        """
        if isinstance(annotation, Orthography) and annotation.label != "":
            return True
        if isinstance(annotation, Transcription) and annotation._list != []:
            return True
        return False

    def parse_discourse(self, path, types_only=False):
        """
        Parse a TextGrid file for later importing.

        Parameters
        ----------
        path : str
            Path to TextGrid file
        types_only : bool
            Flag for whether to only save type information, ignoring the token information

        Returns
        -------
        :class:`~polyglotdb.io.discoursedata.DiscourseData` or None
            Parsed data from the file, or ``None`` when no annotation tier
            contains a non-empty orthography or transcription

        Raises
        ------
        TextGridError
            If the file cannot be loaded, or if its number of tiers differs
            from the number of annotation tiers configured on the parser
        """
        tg = self.load_textgrid(path)

        # Tiers are matched to annotation types by position below, so the
        # counts have to agree before anything is parsed.
        if len(tg.tierNames) != len(self.annotation_tiers):
            raise TextGridError(
                "The TextGrid ({}) does not have the same number of interval tiers as the number of annotation types specified.".format(
                    path
                )
            )

        # Discourse name is the file name without directory or extension.
        name = os.path.splitext(os.path.split(path)[1])[0]

        if self.speaker_parser is not None:
            speaker = self.speaker_parser.parse_path(path)
        else:
            speaker = None

        # Start every annotation tier from a clean state for this file.
        for a in self.annotation_tiers:
            a.reset()
            a.speaker = speaker

        # Parse the tiers: the i-th TextGrid tier feeds the i-th annotation tier.
        for i, tier_name in enumerate(tg.tierNames):
            ti = tg.getTier(tier_name)
            if isinstance(ti, textgrid.IntervalTier):
                # Interval entries unpack as (begin, end, text).
                self.annotation_tiers[i].add(
                    ((text.strip(), begin, end) for (begin, end, text) in ti.entries)
                )
            else:
                # Non-interval tiers: entries unpack as (time, text).
                self.annotation_tiers[i].add(((text.strip(), time) for time, text in ti.entries))

        # A TextGrid with no labelled content yields no discourse data; stop
        # scanning as soon as the first non-empty annotation is found.
        has_content = any(
            self._has_content(annotation)
            for tier in self.annotation_tiers
            for annotation in tier
        )
        if not has_content:
            return None

        pg_annotations = self._parse_annotations(types_only)

        data = DiscourseData(name, pg_annotations, self.hierarchy)
        # Release the per-file tier contents now that they have been converted.
        for a in self.annotation_tiers:
            a.reset()
        data.wav_path = find_wav_path(path)
        return data