Source code for polyglotdb.io.parsers.textgrid

import os

from praatio import textgrid
from praatio.utilities.errors import DuplicateTierName

from polyglotdb.exceptions import TextGridError
from polyglotdb.io.helper import find_wav_path
from polyglotdb.io.parsers.base import BaseParser, DiscourseData
from polyglotdb.io.types.parsing import Orthography, Transcription


class TextgridParser(BaseParser):
    """
    Parser for Praat TextGrid files.

    Parameters
    ----------
    annotation_tiers: list
        Annotation types of the files to parse
    hierarchy : :class:`~polyglotdb.structure.Hierarchy`
        Details of how linguistic types relate to one another
    make_transcription : bool, defaults to True
        If true, create a word attribute for transcription based on segments
        that are contained by the word.  See the note in ``__init__``: the
        value is currently not forwarded to the base parser.
    make_label : bool, defaults to False
        Accepted for signature compatibility.  See the note in ``__init__``:
        the value is currently not forwarded to the base parser.
    stop_check : callable, optional
        Function to check whether to halt parsing
    call_back : callable, optional
        Function to output progress messages
    """

    _extensions = [".textgrid"]

    def __init__(
        self,
        annotation_tiers,
        hierarchy,
        make_transcription=True,
        make_label=False,
        stop_check=None,
        call_back=None,
    ):
        # NOTE(review): ``make_transcription`` and ``make_label`` are accepted
        # but NOT forwarded -- the base class is always initialised with both
        # set to True.  Forwarding them would change the effective default of
        # ``make_label`` (False in this signature, True in practice), so the
        # existing behaviour is preserved here.  Confirm with callers and
        # subclasses before changing it.
        super().__init__(
            annotation_tiers,
            hierarchy,
            make_transcription=True,
            make_label=True,
            stop_check=stop_check,
            call_back=call_back,
        )

    def load_textgrid(self, path):
        """
        Load a TextGrid file

        Parameters
        ----------
        path : str
            Path to the TextGrid file

        Returns
        -------
        :class:`~praatio.textgrid.TextGrid`
            TextGrid object

        Raises
        ------
        TextGridError
            If praatio raises ``AssertionError``, ``ValueError`` or
            ``DuplicateTierName`` while opening the file.  The original
            exception is attached as the cause.
        """
        try:
            tg = textgrid.openTextgrid(path, includeEmptyIntervals=True)
        except (AssertionError, ValueError, DuplicateTierName) as e:
            # Chain explicitly so the underlying praatio failure stays visible
            # in the traceback as the direct cause.
            raise TextGridError(
                "The file {} could not be parsed: {}".format(path, str(e))
            ) from e
        return tg

    def parse_discourse(self, path, types_only=False):
        """
        Parse a TextGrid file for later importing.

        Parameters
        ----------
        path : str
            Path to TextGrid file
        types_only : bool
            Flag for whether to only save type information, ignoring the token information

        Returns
        -------
        :class:`~polyglotdb.io.discoursedata.DiscourseData` or None
            Parsed data from the file, or ``None`` when no tier contains an
            :class:`Orthography` with a non-empty label or a
            :class:`Transcription` with a non-empty segment list.

        Raises
        ------
        TextGridError
            If the file cannot be loaded, or if the number of tiers in the
            TextGrid differs from the number of annotation tiers configured
            on this parser.
        """
        tg = self.load_textgrid(path)

        if len(tg.tierNames) != len(self.annotation_tiers):
            raise TextGridError(
                "The TextGrid ({}) does not have the same number of interval tiers as the number of annotation types specified.".format(
                    path
                )
            )

        # Discourse name is the file name without directory or extension.
        name = os.path.splitext(os.path.split(path)[1])[0]

        if self.speaker_parser is not None:
            speaker = self.speaker_parser.parse_path(path)
        else:
            speaker = None

        for a in self.annotation_tiers:
            a.reset()
            a.speaker = speaker

        # Parse the tiers
        # Tiers are matched to annotation types by position; the length check
        # above guarantees the two sequences line up one-to-one.
        for tier_name, annotation_tier in zip(tg.tierNames, self.annotation_tiers):
            ti = tg.getTier(tier_name)
            if isinstance(ti, textgrid.IntervalTier):
                annotation_tier.add(
                    ((text.strip(), begin, end) for (begin, end, text) in ti.entries)
                )
            else:
                # Non-interval tiers carry (time, text) entries.
                annotation_tier.add(((text.strip(), time) for time, text in ti.entries))

        # A TextGrid counts as empty when nothing in any tier carries content.
        # ``any`` stops at the first item with content instead of scanning the
        # remaining tiers as well.
        has_content = any(
            (isinstance(interval, Orthography) and interval.label != "")
            or (isinstance(interval, Transcription) and interval._list != [])
            for t in self.annotation_tiers
            for interval in t
        )
        if not has_content:
            return None

        pg_annotations = self._parse_annotations(types_only)

        data = DiscourseData(name, pg_annotations, self.hierarchy)
        # Clear per-file state now that the annotations have been extracted.
        for a in self.annotation_tiers:
            a.reset()

        data.wav_path = find_wav_path(path)
        return data