Source code for polyglotdb.io.parsers.labbcat

from collections import Counter

from praatio import textgrid

from polyglotdb.io.parsers.aligner import AlignerParser
from polyglotdb.io.parsers.speaker import DirectorySpeakerParser


class LabbCatParser(AlignerParser):
    """
    Parser for TextGrids exported from LaBB-CAT

    Parameters
    ----------
    annotation_tiers : list
        List of the annotation tiers to store data from the TextGrid
    hierarchy : Hierarchy
        Basic hierarchy of the TextGrid
    make_transcription : bool
        Flag for whether to add a transcription property to words based on
        phones they contain
    stop_check : callable
        Function to check for whether parsing should stop
    call_back : callable
        Function to report progress in parsing
    """

    name = "LabbCat"
    word_label = "transcript"
    phone_label = "segment"
    speaker_first = False

    def __init__(
        self,
        annotation_tiers,
        hierarchy,
        make_transcription=True,
        stop_check=None,
        call_back=None,
    ):
        # NOTE(review): ``super(AlignerParser, self)`` starts the MRO lookup
        # *after* AlignerParser, so AlignerParser.__init__ is not executed
        # here. The six-argument call (with the literal ``False``) presumably
        # matches the grandparent's signature -- confirm before simplifying
        # this to ``super().__init__``.
        super(AlignerParser, self).__init__(
            annotation_tiers,
            hierarchy,
            make_transcription,
            False,
            stop_check,
            call_back,
        )
        self.speaker_parser = DirectorySpeakerParser()

    def load_textgrid(self, path):
        """
        Load a TextGrid file.  Additionally ignore duplicated tier names as
        they can sometimes be exported erroneously from LaBB-CAT.

        The file is opened with ``duplicateNamesMode="rename"`` so that
        same-named tiers do not abort loading; of each group of duplicates
        only the tier with the most entries is kept (the earliest one wins
        ties) and it ends up under the original, un-suffixed name.

        Parameters
        ----------
        path : str
            Path to the TextGrid file

        Returns
        -------
        :class:`~praatio.textgrid.Textgrid`
            TextGrid object

        Raises
        ------
        Exception
            Any error raised while reading or de-duplicating the file is
            re-raised unchanged after the offending path has been printed.
        """
        try:
            tg = textgrid.openTextgrid(
                path, includeEmptyIntervals=True, duplicateNamesMode="rename"
            )
            self._drop_duplicate_tiers(tg)
            return tg
        except Exception:
            print(f"There was an issue parsing: {path}")
            raise

    @staticmethod
    def _drop_duplicate_tiers(tg):
        """
        Collapse tiers that praatio renamed on load back to a single tier.

        Works in place on ``tg``.  A tier called ``<base>_<n>`` (``n >= 2``)
        is treated as a duplicate of ``<base>`` when a tier called ``<base>``
        is also present.

        NOTE(review): this relies on praatio's rename mode producing
        ``<name>_<n>`` suffixes -- confirm against the installed praatio
        version.  A file that legitimately contains both ``x`` and ``x_2``
        is indistinguishable from a renamed duplicate and is collapsed too.

        Parameters
        ----------
        tg : :class:`~praatio.textgrid.Textgrid`
            TextGrid object to clean up
        """
        tier_names = tuple(tg.tierNames)
        present = set(tier_names)

        # Group every renamed duplicate under the name it was derived from,
        # preserving file order so that ``max`` below favours earlier tiers.
        groups = {}
        for tier_name in tier_names:
            base, sep, suffix = tier_name.rpartition("_")
            if sep and suffix.isdecimal() and int(suffix) >= 2 and base in present:
                groups.setdefault(base, [base]).append(tier_name)

        for base, members in groups.items():
            # An earlier group may already have removed or renamed a member
            # (e.g. "a", "a_2" and "a_2_2"), so re-check what still exists.
            remaining = set(tg.tierNames)
            members = [m for m in members if m in remaining]
            if len(members) < 2:
                continue

            # Keep the tier holding the most entries; ``max`` returns the
            # first maximal element, so the earliest tier wins a tie.
            keep = max(members, key=lambda m: len(tg.getTier(m).entries))
            for member in members:
                if member != keep:
                    tg.removeTier(member)
            if keep != base and base not in tg.tierNames:
                # Restore the name that downstream tier matching expects.
                tg.renameTier(keep, base)