Source code for polyglotdb.io.parsers.labbcat

from collections import Counter
from .aligner import AlignerParser

from polyglotdb.io.parsers.speaker import DirectorySpeakerParser
from praatio import textgrid


class LabbCatParser(AlignerParser):
    """
    Parser for TextGrids exported from LaBB-CAT

    Parameters
    ----------
    annotation_tiers : list
        List of the annotation tiers to store data from the TextGrid
    hierarchy : Hierarchy
        Basic hierarchy of the TextGrid
    make_transcription : bool
        Flag for whether to add a transcription property to words based on
        phones they contain
    stop_check : callable
        Function to check for whether parsing should stop
    call_back : callable
        Function to report progress in parsing
    """
    # Class-level overrides of values inherited from AlignerParser.
    # NOTE(review): presumably ``word_label`` / ``phone_label`` are the tier
    # labels the base parser searches for -- confirm against AlignerParser.
    name = 'LabbCat'
    word_label = 'transcript'
    phone_label = 'segment'
    speaker_first = False

    def __init__(self, annotation_tiers, hierarchy, make_transcription=True,
                 stop_check=None, call_back=None):
        # NOTE(review): ``super(AlignerParser, self)`` starts the MRO lookup
        # *after* AlignerParser, so AlignerParser.__init__ is bypassed and the
        # next class in the MRO receives these six positional arguments
        # (including the literal ``False``). Left unchanged because the
        # signatures of the parent initialisers are not visible here --
        # confirm this is intentional before "modernising" it to ``super()``.
        super(AlignerParser, self).__init__(annotation_tiers, hierarchy, make_transcription,
                                            False, stop_check, call_back)
        self.speaker_parser = DirectorySpeakerParser()

    def load_textgrid(self, path):
        """
        Load a TextGrid file.

        Additionally ignore duplicated tier names as they can sometimes be
        exported erroneously from LaBB-CAT. When a tier name occurs more than
        once, only its first occurrence is kept; the relative order of the
        remaining names is preserved.

        Parameters
        ----------
        path : str
            Path to the TextGrid file

        Returns
        -------
        :class:`~textgrid.TextGrid`
            TextGrid object, with the de-duplicated tier names stored on its
            ``tiers`` attribute

        Raises
        ------
        Exception
            Any error raised while opening or processing the file is
            re-raised unchanged after the offending path has been printed.
        """
        try:
            tg = textgrid.openTextgrid(path, includeEmptyIntervals=True)

            # Keep the first occurrence of every tier name. This also covers
            # a duplicated *empty* name, which a length-of-name comparison
            # cannot rank (``len('') > 0`` is never true) and would therefore
            # drop entirely unless it happened to be the first tier.
            seen_names = set()
            unique_names = []
            for tier_name in tg.tierNameList:
                if tier_name in seen_names:
                    continue
                seen_names.add(tier_name)
                unique_names.append(tier_name)

            # NOTE(review): this stores tier *names*, not tier objects, on
            # ``tg.tiers`` -- confirm that is what downstream code expects.
            tg.tiers = unique_names
            return tg
        except Exception:
            # Report which file failed, then let the original error propagate.
            print('There was an issue parsing {}:'.format(path))
            raise