import os
from praatio.utilities.errors import DuplicateTierName
from praatio import textgrid
from polyglotdb.exceptions import TextGridError
from polyglotdb.io.types.parsing import Orthography, Transcription
from .base import BaseParser, DiscourseData
from ..helper import find_wav_path
# --- Praat TextGrid parser ---
class TextgridParser(BaseParser):
    """
    Parser for Praat TextGrid files.

    Parameters
    ----------
    annotation_tiers : list
        Annotation types of the files to parse
    hierarchy : :class:`~polyglotdb.structure.Hierarchy`
        Details of how linguistic types relate to one another
    make_transcription : bool, defaults to True
        If true, create a word attribute for transcription based on segments
        that are contained by the word
    make_label : bool, defaults to False
        Accepted for signature compatibility only; the base parser is always
        initialised with ``make_label=True`` regardless of this value.
    stop_check : callable, optional
        Function to check whether to halt parsing
    call_back : callable, optional
        Function to output progress messages
    """
    _extensions = ['.textgrid']

    def __init__(self, annotation_tiers, hierarchy, make_transcription=True,
                 make_label=False,
                 stop_check=None, call_back=None):
        # ``make_transcription`` is forwarded as given (it used to be replaced
        # by a hard-coded True, so passing False had no effect).
        # NOTE(review): ``make_label`` is still forced to True. Forwarding the
        # argument would change the effective default from True to False, so
        # the historical behaviour is preserved here -- confirm the intent.
        super(TextgridParser, self).__init__(annotation_tiers, hierarchy,
                                             make_transcription=make_transcription,
                                             make_label=True,
                                             stop_check=stop_check, call_back=call_back)

    def load_textgrid(self, path):
        """
        Load a TextGrid file

        Parameters
        ----------
        path : str
            Path to the TextGrid file

        Returns
        -------
        :class:`~praatio.textgrid.TextGrid`
            TextGrid object

        Raises
        ------
        TextGridError
            If praatio raises ``AssertionError``, ``ValueError`` or
            ``DuplicateTierName`` while reading the file
        """
        try:
            # Empty intervals are kept rather than dropped by praatio.
            return textgrid.openTextgrid(path, includeEmptyIntervals=True)
        except (AssertionError, ValueError, DuplicateTierName) as e:
            # Chain explicitly so the underlying praatio error stays visible.
            raise TextGridError(
                'The file {} could not be parsed: {}'.format(path, str(e))) from e

    @staticmethod
    def _has_content(interval):
        """Return True if a parsed annotation has a non-empty label or transcription."""
        if isinstance(interval, Orthography) and interval.label != "":
            return True
        if isinstance(interval, Transcription) and interval._list != []:
            return True
        return False

    def parse_discourse(self, path, types_only=False):
        """
        Parse a TextGrid file for later importing.

        Parameters
        ----------
        path : str
            Path to TextGrid file
        types_only : bool
            Flag for whether to only save type information, ignoring the token information

        Returns
        -------
        :class:`~polyglotdb.io.discoursedata.DiscourseData` or None
            Parsed data from the file, or None when no tier contains a
            non-empty orthography label or transcription

        Raises
        ------
        TextGridError
            If the file cannot be parsed, or if its number of tiers differs
            from the number of annotation types given to the parser
        """
        tg = self.load_textgrid(path)
        if len(tg.tierNameList) != len(self.annotation_tiers):
            raise TextGridError(
                "The TextGrid ({}) does not have the same number of interval tiers as the number of annotation types specified.".format(
                    path))

        # The discourse is named after the file, without directory or extension.
        name = os.path.splitext(os.path.basename(path))[0]

        speaker = None
        if self.speaker_parser is not None:
            speaker = self.speaker_parser.parse_path(path)

        for a in self.annotation_tiers:
            a.reset()
            a.speaker = speaker

        # Parse the tiers; they are matched to annotation types by position
        # (the length check above guarantees both sequences line up).
        for annotation_tier, tier_name in zip(self.annotation_tiers, tg.tierNameList):
            ti = tg.tierDict[tier_name]
            if isinstance(ti, textgrid.IntervalTier):
                annotation_tier.add(
                    (text.strip(), begin, end) for (begin, end, text) in ti.entryList)
            else:
                # Non-interval tiers carry a single time stamp per entry.
                annotation_tier.add(
                    (text.strip(), time) for time, text in ti.entryList)

        # Stop scanning as soon as any tier yields real content.
        has_content = any(self._has_content(interval)
                          for t in self.annotation_tiers
                          for interval in t)
        if not has_content:
            # Release the parsed entries on this path too, mirroring the
            # reset done after a successful parse below.
            for a in self.annotation_tiers:
                a.reset()
            return None

        pg_annotations = self._parse_annotations(types_only)
        data = DiscourseData(name, pg_annotations, self.hierarchy)
        for a in self.annotation_tiers:
            a.reset()

        # presumably resolves the audio file that accompanies the TextGrid --
        # see find_wav_path for the lookup rules
        data.wav_path = find_wav_path(path)
        return data