import math
import os
from praatio import textgrid
from polyglotdb.io.helper import guess_trans_delimiter, guess_type
from polyglotdb.io.parsers import TextgridParser
from polyglotdb.io.types.parsing import (
BreakIndexTier,
GroupingTier,
OrthographyTier,
SegmentTier,
TextOrthographyTier,
TobiTier,
TranscriptionTier,
)
from polyglotdb.structure import Hierarchy
def calculate_probability(x, mean, stdev):
    """
    Evaluates the normal (Gaussian) density with the given mean and
    standard deviation at ``x``

    The tier-guessing code uses this value as a score for how well an
    observed average duration matches a reference duration.

    Parameters
    ----------
    x : float
        duration of the object in question
    mean : float
        mean duration of that type of object
    stdev : float
        standard deviation from mean

    Returns
    -------
    float
        value of the normal density at ``x``
    """
    squared_deviation = math.pow(x - mean, 2)
    twice_variance = 2 * math.pow(stdev, 2)
    decay = math.exp(-(squared_deviation / twice_variance))
    # Normalising constant 1 / (sqrt(2 * pi) * stdev)
    normaliser = 1 / (math.sqrt(2 * math.pi) * stdev)
    return normaliser * decay
def word_probability(average_duration):
    """
    Scores how well an average duration matches a word tier

    Parameters
    ----------
    average_duration : float
        the average duration of elements in the tier

    Returns
    -------
    float
        the probability that the tier is a word tier
    """
    # Reference word duration statistics, taken from the Buckeye corpus
    buckeye_word_mean = 0.2465409
    buckeye_word_sd = 0.03175723
    return calculate_probability(
        average_duration, buckeye_word_mean, buckeye_word_sd
    )
def segment_probability(average_duration):
    """
    Scores how well an average duration matches a phone (segment) tier

    Parameters
    ----------
    average_duration : float
        the average duration of elements in the tier

    Returns
    -------
    float
        the probability that the tier is a phone tier
    """
    # Reference phone duration mean, taken from the Buckeye corpus
    buckeye_phone_mean = 0.08327773
    # Same spread as used for words; the original inline note records the
    # actual phone value as 0.009260103.
    # NOTE(review): presumably widened on purpose so word and phone scores
    # stay comparable -- confirm.
    buckeye_phone_sd = 0.03175723
    return calculate_probability(
        average_duration, buckeye_phone_mean, buckeye_phone_sd
    )
def uniqueLabels(tier):
    """
    Collects the distinct labels that occur in a tier

    Parameters
    ----------
    tier : IntervalTier
        the tier to collect labels from; any tier that is not an
        ``IntervalTier`` is read as a sequence of two-element entries
        whose last element is the label

    Returns
    -------
    set
        the labels of the tier, without duplicates
    """
    if not isinstance(tier, textgrid.IntervalTier):
        # Two-element entries: the label is the second element
        return {label for _, label in tier.entries}
    # Interval entries: (begin, end, label)
    return {label for _, _, label in tier.entries}
def average_duration(tier):
    """
    Gets the average duration of elements in a tier

    For an ``IntervalTier`` this is the mean of ``end - begin`` over all
    entries.  For any other tier it is ``tier.maxTime`` divided by the
    number of entries; ``maxTime`` is not set here, so the caller must
    assign it beforehand (``guess_tiers`` does this).

    Parameters
    ----------
    tier : IntervalTier
        the tier to get duration from

    Returns
    -------
    double
        average duration, or ``0.0`` when the tier has no entries
    """
    entries = tier.entries
    count = len(entries)
    if count == 0:
        # Nothing to average; avoid dividing by zero on an empty tier.
        return 0.0
    if isinstance(tier, textgrid.IntervalTier):
        total = sum(float(end) - float(begin) for (begin, end, _) in entries)
        return total / count
    return float(tier.maxTime) / count
def averageLabelLen(tier):
    """
    Computes the mean length of the distinct labels in a tier

    Parameters
    ----------
    tier : IntervalTier
        the tier to collect labels from

    Returns
    -------
    double
        average label length, or 0 when the tier has no labels
    """
    distinct_labels = uniqueLabels(tier)
    label_count = len(distinct_labels)
    if label_count == 0:
        return 0
    total_length = sum(map(len, distinct_labels))
    return total_length / label_count
def figure_linguistic_type(labels):
    """
    Picks the linguistic type from a group of candidate labels

    Parameters
    ----------
    labels : list of lists
        candidate entries; element 0 of each entry is the value returned
        and element 1 is the key the entries are ranked by

    Returns
    -------
    element 0 of the entry with the smallest element 1 (the first such
    entry on ties), or None when ``labels`` is empty
    """
    count = len(labels)
    if count == 0:
        return None
    if count == 1:
        # A lone candidate wins without its ranking key being consulted
        return labels[0][0]
    candidates = iter(labels)
    best = next(candidates)
    for candidate in candidates:
        # Strict comparison keeps the earliest entry when keys are equal
        if candidate[1] < best[1]:
            best = candidate
    return best[0]
def guess_tiers(tg):
    """
    Guesses whether tiers are words or segments

    Every tier that has at least one entry is classified as "word" or
    "segment" by comparing the word and segment scores of its average
    duration.  Within each class, the tier that comes first in the
    TextGrid supplies the name of the linguistic type.  Tiers without
    entries are left out of the result.

    Parameters
    ----------
    tg : TextGrid
        the textgrid object

    Returns
    -------
    tier_guesses : dict
        maps each non-empty tier name to the linguistic type assigned to it
    hierarchy : `~polyglotdb.structure.Hierarchy`
        the hierarchy object
    """
    # Pass 1: position and average duration of every tier that has entries
    measured = {}
    for position, name in enumerate(tg.tierNames):
        tier = tg.getTier(name)
        if len(tier.entries) == 0:
            continue
        # average_duration reads maxTime for tiers that are not IntervalTiers
        tier.maxTime = tg.maxTimestamp
        measured[tier.name] = (position, average_duration(tier))

    # Pass 2: classify each measured tier by the higher of the two scores
    classified = {}
    word_labels = []
    phone_labels = []
    for name, (position, mean_duration) in measured.items():
        word_score = word_probability(mean_duration)
        phone_score = segment_probability(mean_duration)
        if word_score > phone_score:
            classified[name] = "word"
            word_labels.append((name, position))
        else:
            classified[name] = "segment"
            phone_labels.append((name, position))

    word_type = figure_linguistic_type(word_labels)
    phone_type = figure_linguistic_type(phone_labels)

    # A tier whose name contains "word" is treated as a word tier even if
    # its durations suggested otherwise.
    tier_guesses = {}
    for name, kind in classified.items():
        if "word" in name.lower() or kind == "word":
            tier_guesses[name] = word_type
        else:
            tier_guesses[name] = phone_type

    # The word type is the top level; the phone type, if any, sits under it
    levels = {word_type: None}
    if phone_type is not None:
        levels[phone_type] = word_type
    return tier_guesses, Hierarchy(levels)
[docs]
def inspect_textgrid(path):
    """
    Generate a :class:`~polyglotdb.io.parsers.textgrid.TextgridParser` for a specified TextGrid file

    Tier types are autodetected from the first TextGrid only.  Every
    further TextGrid contributes its entries to the annotation type at the
    same tier position, so all files are expected to share the tier layout
    of the first one.

    Parameters
    ----------
    path : str
        Full path to TextGrid file, or to a directory that is searched
        recursively for files ending in ``.textgrid`` (case-insensitive)

    Returns
    -------
    :class:`~polyglotdb.io.parsers.textgrid.TextgridParser`
        Autodetected parser for the TextGrid file

    Raises
    ------
    FileNotFoundError
        If ``path`` is a directory that contains no TextGrid files
    NotImplementedError
        If a tier is guessed to be of a category that has no matching
        annotation type
    """
    trans_delimiters = [".", " ", ";", ","]
    textgrids = _collect_textgrid_paths(path)
    if not textgrids:
        # Without any file there is no hierarchy to build a parser from.
        raise FileNotFoundError("No TextGrid files found in {}".format(path))

    anno_types = []
    hierarchy = None
    for t in textgrids:
        tg = textgrid.openTextgrid(t, includeEmptyIntervals=True)
        if len(anno_types) == 0:
            # First file: decide which annotation type each tier becomes.
            tier_guesses, hierarchy = guess_tiers(tg)
            for tier_name in tg.tierNames:
                ti = tg.getTier(tier_name)
                a = _build_annotation_type(
                    ti, tier_name, tier_guesses, trans_delimiters
                )
                if not a.ignored:
                    _add_tier_entries(a, ti)
                anno_types.append(a)
        else:
            # Later files: tiers are matched to annotation types by position.
            for i, tier_name in enumerate(tg.tierNames):
                a = anno_types[i]
                if a.ignored:
                    continue
                _add_tier_entries(a, tg.getTier(tier_name))
    return TextgridParser(anno_types, hierarchy)


def _collect_textgrid_paths(path):
    """Return ``[path]`` for a file, or every ``.textgrid`` file below a directory."""
    if not os.path.isdir(path):
        return [path]
    found = []
    for root, _, files in os.walk(path):
        for filename in files:
            if filename.lower().endswith(".textgrid"):
                found.append(os.path.join(root, filename))
    return found


def _build_annotation_type(ti, tier_name, tier_guesses, trans_delimiters):
    """Create the annotation type object that matches one tier's guessed category."""
    if tier_name not in tier_guesses:
        # No guess exists for this tier (guess_tiers skips tiers without
        # entries); an ignored placeholder keeps list positions aligned
        # with tier order.
        placeholder = OrthographyTier("word", "word")
        placeholder.ignored = True
        return placeholder
    if tier_guesses[tier_name] == "segment":
        return SegmentTier(tier_name, tier_guesses[ti.name])

    linguistic_type = tier_guesses[ti.name]
    is_interval = isinstance(ti, textgrid.IntervalTier)
    labels = uniqueLabels(ti)
    cat = guess_type(labels, trans_delimiters)
    if cat == "transcription":
        a = TranscriptionTier(ti.name, linguistic_type)
        a.trans_delimiter = guess_trans_delimiter(labels)
        return a
    if cat == "numeric":
        if is_interval:
            raise NotImplementedError(
                "Numeric interval tiers are not supported (tier {!r})".format(
                    tier_name
                )
            )
        return BreakIndexTier(ti.name, linguistic_type)
    if cat == "orthography":
        if is_interval:
            return OrthographyTier(ti.name, linguistic_type)
        return TextOrthographyTier(ti.name, linguistic_type)
    if cat == "tobi":
        return TobiTier(tier_name, linguistic_type)
    if cat == "grouping":
        return GroupingTier(ti.name, linguistic_type)
    raise NotImplementedError(
        "No annotation type for category {!r} (tier {!r})".format(cat, tier_name)
    )


def _add_tier_entries(annotation_type, ti):
    """Feed a tier's entries, with label whitespace stripped, into an annotation type."""
    if isinstance(ti, textgrid.IntervalTier):
        entries = ((text.strip(), begin, end) for (begin, end, text) in ti.entries)
    else:
        entries = ((text.strip(), time) for time, text in ti.entries)
    annotation_type.add(entries, save=False)