# Source code for polyglotdb.io.inspect.textgrid

import math
import os

from praatio import textgrid

from polyglotdb.io.helper import guess_trans_delimiter, guess_type
from polyglotdb.io.parsers import TextgridParser
from polyglotdb.io.types.parsing import (
    BreakIndexTier,
    GroupingTier,
    OrthographyTier,
    SegmentTier,
    TextOrthographyTier,
    TobiTier,
    TranscriptionTier,
)
from polyglotdb.structure import Hierarchy


def calculate_probability(x, mean, stdev):
    """
    Evaluates the normal (Gaussian) density with the given mean and
    standard deviation at ``x``. The word/segment helpers in this module
    use it to score how typical an average duration is.

    Parameters
    ----------
    x : float
        duration of the object in question
    mean : float
        mean duration of that type of object
    stdev : float
        standard deviation from mean; 0 raises ZeroDivisionError

    Returns
    -------
    float
        value of the density at ``x``
    """
    squared_deviation = math.pow(x - mean, 2)
    variance = math.pow(stdev, 2)
    # exp(-(x - mean)^2 / (2 * stdev^2)): the bell-shaped part of the density
    bell = math.exp(-(squared_deviation / (2 * variance)))
    # 1 / (sqrt(2 * pi) * stdev): scales the curve so it integrates to 1
    scale = 1 / (math.sqrt(2 * math.pi) * stdev)
    return scale * bell


def word_probability(average_duration):
    """
    Scores how well an average element duration matches a word tier

    Parameters
    ----------
    average_duration : float
        the average duration of elements in the tier

    Returns
    -------
    float
        Gaussian density of ``average_duration`` under the word-duration
        distribution; higher means more word-like
    """
    # Word-duration statistics taken from the Buckeye corpus
    buckeye_word_mean = 0.2465409
    buckeye_word_sd = 0.03175723
    return calculate_probability(average_duration, buckeye_word_mean, buckeye_word_sd)


def segment_probability(average_duration):
    """
    Scores how well an average element duration matches a phone tier

    Parameters
    ----------
    average_duration : float
        the average duration of elements in the tier

    Returns
    -------
    float
        Gaussian density of ``average_duration`` under the phone-duration
        distribution; higher means more phone-like
    """
    # Phone-duration mean taken from the Buckeye corpus
    buckeye_phone_mean = 0.08327773
    # The original author notes the actual value is 0.009260103; the same
    # figure as the word tier is used here instead.
    # NOTE(review): presumably so both densities share one scale - confirm.
    buckeye_phone_sd = 0.03175723
    return calculate_probability(average_duration, buckeye_phone_mean, buckeye_phone_sd)


def uniqueLabels(tier):
    """
    Collects the distinct labels that occur in a tier

    Parameters
    ----------
    tier : IntervalTier or other tier with ``entries``
        the tier to collect labels from

    Returns
    -------
    set
        every label found in the tier, without duplicates
    """
    if not isinstance(tier, textgrid.IntervalTier):
        # Non-interval entries are unpacked as (time, label) pairs
        return {label for _time, label in tier.entries}
    # Interval entries are unpacked as (begin, end, label) triples
    return {label for _begin, _end, label in tier.entries}


def average_duration(tier):
    """
    Gets the average duration of elements in a tier

    For interval tiers this is the mean of ``end - begin`` over all entries.
    For any other tier it is ``tier.maxTime`` divided by the number of
    entries.

    Parameters
    ----------
    tier : IntervalTier or other tier with ``entries``
        the tier to get duration from

    Returns
    -------
    float
        average duration, or 0.0 when the tier has no entries
    """
    entry_count = len(tier.entries)
    if entry_count == 0:
        # Nothing to average: return 0.0 instead of dividing by zero, in line
        # with averageLabelLen, which returns 0 for a tier without labels.
        return 0.0

    if isinstance(tier, textgrid.IntervalTier):
        total = sum(float(end) - float(begin) for begin, end, _ in tier.entries)
        return total / entry_count

    # ``maxTime`` is not derived from the entries; guess_tiers assigns it on
    # the tier object before calling this function.
    return float(tier.maxTime) / entry_count


def averageLabelLen(tier):
    """
    Computes the mean length of the distinct labels in a tier

    Parameters
    ----------
    tier : IntervalTier or other tier with ``entries``
        the tier to collect labels from

    Returns
    -------
    float
        average number of characters per distinct label, or 0 when the
        tier has no labels
    """
    distinct_labels = uniqueLabels(tier)
    if distinct_labels:
        # Each label counts once, however often it occurs in the tier
        return sum(map(len, distinct_labels)) / len(distinct_labels)
    return 0


def figure_linguistic_type(labels):
    """
    Picks the linguistic type name from a list of candidates

    Parameters
    ----------
    labels : list of lists
        candidate ``(name, rank)`` pairs; guess_tiers passes
        ``(tier name, tier index)``

    Returns
    -------
    object or None
        the name of the candidate with the smallest rank (the first one on
        ties), or None when there are no candidates
    """
    if not labels:
        return None
    # A lone candidate is returned without looking at its rank
    chosen = labels[0] if len(labels) == 1 else min(labels, key=lambda entry: entry[1])
    return chosen[0]


def guess_tiers(tg):
    """
    Guesses whether tiers are words or segments

    Each non-empty tier is classified from its average element duration.
    The lowest-indexed tier of each class lends its name to that class.

    Parameters
    ----------
    tg : TextGrid
        the textgrid object

    Returns
    -------
    tier_guesses : dict
        maps each non-empty tier name to the name of its linguistic type
        (empty tiers are left out)
    hierarchy : `~polyglotdb.structure.Hierarchy`
        the hierarchy object, with the segment type (if any) placed under
        the word type
    """
    # First pass: classify every non-empty tier by its average duration.
    initial_guesses = {}
    for position, tier_name in enumerate(tg.tierNames):
        tier = tg.getTier(tier_name)
        if not tier.entries:
            continue
        # average_duration reads ``maxTime`` for non-interval tiers
        tier.maxTime = tg.maxTimestamp
        mean_duration = average_duration(tier)
        if word_probability(mean_duration) > segment_probability(mean_duration):
            kind = "word"
        else:
            kind = "segment"
        initial_guesses[tier.name] = (kind, position)

    # Second pass: split the tiers into the two classes and name each class.
    word_candidates = []
    phone_candidates = []
    for name, (kind, position) in initial_guesses.items():
        if kind == "word":
            word_candidates.append((name, position))
        else:
            phone_candidates.append((name, position))
    word_type = figure_linguistic_type(word_candidates)
    phone_type = figure_linguistic_type(phone_candidates)

    # NOTE(review): the name check below runs after word_type/phone_type
    # were chosen, so a tier whose name contains "word" but whose duration
    # looked segment-like still counted towards phone_type above - confirm
    # this is intended.
    tier_guesses = {}
    for name, (kind, _position) in initial_guesses.items():
        if "word" in name.lower() or kind == "word":
            tier_guesses[name] = word_type
        else:
            tier_guesses[name] = phone_type

    parents = {word_type: None}
    if phone_type is not None:
        parents[phone_type] = word_type
    return tier_guesses, Hierarchy(parents)


def _find_textgrids(path):
    """Return the TextGrid files under ``path``, or ``[path]`` if it is not a directory."""
    if not os.path.isdir(path):
        return [path]
    return [
        os.path.join(root, filename)
        for root, _, files in os.walk(path)
        for filename in files
        if filename.lower().endswith(".textgrid")
    ]


def _add_tier_entries(annotation_type, tier):
    """Feed every entry of ``tier`` into ``annotation_type`` without saving."""
    if isinstance(tier, textgrid.IntervalTier):
        entries = ((text.strip(), begin, end) for (begin, end, text) in tier.entries)
    else:
        entries = ((text.strip(), time) for time, text in tier.entries)
    annotation_type.add(entries, save=False)


def _build_annotation_type(tier, tier_name, tier_guesses, trans_delimiters):
    """Create the annotation type object that matches the guessed content of ``tier``."""
    if tier_name not in tier_guesses:
        # guess_tiers left this tier out (it had no entries). Keep an ignored
        # placeholder so positions in the result still match tier positions.
        placeholder = OrthographyTier("word", "word")
        placeholder.ignored = True
        return placeholder

    linguistic_type = tier_guesses[tier_name]
    # NOTE(review): guess_tiers stores tier names as linguistic types, so this
    # only matches when the segment type is literally named "segment" - confirm.
    if linguistic_type == "segment":
        return SegmentTier(tier_name, linguistic_type)

    is_interval = isinstance(tier, textgrid.IntervalTier)
    labels = uniqueLabels(tier)
    cat = guess_type(labels, trans_delimiters)
    if cat == "transcription":
        annotation_type = TranscriptionTier(tier_name, linguistic_type)
        annotation_type.trans_delimiter = guess_trans_delimiter(labels)
        return annotation_type
    if cat == "numeric":
        if is_interval:
            raise NotImplementedError(
                "Numeric interval tiers are not supported (tier {!r})".format(tier_name)
            )
        return BreakIndexTier(tier_name, linguistic_type)
    if cat == "orthography":
        if is_interval:
            return OrthographyTier(tier_name, linguistic_type)
        return TextOrthographyTier(tier_name, linguistic_type)
    if cat == "tobi":
        return TobiTier(tier_name, linguistic_type)
    if cat == "grouping":
        return GroupingTier(tier_name, linguistic_type)
    raise NotImplementedError(
        "Unsupported tier category {!r} for tier {!r}".format(cat, tier_name)
    )


def inspect_textgrid(path):
    """
    Generate a :class:`~polyglotdb.io.parsers.textgrid.TextgridParser`
    for a specified TextGrid file

    When ``path`` is a directory, every ``.textgrid`` file below it is read.
    Tier types are guessed from the first file only; later files just add
    their entries to the annotation type at the same tier position, so all
    files are assumed to share one tier layout.

    Parameters
    ----------
    path : str
        Full path to TextGrid file, or to a directory of TextGrid files

    Returns
    -------
    :class:`~polyglotdb.io.parsers.textgrid.TextgridParser`
        Autodetected parser for the TextGrid file

    Raises
    ------
    ValueError
        If ``path`` is a directory that contains no TextGrid files
    NotImplementedError
        If a tier's content falls into a category that is not supported
    """
    trans_delimiters = [".", " ", ";", ","]
    textgrids = _find_textgrids(path)
    if not textgrids:
        # Without any file there is nothing to derive tiers or a hierarchy from.
        raise ValueError("No TextGrid files found in {!r}".format(path))

    anno_types = []
    for textgrid_path in textgrids:
        tg = textgrid.openTextgrid(textgrid_path, includeEmptyIntervals=True)
        if not anno_types:
            # No annotation types yet: guess them from this file's tiers.
            tier_guesses, hierarchy = guess_tiers(tg)
            for tier_name in tg.tierNames:
                ti = tg.getTier(tier_name)
                a = _build_annotation_type(ti, tier_name, tier_guesses, trans_delimiters)
                if not a.ignored:
                    _add_tier_entries(a, ti)
                anno_types.append(a)
        else:
            # Later files reuse the annotation types by tier position.
            for i, tier_name in enumerate(tg.tierNames):
                ti = tg.getTier(tier_name)
                if anno_types[i].ignored:
                    continue
                _add_tier_entries(anno_types[i], ti)
    return TextgridParser(anno_types, hierarchy)