# Source code for polyglotdb.io.inspect.textgrid

import math
import os

from praatio import textgrid

from polyglotdb.io.helper import guess_trans_delimiter, guess_type
from polyglotdb.io.parsers import TextgridParser
from polyglotdb.io.types.parsing import (
    BreakIndexTier,
    GroupingTier,
    OrthographyTier,
    SegmentTier,
    TextOrthographyTier,
    TobiTier,
    TranscriptionTier,
)
from polyglotdb.structure import Hierarchy


def calculate_probability(x, mean, stdev):
    """
    Evaluates the normal (Gaussian) density with the given mean and
    standard deviation at ``x``

    The result is a density value, so it can be compared between two
    distributions but is not itself bounded by 1.

    Parameters
    ----------
    x : float
        duration of the object in question
    mean : float
        mean duration of that type of object
    stdev : float
        standard deviation from mean

    Returns
    -------
    float
        value of the density at ``x``
    """
    deviation = x - mean
    spread = 2 * math.pow(stdev, 2)
    # exp(-(x - mean)^2 / (2 * stdev^2))
    bell = math.exp(-(math.pow(deviation, 2) / spread))
    # 1 / (sqrt(2 * pi) * stdev)
    normaliser = math.sqrt(2 * math.pi) * stdev
    return (1 / normaliser) * bell


def word_probability(average_duration):
    """
    Scores how well an average element duration matches a word tier

    Parameters
    ----------
    average_duration : float
        the average duration of elements in the tier

    Returns
    -------
    float
        the value returned by :func:`calculate_probability` for the word
        duration distribution
    """
    # Taken from the Buckeye corpus
    word_mean = 0.2465409
    word_sd = 0.03175723
    score = calculate_probability(average_duration, word_mean, word_sd)
    return score


def segment_probability(average_duration):
    """
    Scores how well an average element duration matches a phone tier

    Parameters
    ----------
    average_duration : float
        the average duration of elements in the tier

    Returns
    -------
    float
        the value returned by :func:`calculate_probability` for the phone
        duration distribution
    """
    # Taken from the Buckeye corpus
    phone_mean = 0.08327773
    # Same spread as the word distribution is used here; the original note
    # records the actual value as 0.009260103
    phone_sd = 0.03175723
    score = calculate_probability(average_duration, phone_mean, phone_sd)
    return score


def uniqueLabels(tier):
    """
    Collects the distinct labels that occur in a tier

    Parameters
    ----------
    tier : IntervalTier
        the tier to collect labels from; any other tier type is read as
        two-element entries whose second element is the label

    Returns
    -------
    set
        the labels of the tier with duplicates removed
    """
    interval_like = isinstance(tier, textgrid.IntervalTier)
    if interval_like:
        # Interval entries unpack as three values; the label is the last one
        return {label for _begin, _end, label in tier.entries}
    # Other entries unpack as two values; the label is the second one
    return {label for _time, label in tier.entries}


def average_duration(tier):
    """
    Gets the average duration of elements in a tier

    For an interval tier this is the mean of ``end - begin`` over all
    entries.  For any other tier the total time of the tier is divided by
    the number of entries.

    Parameters
    ----------
    tier : IntervalTier
        the tier to get duration from

    Returns
    -------
    double
        average duration, or ``0.0`` when the tier has no entries
    """
    entries = tier.entries
    if not entries:
        # Nothing to average; return 0 like averageLabelLen does for a tier
        # without labels instead of dividing by zero.
        return 0.0

    if isinstance(tier, textgrid.IntervalTier):
        total = sum(float(end) - float(begin) for (begin, end, _) in entries)
        return total / len(entries)

    # ``maxTime`` is only present when a caller (guess_tiers) has attached it;
    # otherwise fall back to the tier's own ``maxTimestamp`` attribute.
    max_time = getattr(tier, "maxTime", None)
    if max_time is None:
        max_time = tier.maxTimestamp
    return float(max_time) / len(entries)


def averageLabelLen(tier):
    """
    Computes the mean length of the distinct labels in a tier

    Parameters
    ----------
    tier : IntervalTier
        the tier to collect labels from

    Returns
    -------
    double
        average label length, or 0 when the tier has no labels
    """
    distinct = uniqueLabels(tier)
    count = len(distinct)
    if count == 0:
        return 0
    total_chars = 0
    for label in distinct:
        total_chars += len(label)
    return total_chars / count


def figure_linguistic_type(labels):
    """
    Picks the linguistic type for a group of candidate labels

    Parameters
    ----------
    labels : list of lists
        the labels of a tier; each item holds the name first and the value
        used for ranking second

    Returns
    -------
    object or None
        the first element of the item whose second element is smallest
        (the earliest such item on ties), the first element of the only
        item when there is exactly one, or None when ``labels`` is empty
    """
    count = len(labels)
    if count == 0:
        return None
    if count == 1:
        # A lone candidate wins without its ranking value being consulted
        only = labels[0]
        return only[0]
    winner = min(labels, key=lambda candidate: candidate[1])
    return winner[0]


def guess_tiers(tg):
    """
    Guesses, from average entry duration, which tiers hold words and which
    hold segments

    Tiers without entries are skipped and therefore absent from the result.
    Every inspected tier gets a ``maxTime`` attribute set to the TextGrid's
    ``maxTimestamp`` before its average duration is computed.

    Parameters
    ----------
    tg : TextGrid
        the textgrid object

    Returns
    -------
    tier_guesses : dict
        maps each non-empty tier name to the name chosen as its linguistic
        type (the word type or the phone type)
    hierarchy : `~polyglotdb.structure.Hierarchy`
        the hierarchy object, with the phone type (if any) placed under the
        word type
    """
    # Pass 1: position and average duration of every non-empty tier
    measured = {}
    for position, tier_name in enumerate(tg.tierNames):
        tier = tg.getTier(tier_name)
        if len(tier.entries) == 0:
            continue
        tier.maxTime = tg.maxTimestamp
        measured[tier.name] = (position, average_duration(tier))

    # Pass 2: label each tier with whichever distribution scores higher
    tier_guesses = {}
    for name, (position, mean_duration) in measured.items():
        looks_like_word = word_probability(mean_duration) > segment_probability(
            mean_duration
        )
        tier_guesses[name] = ("word" if looks_like_word else "segment", position)

    # Pass 3: among the tiers of each kind, the one with the lowest position
    # supplies the linguistic type name
    word_labels = []
    phone_labels = []
    for name, (kind, position) in tier_guesses.items():
        if kind == "word":
            word_labels.append((name, position))
        elif kind == "segment":
            phone_labels.append((name, position))
    word_type = figure_linguistic_type(word_labels)
    phone_type = figure_linguistic_type(phone_labels)

    # Pass 4: a tier whose name contains "word" is treated as a word tier
    # regardless of the duration-based guess
    tier_guesses = {
        name: (word_type if ("word" in name.lower() or kind == "word") else phone_type)
        for name, (kind, _position) in tier_guesses.items()
    }

    parents = {word_type: None}
    if phone_type is not None:
        parents[phone_type] = word_type
    return tier_guesses, Hierarchy(parents)



# [docs]
def inspect_textgrid(path):
    """
    Generate a :class:`~polyglotdb.io.parsers.textgrid.TextgridParser` for a specified TextGrid file

    Tier types are autodetected from the first TextGrid that yields any
    tiers.  Every later TextGrid only contributes its entries to the
    annotation type found at the same tier position.

    Parameters
    ----------
    path : str
        Full path to TextGrid file, or to a directory that is searched
        recursively for files whose name ends in ``.textgrid``
        (case-insensitive)

    Returns
    -------
    :class:`~polyglotdb.io.parsers.textgrid.TextgridParser`
        Autodetected parser for the TextGrid file

    Raises
    ------
    ValueError
        If ``path`` is a directory that contains no TextGrid files
    NotImplementedError
        If the labels of an interval tier are categorised as numeric, or a
        tier's labels fall into a category without a matching tier type
    """
    trans_delimiters = [".", " ", ";", ","]
    if os.path.isdir(path):
        textgrids = [
            os.path.join(root, filename)
            for root, _, files in os.walk(path)
            for filename in files
            if filename.lower().endswith(".textgrid")
        ]
    else:
        textgrids = [path]
    if not textgrids:
        # Without at least one file there is no hierarchy to build a parser
        # from; fail with a clear message instead of an unbound local later.
        raise ValueError("No TextGrid files found in {!r}".format(path))

    anno_types = []
    for t in textgrids:
        tg = textgrid.openTextgrid(t, includeEmptyIntervals=True)
        if not anno_types:
            # First TextGrid with tiers: decide the type of every tier
            tier_guesses, hierarchy = guess_tiers(tg)
            for tier_name in tg.tierNames:
                ti = tg.getTier(tier_name)
                a = _guess_annotation_type(ti, tier_guesses, trans_delimiters)
                if not a.ignored:
                    _add_tier_entries(a, ti)
                anno_types.append(a)
        else:
            # Later TextGrids: reuse the types decided above, matched by
            # tier position
            for i, tier_name in enumerate(tg.tierNames):
                if anno_types[i].ignored:
                    continue
                _add_tier_entries(anno_types[i], tg.getTier(tier_name))

    parser = TextgridParser(anno_types, hierarchy)
    return parser


def _guess_annotation_type(ti, tier_guesses, trans_delimiters):
    """Build the annotation type object that matches one tier's guessed type and label category."""
    name = ti.name
    if name not in tier_guesses:
        # guess_tiers leaves out tiers without entries; keep a placeholder so
        # positions in the annotation type list still line up with the tiers
        a = OrthographyTier("word", "word")
        a.ignored = True
        return a

    linguistic_type = tier_guesses[name]
    if linguistic_type == "segment":
        return SegmentTier(name, linguistic_type)

    is_interval = isinstance(ti, textgrid.IntervalTier)
    labels = uniqueLabels(ti)
    cat = guess_type(labels, trans_delimiters)
    if cat == "transcription":
        a = TranscriptionTier(name, linguistic_type)
        a.trans_delimiter = guess_trans_delimiter(labels)
        return a
    if cat == "numeric":
        if is_interval:
            raise NotImplementedError(
                "Numeric labels on interval tier {!r} are not supported".format(name)
            )
        return BreakIndexTier(name, linguistic_type)
    if cat == "orthography":
        if is_interval:
            return OrthographyTier(name, linguistic_type)
        return TextOrthographyTier(name, linguistic_type)
    if cat == "tobi":
        return TobiTier(name, linguistic_type)
    if cat == "grouping":
        return GroupingTier(name, linguistic_type)
    raise NotImplementedError(
        "Tier {!r} has unsupported label category {!r}".format(name, cat)
    )


def _add_tier_entries(annotation_type, ti):
    """Add every entry of a tier, label stripped and placed first, to an annotation type without saving."""
    if isinstance(ti, textgrid.IntervalTier):
        entries = ((text.strip(), begin, end) for (begin, end, text) in ti.entries)
    else:
        entries = ((text.strip(), time) for time, text in ti.entries)
    annotation_type.add(entries, save=False)