Source code for polyglotdb.io.helper

import hashlib
import logging
import operator
import os
import wave
from collections import Counter

from praatio import textgrid

from polyglotdb.exceptions import DelimiterError, TextGridError

# Annotation type labels that guess_type() tallies and chooses between; the
# order of this list is also the tie-breaking order when counts are equal.
ATT_TYPES = ["orthography", "transcription", "numeric", "morpheme", "tobi", "grouping"]

# guess_type() counts a value as "tobi" when all of its characters are in this set
tobi_characters = set("LH%-+!*")
# guess_type() counts a value as "morpheme" when it contains any of these characters
morph_delimiters = set("-=")


def get_n_channels(file_path):
    """
    Report how many audio channels a wave file contains

    Parameters
    ----------
    file_path : str
        Location of the audio file; it is opened with the standard library
        ``wave`` module in read mode

    Returns
    -------
    int
        Channel count stored in the file header
    """
    with wave.open(file_path, "rb") as wav_reader:
        return wav_reader.getnchannels()


def normalize_values_for_neo4j(dictionary):
    """
    Build a copy of a dictionary whose values are safe to hand to neo4j

    List values are flattened into a single string with their items joined
    by ``"."``.  Any value that is falsy after that step (``None``, ``""``,
    an empty list, but also ``0`` and ``False``) is replaced by the string
    ``'NULL'``.

    Parameters
    ----------
    dictionary : dict
        Mapping to sanitize; it is not modified

    Returns
    -------
    dict
        New dictionary with the same keys and sanitized values
    """

    def _sanitize(value):
        if isinstance(value, list):
            value = ".".join(str(item) for item in value)
        return value if value else "NULL"

    return {key: _sanitize(value) for key, value in dictionary.items()}


def guess_type(values, trans_delimiters=None):
    """
    Guess which annotation type best describes a collection of values

    Every value votes for exactly one type, checked in this order:

    * ``numeric`` if ``float()`` accepts it
    * ``transcription`` if it contains any of the transcription delimiters
    * ``grouping`` if it is the empty string
    * ``tobi`` if all of its characters are in ``tobi_characters``
    * ``morpheme`` if it contains a character from ``morph_delimiters``
    * ``orthography`` otherwise

    Parameters
    ----------
    values : iterable of str
        The values to classify
    trans_delimiters : list
        List of transcription delimiters, optional; defaults to
        ``[".", " ", ";", ","]``

    Returns
    -------
    str
        The type with the most votes; ties go to the type listed first in
        ``ATT_TYPES``
    """
    if trans_delimiters is None:
        trans_delimiters = [".", " ", ";", ","]
    tallies = dict.fromkeys(ATT_TYPES, 0)
    for value in values:
        try:
            float(value)
        except ValueError:
            pass
        else:
            tallies["numeric"] += 1
            continue

        if any(delim in value for delim in trans_delimiters):
            label = "transcription"
        elif value == "":
            label = "grouping"
        elif set(value).issubset(tobi_characters):
            label = "tobi"
        elif set(value) & morph_delimiters:
            label = "morpheme"
        else:
            label = "orthography"
        tallies[label] += 1

    # Once any real text was seen, empty values cannot win the vote
    if tallies["orthography"] > 0:
        del tallies["grouping"]
    return max(tallies.items(), key=operator.itemgetter(1))[0]


def guess_trans_delimiter(values):
    """
    Guess which delimiter separates segments in a collection of values

    The candidates are ``"."``, ``" "``, ``";"`` and ``","``.  Each value
    adds one vote to every candidate it contains.

    Parameters
    ----------
    values : iterable of str
        The values to examine

    Returns
    -------
    str
        The candidate found in the most values; ties (including the case of
        no votes at all) go to the earliest candidate in the order above
    """
    candidates = [".", " ", ";", ","]
    tallies = dict.fromkeys(candidates, 0)
    for value in values:
        present = [delim for delim in candidates if delim in value]
        for delim in present:
            tallies[delim] += 1
    return max(tallies, key=tallies.get)



[docs]
def inspect_directory(directory):
    """
    Walk a directory tree and decide which kind of corpus files dominate it.

    Files are classified by their lower-cased extension: ``.textgrid``
    (textgrid), ``.txt`` (text), ``.words`` (buckeye), ``.wrd`` (timit) and
    ``.par,2`` (partitur).  Files with any other extension are ignored.

    Parameters
    ----------
    directory : str
        Full path to the directory

    Returns
    -------
    str
        The type with the most files.  Ties are resolved in the order
        textgrid, buckeye, timit, text, partitur, so a directory with no
        recognised files yields ``'textgrid'``
    dict
        For each searched type, the list of matching file names (names only,
        as reported by ``os.walk``, not full paths)
    """
    extension_types = {
        ".textgrid": "textgrid",
        ".txt": "text",
        ".words": "buckeye",
        ".wrd": "timit",
        ".par,2": "partitur",
    }
    tally = dict.fromkeys(extension_types.values(), 0)
    found = {kind: [] for kind in extension_types.values()}

    for _root, _subdirs, filenames in os.walk(directory):
        for filename in filenames:
            extension = os.path.splitext(filename)[-1].lower()
            kind = extension_types.get(extension)
            if kind is None:
                continue
            tally[kind] += 1
            found[kind].append(filename)

    best = max(tally.values())
    priority = ["textgrid", "buckeye", "timit", "text", "partitur"]
    # Some type always holds the maximum, so next() cannot run dry
    winner = next(kind for kind in priority if tally[kind] == best)
    return winner, found




[docs]
def text_to_lines(path, delimiter=None):
    """
    Parse a text file into lines.

    The file is read as UTF-8 (a leading byte-order mark is dropped), blank
    lines are discarded, and every remaining line is stripped of surrounding
    whitespace and split into words.

    Parameters
    ----------
    path : str
        Fully specified path to text file
    delimiter : str, optional
        String that separates words on a line.  The default, ``None``,
        splits on runs of whitespace (the behaviour of ``str.split()``).

    Returns
    -------
    list
        Non-empty lines in the text file, each as a list of words

    Raises
    ------
    DelimiterError
        If a delimiter is given but does not occur anywhere in the file
    """
    with open(path, encoding="utf-8-sig", mode="r") as f:
        text = f.read()
    # A delimiter that never occurs cannot split anything into words
    if delimiter is not None and delimiter not in text:
        raise DelimiterError(
            "The delimiter specified does not create multiple words. Please specify another delimiter."
        )
    stripped_lines = (raw.strip() for raw in text.splitlines())
    return [line.split(delimiter) for line in stripped_lines if line]



def most_frequent_value(dictionary):
    """
    Find the value that occurs most often among a dictionary's values

    Parameters
    ----------
    dictionary  : dict
        Mapping whose values are tallied; the values must be hashable

    Returns
    -------
    object
        The most common value.  When several values are equally common the
        one encountered first wins.
    """
    frequencies = Counter(dictionary.values())
    return max(frequencies, key=frequencies.__getitem__)


def calculate_lines_per_gloss(lines):
    """
    Estimate how many consecutive lines make up one gloss

    The estimate is based on the number of words in each line.  Consecutive
    lines with the same word count are grouped into runs:

    * If every line has the same word count, the answer is the largest of
      2, 3, 4, 5 that divides the number of lines evenly, or 1 if none does.
    * Otherwise the answer is the most common run length (larger lengths
      win ties).  Run lengths above 10 are rejected in favour of the largest
      remaining run length.

    Parameters
    ----------
    lines : list
        Lines in the corpus; the second element of every entry is the
        sequence of words on that line

    Returns
    -------
    int
        The estimated number of lines per gloss

    Raises
    ------
    ValueError
        If every run of equally long lines is longer than 10
    """
    widths = [len(entry[1]) for entry in lines]
    # same_as_previous[k] tells whether line k + 1 is as wide as line k
    same_as_previous = [cur == prev for prev, cur in zip(widths, widths[1:])]

    if all(same_as_previous):
        # Word counts give no clue, so fall back on divisibility
        divisors = [size for size in range(2, 6) if len(lines) % size == 0]
        return max(divisors, default=1)

    # Positions where a new run starts, bracketed by the start and end of the text
    starts = [pos for pos, same in enumerate(same_as_previous, start=1) if not same]
    boundaries = [0] + starts + [len(widths)]
    run_lengths = [end - begin for begin, end in zip(boundaries, boundaries[1:])]

    frequency = Counter(run_lengths)
    number = max(frequency, key=lambda length: (frequency[length], length))
    rejected = set()
    while number > 10:
        rejected.add(number)
        number = max(length for length in run_lengths if length not in rejected)
    return number


def ilg_text_to_lines(path, delimiter=None):
    """
    Converts an ilg file to text lines

    The file is read as UTF-8 (a leading byte-order mark is dropped).  Blank
    lines are dropped, but every kept line remembers its zero-based position
    in the original file.

    Parameters
    ----------
    path : string
        path to ilg file
    delimiter : str, optional
        String that separates words on a line.  The default, ``None``,
        splits on runs of whitespace (the behaviour of ``str.split()``).

    Returns
    -------
    list
        ``(line_index, words)`` tuples for the non-empty lines in the file

    Raises
    ------
    DelimiterError
        If a delimiter is given but does not occur anywhere in the file
    """
    with open(path, encoding="utf-8-sig", mode="r") as f:
        text = f.read()
    # A delimiter that never occurs cannot split anything into words
    if delimiter is not None and delimiter not in text:
        raise DelimiterError(
            "The delimiter specified does not create multiple words. Please specify another delimiter."
        )
    numbered = ((index, raw.strip()) for index, raw in enumerate(text.splitlines()))
    return [(index, line.split(delimiter)) for index, line in numbered if line]



[docs]
def find_wav_path(path):
    """
    Locate the sound file that belongs to an annotation file.

    The extension of ``path`` is swapped for ``.wav`` and, failing that,
    ``.WAV``; the first candidate that exists on disk is returned.

    Parameters
    ----------
    path : str
        Full path to an annotation file

    Returns
    -------
    str or None
        Full path of the wav file if it exists or None if it does not
    """
    stem = os.path.splitext(path)[0]
    for suffix in (".wav", ".WAV"):
        candidate = stem + suffix
        if os.path.exists(candidate):
            return candidate
    return None



def log_annotation_types(annotation_types):
    """
    Write a short report about annotation types to the root logger

    A three-line header is logged first, followed by one INFO record per
    annotation type containing the result of its ``pretty_print()`` method.

    Parameters
    ----------
    annotation_types : list
        The annotation types of a corpus; every item must provide a
        ``pretty_print()`` method
    """
    header = ("Annotation type info", "--------------------", "")
    for header_line in header:
        logging.info(header_line)
    for annotation_type in annotation_types:
        logging.info(annotation_type.pretty_print())


def make_type_id(type_values, corpus):
    """
    Derive a stable identifier for a type from its values and its corpus

    The values are converted to strings, joined with single spaces, followed
    by a space and the corpus name; the SHA-1 digest of that text is the ID.

    Parameters
    ----------
    type_values : list
        The values that characterise the type
    corpus : str
        Name of the corpus the type belongs to

    Returns
    -------
    str
        Hexadecimal SHA-1 digest (40 characters)
    """
    joined_values = " ".join(str(item) for item in type_values)
    payload = joined_values + " " + corpus
    return hashlib.sha1(payload.encode()).hexdigest()


def _identify_textgrid_format(tg_path):
    """
    Decide which known format a single TextGrid file is in

    Parameters
    ----------
    tg_path : str
        Path to one TextGrid file

    Returns
    -------
    str or None
        ``'labbcat'``, ``'mfa'``, ``'fave'`` or ``'maus'`` for the first
        parser (checked in that order) that accepts the file, otherwise None

    Raises
    ------
    TextGridError
        If the file cannot be parsed as a TextGrid
    """
    from .inspect import inspect_fave, inspect_labbcat, inspect_maus, inspect_mfa

    try:
        tg = textgrid.openTextgrid(tg_path, includeEmptyIntervals=True)
    except ValueError as e:
        raise TextGridError(
            "The file {} could not be parsed: {}".format(tg_path, str(e))
        ) from e

    # Order matters: the first parser that validates the file wins.  Each
    # parser is built only when its turn comes.
    candidates = (
        ("labbcat", inspect_labbcat),
        ("mfa", inspect_mfa),
        ("fave", inspect_fave),
        ("maus", inspect_maus),
    )
    for format_name, make_parser in candidates:
        if make_parser(tg_path)._is_valid(tg):
            return format_name
    return None


def guess_textgrid_format(path):
    """
    Given a directory or a single TextGrid file, tries to guess what format
    the TextGrid files are in

    For a directory, every ``.textgrid`` file below it (case-insensitive
    extension) is classified individually and the most common result is
    returned; unrecognised files vote for ``None``.  Ties are resolved in
    the order mfa, labbcat, fave, maus, None.

    Parameters
    ----------
    path : str
        the path of the directory containing the TextGrid files, or the path
        of one TextGrid file

    Returns
    -------
    str or None
        textgrid format or None if file is not textgrid and directory doesn't contain TextGrid files

    Raises
    ------
    TextGridError
        If a TextGrid file cannot be parsed
    """
    if os.path.isdir(path):
        counts = {"mfa": 0, "labbcat": 0, "fave": 0, "maus": 0, None: 0}
        inspected_any = False
        for root, _subdirs, files in os.walk(path):
            for f in files:
                if not f.lower().endswith(".textgrid"):
                    continue
                # Every parser, MAUS included, is built from this file's own
                # path rather than from the directory being walked
                tg_path = os.path.join(root, f)
                counts[_identify_textgrid_format(tg_path)] += 1
                inspected_any = True
        if not inspected_any:
            # Without this guard the all-zero tally would report "mfa"
            return None
        return max(counts, key=counts.get)
    if path.lower().endswith(".textgrid"):
        return _identify_textgrid_format(path)
    return None