Source code for polyglotdb.io.helper

import hashlib
import logging
import operator
import os
import wave
from collections import Counter

from praatio import textgrid

from polyglotdb.exceptions import DelimiterError, TextGridError

# Attribute types that ``guess_type`` can assign to a collection of values.
# The order matters: it is the tie-breaking order when several types are
# equally frequent.
ATT_TYPES = [
    "orthography",
    "transcription",
    "numeric",
    "morpheme",
    "tobi",
    "grouping",
]

# A value made up exclusively of these characters is counted as a ToBI label.
tobi_characters = {"L", "H", "%", "-", "+", "!", "*"}
# A value containing any of these characters is counted as morpheme-segmented.
morph_delimiters = {"-", "="}


def get_n_channels(file_path):
    """
    Report how many channels a WAV file contains

    Parameters
    ----------
    file_path : str
        Path to the audio file; it is opened with the standard ``wave`` module

    Returns
    -------
    int
        Channel count reported by the ``wave`` reader
    """
    with wave.open(file_path, "rb") as wav_reader:
        return wav_reader.getnchannels()


def normalize_values_for_neo4j(dictionary):
    """
    Build a copy of a dictionary whose values are ready to be written for neo4j

    List values are flattened into a single string with their elements joined
    by ``"."``. After that, every falsy value (``None``, ``""``, an empty
    list, but also ``0`` and ``False``) is replaced by the string ``'NULL'``.
    The input dictionary is not modified.

    Parameters
    ----------
    dictionary : dict
        the dictionary to be sanitized

    Returns
    -------
    dict
        new dictionary with the same keys and sanitized values
    """
    sanitized = {}
    for key, value in dictionary.items():
        if isinstance(value, list):
            value = ".".join(str(item) for item in value)
        sanitized[key] = value if value else "NULL"
    return sanitized


def guess_type(values, trans_delimiters=None):
    """
    Guess which attribute type best describes a collection of string values

    Each value votes for exactly one type, checked in this order:

    * ``numeric`` if ``float()`` accepts it
    * ``transcription`` if it contains any transcription delimiter
    * ``grouping`` if it is the empty string
    * ``tobi`` if all of its characters are in ``tobi_characters``
    * ``morpheme`` if it contains a character from ``morph_delimiters``
    * ``orthography`` otherwise

    Parameters
    ----------
    values : iterable of str
        the values to classify
    trans_delimiters : list
        List of transcription delimiters, optional; defaults to
        ``[".", " ", ";", ","]``

    Returns
    -------
    str
        the type with the most votes; ties go to the type listed first in
        ``ATT_TYPES``
    """
    if trans_delimiters is None:
        trans_delimiters = [".", " ", ";", ","]
    tallies = dict.fromkeys(ATT_TYPES, 0)
    for value in values:
        try:
            float(value)
        except ValueError:
            pass
        else:
            tallies["numeric"] += 1
            continue
        if any(delim in value for delim in trans_delimiters):
            kind = "transcription"
        elif value == "":
            kind = "grouping"
        elif set(value) <= tobi_characters:
            kind = "tobi"
        elif set(value) & morph_delimiters:
            kind = "morpheme"
        else:
            kind = "orthography"
        tallies[kind] += 1
    # Empty values only indicate a grouping tier when nothing looked like
    # plain orthography; otherwise they are treated as gaps and ignored.
    if tallies["orthography"] > 0:
        del tallies["grouping"]
    return max(tallies.items(), key=operator.itemgetter(1))[0]


def guess_trans_delimiter(values):
    """
    Guess which transcription delimiter a collection of values uses

    The candidates are ``"."``, ``" "``, ``";"`` and ``","``. Each value adds
    one vote to every candidate it contains.

    Parameters
    ----------
    values : iterable of str
        the values to examine

    Returns
    -------
    str
        the candidate contained in the most values; ties (including the case
        where no candidate occurs at all) go to the earliest candidate in the
        order listed above
    """
    candidates = (".", " ", ";", ",")
    votes = dict.fromkeys(candidates, 0)
    for value in values:
        for candidate in candidates:
            if candidate in value:
                votes[candidate] += 1
    return max(candidates, key=votes.__getitem__)


def inspect_directory(directory):
    """
    Function to inspect a directory and return the most likely type of files within it.

    Searches currently for 'textgrid', 'text', 'buckeye', 'timit', and
    'partitur' file types, recognised by file extension (case-insensitive).
    The directory is walked recursively.

    Parameters
    ----------
    directory : str
        Full path to the directory

    Returns
    -------
    str
        Most likely type of files. When several types are equally frequent
        (including when nothing was found), the first of 'textgrid',
        'buckeye', 'timit', 'text', 'partitur' wins.
    dict
        Dictionary mapping each searched type to the list of matching file
        names (names only, without their directory)
    """
    extension_types = {
        ".textgrid": "textgrid",
        ".txt": "text",
        ".words": "buckeye",
        ".wrd": "timit",
        ".par,2": "partitur",
    }
    types = ["textgrid", "text", "buckeye", "timit", "partitur"]
    relevant_files = {file_type: [] for file_type in types}
    for _root, _subdirs, file_names in os.walk(directory):
        for file_name in file_names:
            extension = os.path.splitext(file_name)[-1].lower()
            file_type = extension_types.get(extension)
            if file_type is not None:
                relevant_files[file_type].append(file_name)
    counts = {file_type: len(relevant_files[file_type]) for file_type in types}
    highest = max(counts.values())
    # Preference order used to break ties between equally frequent types.
    priority = ("textgrid", "buckeye", "timit", "text", "partitur")
    likely_type = next(t for t in priority if counts[t] == highest)
    return likely_type, relevant_files
def text_to_lines(path, delimiter=None):
    """
    Parse a text file into tokenized lines.

    The file is read as UTF-8 (a leading byte-order mark is ignored). Blank
    and whitespace-only lines are skipped; every remaining line is stripped
    and split into tokens.

    Parameters
    ----------
    path : str
        Fully specified path to text file
    delimiter : str, optional
        String on which each line is split. The default, ``None``, splits on
        runs of whitespace.

    Returns
    -------
    list
        One list of tokens for each non-empty line in the text file

    Raises
    ------
    DelimiterError
        If ``delimiter`` is given but never occurs in the file
    """
    with open(path, encoding="utf-8-sig", mode="r") as f:
        text = f.read()
    if delimiter is not None and delimiter not in text:
        raise DelimiterError(
            "The delimiter specified does not create multiple words. Please specify another delimiter."
        )
    stripped_lines = (line.strip() for line in text.splitlines())
    return [line.split(delimiter) for line in stripped_lines if line != ""]
def most_frequent_value(dictionary):
    """
    Gets the most frequent value in the dictionary

    Parameters
    ----------
    dictionary : dict
        The dictionary to search through; its values must be hashable

    Returns
    -------
    object
        the most frequent value
    """
    c = Counter(dictionary.values())
    return max(c.keys(), key=lambda x: c[x])


def calculate_lines_per_gloss(lines):
    """
    Guess how many consecutive lines make up one gloss

    The guess is based on the number of tokens per line. Consecutive lines
    with the same token count are grouped into runs, and the most common run
    length is taken as the number of lines per gloss (the larger length wins
    a tie). A run length above 10 is rejected in favour of the largest
    remaining run length; if none remains, 1 is returned. When every line has
    the same token count, the largest of 2-5 that evenly divides the number of
    lines is used instead, or 1 if none does.

    Parameters
    ----------
    lines : list
        lines in the corpus, as ``(line_number, tokens)`` pairs such as those
        returned by ``ilg_text_to_lines``

    Returns
    -------
    int
        the count of lines per gloss
    """
    line_counts = [len(x[1]) for x in lines]
    # equaled[i] tells whether line i has as many tokens as line i - 1; the
    # first line has no predecessor and is always False.
    equaled = [
        i > 0 and count == line_counts[i - 1] for i, count in enumerate(line_counts)
    ]
    number = 1
    if False not in equaled[1:]:
        # All lines happen to have the same length
        for i in range(2, 6):
            if len(lines) % i == 0:
                number = i
    else:
        # Lengths of the runs of equal-length lines; a run ends wherever a
        # line's token count differs from the previous line's.
        false_intervals = []
        ind = 0
        for i, e in enumerate(equaled):
            if i == 0:
                continue
            if not e:
                false_intervals.append(i - ind)
                ind = i
        false_intervals.append(i + 1 - ind)
        counter = Counter(false_intervals)
        number = max(counter.keys(), key=lambda x: (counter[x], x))
        rejected = set()
        while number > 10:
            rejected.add(number)
            # ``default`` keeps this from raising ValueError when every run
            # length is above 10 and nothing is left to choose from.
            number = max((x for x in false_intervals if x not in rejected), default=1)
    return number


def ilg_text_to_lines(path, delimiter=None):
    """
    Converts an ilg file to text lines

    The file is read as UTF-8 (a leading byte-order mark is ignored). Blank
    and whitespace-only lines are dropped, but line numbers still refer to
    positions in the original file.

    Parameters
    ----------
    path : string
        path to ilg file
    delimiter : str, optional
        String on which each line is split. The default, ``None``, splits on
        runs of whitespace.

    Returns
    -------
    list
        ``(line_number, tokens)`` tuples for the non-empty lines, where
        ``line_number`` is the zero-based index of the line in the file

    Raises
    ------
    DelimiterError
        If ``delimiter`` is given but never occurs in the file
    """
    with open(path, encoding="utf-8-sig", mode="r") as f:
        text = f.read()
    if delimiter is not None and delimiter not in text:
        raise DelimiterError(
            "The delimiter specified does not create multiple words. Please specify another delimiter."
        )
    numbered = ((index, line.strip()) for index, line in enumerate(text.splitlines()))
    return [(index, line.split(delimiter)) for index, line in numbered if line != ""]
def find_wav_path(path):
    """
    Find a sound file for a given file, by looking for a .wav file with the
    same base name as the given path

    The lowercase ``.wav`` extension is tried first, then ``.WAV``.

    Parameters
    ----------
    path : str
        Full path to an annotation file

    Returns
    -------
    str or None
        Full path of the wav file if it exists or None if it does not
    """
    base_name = os.path.splitext(path)[0]
    for extension in (".wav", ".WAV"):
        candidate = base_name + extension
        if os.path.exists(candidate):
            return candidate
    return None
def log_annotation_types(annotation_types):
    """
    Writes annotation types to log

    Parameters
    ----------
    annotation_types : list
        a list of types of annotations in a corpus; each element must provide
        a ``pretty_print()`` method whose result is logged at INFO level
    """
    logging.info("Annotation type info")
    logging.info("--------------------")
    logging.info("")
    for a in annotation_types:
        logging.info(a.pretty_print())


def make_type_id(type_values, corpus):
    """
    Construct a type ID from the type values and the corpus name

    The ID is the SHA-1 digest of the space-joined string forms of the type
    values followed by a space and the corpus name.

    Parameters
    ----------
    type_values : list
        list of type values
    corpus : str
        the corpus

    Returns
    -------
    str
        a hex string for the type ID
    """
    m = hashlib.sha1()
    value = " ".join(map(str, type_values))
    value += " " + corpus
    m.update(value.encode())
    return m.hexdigest()


def _classify_textgrid(tg_path, inspectors):
    """Return the name of the first format whose parser accepts the TextGrid at tg_path, or None."""
    try:
        tg = textgrid.openTextgrid(tg_path, includeEmptyIntervals=True)
    except ValueError as e:
        raise TextGridError(
            "The file {} could not be parsed: {}".format(tg_path, str(e))
        ) from e
    # Build every parser before validating, then test them in priority order.
    parsers = [(name, make_parser(tg_path)) for name, make_parser in inspectors]
    for name, parser in parsers:
        if parser._is_valid(tg):
            return name
    return None


def guess_textgrid_format(path):
    """
    Given a directory or a single TextGrid file, tries to guess what format
    the TextGrid files are in

    Each TextGrid is checked against the LaBB-CAT, MFA, FAVE and MAUS parsers,
    in that order, and assigned the first format that accepts it. For a
    directory (searched recursively) the format assigned to the most files is
    returned; ties go to the first of 'mfa', 'labbcat', 'fave', 'maus', None.

    Parameters
    ----------
    path : str
        the path of the directory containing the TextGrid files, or the path
        of a single TextGrid file

    Returns
    -------
    str or None
        textgrid format, or None if no format matches, if the file is not a
        TextGrid, or if the directory doesn't contain TextGrid files

    Raises
    ------
    TextGridError
        If a TextGrid file cannot be parsed
    """
    from .inspect import inspect_fave, inspect_labbcat, inspect_maus, inspect_mfa

    # Order here is the order in which formats are tried for each file.
    inspectors = (
        ("labbcat", inspect_labbcat),
        ("mfa", inspect_mfa),
        ("fave", inspect_fave),
        ("maus", inspect_maus),
    )
    if os.path.isdir(path):
        counts = {"mfa": 0, "labbcat": 0, "fave": 0, "maus": 0, None: 0}
        for root, _subdirs, files in os.walk(path):
            for f in files:
                if not f.lower().endswith(".textgrid"):
                    continue
                # Every parser is built from the file itself, not the directory.
                tg_path = os.path.join(root, f)
                counts[_classify_textgrid(tg_path, inspectors)] += 1
        if not any(counts.values()):
            # No TextGrid files at all: report None rather than letting the
            # all-zero tie fall through to the first key.
            return None
        return max(counts.keys(), key=lambda x: counts[x])
    if path.lower().endswith(".textgrid"):
        return _classify_textgrid(path, inspectors)
    return None