Source code for polyglotdb.io.helper


import os
import logging
import operator
import hashlib
import wave
from collections import Counter
from praatio import tgio


from polyglotdb.exceptions import DelimiterError, TextGridError

ATT_TYPES = ['orthography', 'transcription', 'numeric',
             'morpheme', 'tobi', 'grouping']

tobi_characters = set('LH%-+!*')
morph_delimiters = set('-=')


def get_n_channels(file_path):
    """
    Get the number of channels in an audio file

    Parameters
    ----------
    file_path : str
        Path to audio file

    Returns
    -------
    int
        Number of channels
    """
    with wave.open(file_path, 'rb') as soundf:
        n_channels = soundf.getnchannels()
    return n_channels
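
# A minimal usage sketch (hypothetical path); returns 1 for a mono file,
# 2 for a stereo file:
# >>> get_n_channels('/corpus/speaker1.wav')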


def normalize_values_for_neo4j(dictionary):
    """
    Sanitizes a dictionary for Neo4j by joining list values with '.' and
    replacing empty or missing values with the string 'NULL'

    Parameters
    ----------
    dictionary : dict
        the dictionary to be sanitized

    Returns
    -------
    dict
        sanitized dictionary
    """
    out = {}
    for k, v in dictionary.items():
        if isinstance(v, list):
            v = '.'.join(map(str, v))
        if not v:
            v = 'NULL'
        out[k] = v
    return out
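
# A minimal usage sketch (hypothetical annotation dictionary): list values are
# joined with '.' and empty values become the string 'NULL'.
# >>> normalize_values_for_neo4j({'label': 'cat', 'transcription': ['k', 'ae', 't'], 'tone': ''})
# {'label': 'cat', 'transcription': 'k.ae.t', 'tone': 'NULL'}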


def guess_type(values, trans_delimiters=None):
    """
    Given a set of values, guesses the value type (numeric, transcription, grouping, tobi, morpheme, orthography)

    Parameters
    ----------
    values : list
        a list of values to guess the type of
    trans_delimiters : list
        List of transcription delimiters, optional
    
    Returns
    -------
    str
        most probable type (highest count)
    """
    if trans_delimiters is None:
        trans_delimiters = ['.', ' ', ';', ',']
    probable_values = {x: 0 for x in ATT_TYPES}
    for v in values:
        try:
            # numeric if the value parses as a float
            float(v)
            probable_values['numeric'] += 1
            continue
        except ValueError:
            for d in trans_delimiters:
                if d in v:
                    probable_values['transcription'] += 1
                    break
            else:
                if v == '':
                    probable_values['grouping'] += 1
                elif set(v).issubset(tobi_characters):
                    probable_values['tobi'] += 1
                elif len(set(v) & morph_delimiters) > 0:
                    probable_values['morpheme'] += 1
                else:
                    probable_values['orthography'] += 1
    if probable_values['orthography'] > 0:
        del probable_values['grouping']
    return max(probable_values.items(), key=operator.itemgetter(1))[0]
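
# A minimal usage sketch (hypothetical label values): dotted strings match the
# default '.' transcription delimiter, so 'transcription' gets the highest count.
# >>> guess_type(['k.ae.t', 'd.ao.g', 's.ih.t'])
# 'transcription'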


def guess_trans_delimiter(values):
    """"
    Given a set of values, guess the transition delimiter
    
    Parameters
    ----------
     values : dict
        a dictionary of the possible values

    Returns
    -------
    str
        the most probable delimiter (highest count)

    """
    trans_delimiters = ['.', ' ', ';', ',']
    probable_values = {x: 0 for x in trans_delimiters}
    for v in values:
        for delim in trans_delimiters:
            if delim in v:
                probable_values[delim] += 1
    return max(probable_values.items(), key=operator.itemgetter(1))[0]
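
# A minimal usage sketch (hypothetical values): '.' occurs in both strings, so
# it wins over the other candidate delimiters (' ', ';', ',').
# >>> guess_trans_delimiter(['k.ae.t', 'd.ao.g'])
# '.'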


def inspect_directory(directory):
    """
    Function to inspect a directory and return the most likely type of
    files within it

    Searches currently for 'textgrid', 'text', 'buckeye', 'timit', and
    'partitur' file types.

    Parameters
    ----------
    directory : str
        Full path to the directory

    Returns
    -------
    str
        Most likely type of files
    dict
        Dictionary of the found files separated by the types searched for
    """
    types = ['textgrid', 'text', 'buckeye', 'timit', 'partitur']
    counter = {x: 0 for x in types}
    relevant_files = {x: [] for x in types}
    for root, subdirs, files in os.walk(directory):
        for f in files:
            ext = os.path.splitext(f)[-1].lower()
            if ext == '.textgrid':
                t = 'textgrid'
            elif ext == '.txt':
                t = 'text'
            elif ext == '.words':
                t = 'buckeye'
            elif ext == '.wrd':
                t = 'timit'
            elif ext == '.par,2':
                t = 'partitur'
            else:
                continue
            counter[t] += 1
            relevant_files[t].append(f)
    max_value = max(counter.values())
    for t in ['textgrid', 'buckeye', 'timit', 'text', 'partitur']:
        if counter[t] == max_value:
            likely_type = t
            break
    return likely_type, relevant_files


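# A minimal usage sketch (hypothetical directory): for a folder of Buckeye
# .words files this would return 'buckeye' plus the per-type file listing.
# >>> likely_type, found_files = inspect_directory('/data/buckeye_corpus')

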
def text_to_lines(path):
    """
    Parse a text file into lines

    Parameters
    ----------
    path : str
        Fully specified path to text file

    Returns
    -------
    list
        Non-empty lines in the text file
    """
    delimiter = None
    with open(path, encoding='utf-8-sig', mode='r') as f:
        text = f.read()
        if delimiter is not None and delimiter not in text:
            e = DelimiterError(
                'The delimiter specified does not create multiple words. Please specify another delimiter.')
            raise e
    lines = [x.strip().split(delimiter) for x in text.splitlines() if x.strip() != '']
    return lines


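# A minimal usage sketch (hypothetical file): with the default delimiter of
# None, each non-empty line is split on whitespace.
# >>> text_to_lines('/tmp/example.txt')   # file contents: "the cat\n\nsat"
# [['the', 'cat'], ['sat']]

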
def most_frequent_value(dictionary):
    """
    Gets the most frequent value in the dictionary

    Parameters
    ----------
    dictionary : dict
        The dictionary to search through

    Returns
    -------
    object
        the most frequent value
    """
    c = Counter(dictionary.values())
    return max(c.keys(), key=lambda x: c[x])


def calculate_lines_per_gloss(lines):
    """
    Calculates the number of lines per gloss

    Parameters
    ----------
    lines : list
        lines in the corpus, as (index, tokens) pairs

    Returns
    -------
    int
        the count of lines per gloss
    """
    line_counts = [len(x[1]) for x in lines]
    equaled = list()
    number = 1
    for i, line in enumerate(line_counts):
        if i == 0:
            equaled.append(False)
        else:
            equaled.append(line == line_counts[i - 1])
    if False not in equaled[1:]:
        # All lines happen to have the same length
        for i in range(2, 6):
            if len(lines) % i == 0:
                number = i
    else:
        false_intervals = list()
        ind = 0
        for i, e in enumerate(equaled):
            if i == 0:
                continue
            if not e:
                false_intervals.append(i - ind)
                ind = i
        false_intervals.append(i + 1 - ind)
        counter = Counter(false_intervals)
        number = max(counter.keys(), key=lambda x: (counter[x], x))
        if number > 10:
            prev_maxes = set([number])
            while number > 10:
                prev_maxes.add(number)
                number = max(x for x in false_intervals if x not in prev_maxes)
    return number


def ilg_text_to_lines(path):
    """
    Converts an ilg file to text lines

    Parameters
    ----------
    path : string
        path to ilg file

    Returns
    -------
    list
        a sanitized list of lines in the file
    """
    delimiter = None
    with open(path, encoding='utf-8-sig', mode='r') as f:
        text = f.read()
        if delimiter is not None and delimiter not in text:
            e = DelimiterError(
                'The delimiter specified does not create multiple words. Please specify another delimiter.')
            raise e
    lines = enumerate(text.splitlines())
    lines = [(x[0], x[1].strip().split(delimiter)) for x in lines if x[1].strip() != '']
    return lines


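# Minimal usage sketches (hypothetical inputs). most_frequent_value counts over
# the dictionary's values, and calculate_lines_per_gloss works on the
# (line_index, tokens) pairs produced by ilg_text_to_lines:
# >>> most_frequent_value({'a': 1, 'b': 2, 'c': 2})
# 2
# >>> calculate_lines_per_gloss([(0, ['a', 'b']), (1, ['x', 'y']), (2, ['c']), (3, ['z'])])
# 2

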
def find_wav_path(path):
    """
    Find a sound file for a given file, by looking for a .wav file with the
    same base name as the given path

    Parameters
    ----------
    path : str
        Full path to an annotation file

    Returns
    -------
    str or None
        Full path of the wav file if it exists or None if it does not
    """
    name, ext = os.path.splitext(path)
    wav_path = name + '.wav'
    if os.path.exists(wav_path):
        return wav_path
    wav_path = name + '.WAV'
    if os.path.exists(wav_path):
        return wav_path
    return None


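# A minimal usage sketch (hypothetical paths): for '/corpus/speaker1.TextGrid'
# this returns '/corpus/speaker1.wav' if it exists, then tries the '.WAV'
# variant, and finally falls back to None.

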
def log_annotation_types(annotation_types):
    """
    Writes annotation types to log

    Parameters
    ----------
    annotation_types : list
        a list of types of annotations in a corpus
    """
    logging.info('Annotation type info')
    logging.info('--------------------')
    logging.info('')
    for a in annotation_types:
        logging.info(a.pretty_print())


def make_type_id(type_values, corpus):
    """
    Construct a type ID from the type values and the corpus name

    Parameters
    ----------
    type_values : list
        list of type values
    corpus : str
        the corpus name

    Returns
    -------
    str
        a hex string for the type ID
    """
    m = hashlib.sha1()
    value = ' '.join(map(str, type_values))
    value += ' ' + corpus
    m.update(value.encode())
    return m.hexdigest()


def guess_textgrid_format(path):
    """
    Given a directory, tries to guess what format the TextGrid files are in

    Parameters
    ----------
    path : str
        the path of the directory containing the TextGrid files

    Returns
    -------
    str or None
        textgrid format, or None if the file is not a TextGrid and the
        directory doesn't contain TextGrid files
    """
    from .inspect import inspect_labbcat, inspect_mfa, inspect_fave, inspect_maus
    if os.path.isdir(path):
        counts = {'mfa': 0, 'labbcat': 0, 'fave': 0, 'maus': 0, None: 0}
        for root, subdirs, files in os.walk(path):
            for f in files:
                if not f.lower().endswith('.textgrid'):
                    continue
                tg_path = os.path.join(root, f)
                try:
                    tg = tgio.openTextgrid(tg_path)
                except ValueError as e:
                    raise TextGridError('The file {} could not be parsed: {}'.format(tg_path, str(e)))
                labbcat_parser = inspect_labbcat(tg_path)
                mfa_parser = inspect_mfa(tg_path)
                fave_parser = inspect_fave(tg_path)
                maus_parser = inspect_maus(tg_path)
                if labbcat_parser._is_valid(tg):
                    counts['labbcat'] += 1
                elif mfa_parser._is_valid(tg):
                    counts['mfa'] += 1
                elif fave_parser._is_valid(tg):
                    counts['fave'] += 1
                elif maus_parser._is_valid(tg):
                    counts['maus'] += 1
                else:
                    counts[None] += 1
        return max(counts.keys(), key=lambda x: counts[x])
    elif path.lower().endswith('.textgrid'):
        try:
            tg = tgio.openTextgrid(path)
        except ValueError as e:
            raise TextGridError('The file {} could not be parsed: {}'.format(path, str(e)))
        labbcat_parser = inspect_labbcat(path)
        mfa_parser = inspect_mfa(path)
        fave_parser = inspect_fave(path)
        maus_parser = inspect_maus(path)
        if labbcat_parser._is_valid(tg):
            return 'labbcat'
        elif mfa_parser._is_valid(tg):
            return 'mfa'
        elif fave_parser._is_valid(tg):
            return 'fave'
        elif maus_parser._is_valid(tg):
            return 'maus'
    return None
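
# A minimal usage sketch (hypothetical values): the type ID is the SHA-1 hex
# digest of the space-joined type values plus the corpus name, so identical
# inputs always yield the same ID.
# >>> make_type_id(['k.ae.t', 'cat'], 'demo') == make_type_id(['k.ae.t', 'cat'], 'demo')
# True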