import os
import logging
import operator
import hashlib
import wave
from collections import Counter
from praatio import tgio
from polyglotdb.exceptions import DelimiterError, TextGridError
# Attribute types that guess_type() can report.  The order is significant:
# guess_type() builds its tally in this order and ties go to the earliest entry.
ATT_TYPES = [
    'orthography',
    'transcription',
    'numeric',
    'morpheme',
    'tobi',
    'grouping',
]

# A non-empty label made up solely of these characters is counted as ToBI.
tobi_characters = {'L', 'H', '%', '-', '+', '!', '*'}

# A label containing any of these characters is counted as a morpheme label.
morph_delimiters = {'-', '='}
def get_n_channels(file_path):
    """
    Report how many channels a WAV file contains.

    Parameters
    ----------
    file_path : str
        Path to the audio file; it is opened with the standard ``wave`` module

    Returns
    -------
    int
        Number of channels declared in the file header
    """
    with wave.open(file_path, 'rb') as wav_file:
        return wav_file.getnchannels()
def normalize_values_for_neo4j(dictionary):
    """
    Build a copy of a dictionary whose values are safe to hand to neo4j.

    List values are flattened into a single string with their items joined
    by ``'.'``.  Afterwards every falsy value (``None``, ``''``, an empty
    list, but also ``0`` and ``False``) is replaced by the string ``'NULL'``.

    Parameters
    ----------
    dictionary : dict
        The dictionary to be sanitized; it is not modified

    Returns
    -------
    dict
        A new dictionary with the same keys and sanitized values
    """
    sanitized = {}
    for key, value in dictionary.items():
        if isinstance(value, list):
            value = '.'.join(str(item) for item in value)
        # Any falsy value, not only missing ones, is written out as 'NULL'.
        sanitized[key] = value if value else 'NULL'
    return sanitized
def guess_type(values, trans_delimiters=None):
    """
    Guess which attribute type a collection of labels most likely holds.

    Every label is classified by the first rule that applies:

    1. it parses as a float -> ``'numeric'``
    2. it contains one of the transcription delimiters -> ``'transcription'``
    3. it is the empty string -> ``'grouping'``
    4. it consists only of ToBI characters -> ``'tobi'``
    5. it contains a morpheme delimiter -> ``'morpheme'``
    6. otherwise -> ``'orthography'``

    Parameters
    ----------
    values : iterable of str
        The labels to inspect
    trans_delimiters : list, optional
        Transcription delimiters to look for; defaults to
        ``['.', ' ', ';', ',']``

    Returns
    -------
    str
        The type with the highest count; ties go to the type listed first
        in ``ATT_TYPES``
    """
    if trans_delimiters is None:
        trans_delimiters = ['.', ' ', ';', ',']

    tallies = dict.fromkeys(ATT_TYPES, 0)
    for label in values:
        try:
            float(label)
        except ValueError:
            pass
        else:
            tallies['numeric'] += 1
            continue

        if any(delim in label for delim in trans_delimiters):
            tallies['transcription'] += 1
        elif label == '':
            tallies['grouping'] += 1
        elif set(label) <= tobi_characters:
            tallies['tobi'] += 1
        elif not set(label).isdisjoint(morph_delimiters):
            tallies['morpheme'] += 1
        else:
            tallies['orthography'] += 1

    # Once any orthographic label was seen, empty labels are no longer
    # allowed to win the vote.
    if tallies['orthography'] > 0:
        del tallies['grouping']
    return max(tallies, key=tallies.get)
def guess_trans_delimiter(values):
    """
    Guess which delimiter separates the segments of a set of transcriptions.

    Parameters
    ----------
    values : iterable of str
        The transcription labels to inspect

    Returns
    -------
    str
        The candidate delimiter (one of ``'.'``, ``' '``, ``';'``, ``','``)
        that occurs in the most labels; ties go to the earlier candidate
    """
    candidates = ['.', ' ', ';', ',']
    # Each label contributes at most one vote per delimiter it contains.
    votes = Counter(
        delim
        for label in values
        for delim in candidates
        if delim in label
    )
    return max(candidates, key=lambda delim: votes[delim])
def inspect_directory(directory):
    """
    Inspect a directory tree and report the most likely type of corpus
    files within it.

    Files are recognised purely by their (case-insensitive) extension:
    ``.textgrid``, ``.txt``, ``.words`` (buckeye), ``.wrd`` (timit) and
    ``.par,2`` (partitur).  All other files are ignored.

    Parameters
    ----------
    directory : str
        Full path to the directory; it is walked recursively

    Returns
    -------
    str
        Most likely type of files.  When several types are equally common
        (including when nothing was found at all) the first of 'textgrid',
        'buckeye', 'timit', 'text', 'partitur' wins.
    dict
        Maps each searched type to the list of matching file names (names
        only, without their directory)
    """
    extension_types = {
        '.textgrid': 'textgrid',
        '.txt': 'text',
        '.words': 'buckeye',
        '.wrd': 'timit',
        '.par,2': 'partitur',
    }
    types = ['textgrid', 'text', 'buckeye', 'timit', 'partitur']
    relevant_files = {x: [] for x in types}
    for _root, _subdirs, files in os.walk(directory):
        for f in files:
            ext = os.path.splitext(f)[-1].lower()
            file_type = extension_types.get(ext)
            if file_type is not None:
                relevant_files[file_type].append(f)

    # max() returns the first maximal item, so this order is the tie-break.
    priority = ['textgrid', 'buckeye', 'timit', 'text', 'partitur']
    likely_type = max(priority, key=lambda t: len(relevant_files[t]))
    return likely_type, relevant_files
def text_to_lines(path, delimiter=None):
    """
    Parse a text file into lines of words.

    Parameters
    ----------
    path : str
        Fully specified path to text file (read as UTF-8, a leading BOM is
        dropped)
    delimiter : str, optional
        String that separates words within a line.  The default ``None``
        splits on any run of whitespace.

    Returns
    -------
    list
        One list of words for every non-empty line in the text file

    Raises
    ------
    DelimiterError
        If a delimiter is given but does not occur anywhere in the file
    """
    with open(path, encoding='utf-8-sig', mode='r') as f:
        text = f.read()
    if delimiter is not None and delimiter not in text:
        raise DelimiterError(
            'The delimiter specified does not create multiple words. Please specify another delimiter.')
    stripped_lines = (line.strip() for line in text.splitlines())
    return [line.split(delimiter) for line in stripped_lines if line != '']
def most_frequent_value(dictionary):
    """
    Find the value that occurs most often in a dictionary.

    Parameters
    ----------
    dictionary : dict
        The dictionary whose values are counted; values must be hashable

    Returns
    -------
    object
        The most frequent value; on a tie, the one encountered first
    """
    frequencies = Counter(dictionary.values())
    return max(frequencies, key=frequencies.get)
def calculate_lines_per_gloss(lines):
    """
    Estimate how many consecutive lines make up one gloss.

    Lines are grouped into runs of neighbours that contain the same number
    of words.  The most common run length (the larger one on a tie) is the
    estimate; run lengths above 10 are discarded in favour of the largest
    remaining one.  If every line has the same number of words, the largest
    of 2 to 5 that evenly divides the number of lines is used, or 1 if none
    does.

    Parameters
    ----------
    lines : list
        Lines in the corpus; the word list of each line is read from
        index 1 of the entry

    Returns
    -------
    int
        The estimated number of lines per gloss
    """
    widths = [len(entry[1]) for entry in lines]
    # same_as_previous[k] is True when line k has as many words as line k - 1;
    # the first line has no predecessor and is always False.
    same_as_previous = [
        pos > 0 and width == widths[pos - 1]
        for pos, width in enumerate(widths)
    ]

    if all(same_as_previous[1:]):
        # All lines happen to have the same length
        divisors = [size for size in range(2, 6) if len(lines) % size == 0]
        return divisors[-1] if divisors else 1

    # Positions where the word count changes mark the start of a new run.
    run_starts = [0]
    run_starts.extend(
        pos for pos, same in enumerate(same_as_previous) if pos > 0 and not same
    )
    run_ends = run_starts[1:] + [len(same_as_previous)]
    run_lengths = [end - start for start, end in zip(run_starts, run_ends)]

    frequency = Counter(run_lengths)
    estimate = max(frequency, key=lambda length: (frequency[length], length))

    # Implausibly long runs are rejected one by one.
    rejected = set()
    while estimate > 10:
        rejected.add(estimate)
        estimate = max(length for length in run_lengths if length not in rejected)
    return estimate
def ilg_text_to_lines(path, delimiter=None):
    """
    Parse an interlinear gloss (ilg) text file into numbered lines of words.

    Parameters
    ----------
    path : str
        Path to ilg file (read as UTF-8, a leading BOM is dropped)
    delimiter : str, optional
        String that separates words within a line.  The default ``None``
        splits on any run of whitespace.

    Returns
    -------
    list
        One ``(line_index, words)`` tuple per non-empty line, where
        ``line_index`` is the zero-based position of the line in the file
        (blank lines are counted but not returned)

    Raises
    ------
    DelimiterError
        If a delimiter is given but does not occur anywhere in the file
    """
    with open(path, encoding='utf-8-sig', mode='r') as f:
        text = f.read()
    if delimiter is not None and delimiter not in text:
        raise DelimiterError(
            'The delimiter specified does not create multiple words. Please specify another delimiter.')
    stripped_lines = ((index, line.strip()) for index, line in enumerate(text.splitlines()))
    return [(index, line.split(delimiter)) for index, line in stripped_lines if line != '']
def find_wav_path(path):
    """
    Find a sound file for a given file, by looking for a .wav file with the
    same base name as the given path.

    The lower-case extension ``.wav`` is tried first, then ``.WAV``.

    Parameters
    ----------
    path : str
        Full path to an annotation file

    Returns
    -------
    str or None
        Full path of the wav file if it exists or None if it does not
    """
    name, _ = os.path.splitext(path)
    for extension in ('.wav', '.WAV'):
        wav_path = name + extension
        if os.path.exists(wav_path):
            return wav_path
    return None
def log_annotation_types(annotation_types):
    """
    Write a short report about the annotation types to the root logger.

    A three-line header is logged first, followed by one INFO record per
    annotation type.

    Parameters
    ----------
    annotation_types : list
        The annotation types of a corpus; each item must provide a
        ``pretty_print()`` method whose result is logged
    """
    header = ('Annotation type info', '--------------------', '')
    for header_line in header:
        logging.info(header_line)
    for annotation_type in annotation_types:
        logging.info(annotation_type.pretty_print())
def make_type_id(type_values, corpus):
    """
    Derive a type ID from the type values and the corpus name.

    The stringified values and the corpus name are joined by single spaces
    and the SHA-1 digest of the encoded result is returned.

    Parameters
    ----------
    type_values : list
        List of type values; each one is converted with ``str``
    corpus : str
        The corpus name

    Returns
    -------
    str
        A hex string for the type ID
    """
    joined_values = ' '.join(str(item) for item in type_values)
    payload = joined_values + ' ' + corpus
    return hashlib.sha1(payload.encode()).hexdigest()
def _detect_single_textgrid_format(tg_path):
    """Return the format name of one TextGrid file, or None if no parser accepts it."""
    # NOTE(review): imported inside the function as in the original code,
    # presumably to avoid a circular import -- confirm before moving it.
    from .inspect import inspect_labbcat, inspect_mfa, inspect_fave, inspect_maus
    try:
        tg = tgio.openTextgrid(tg_path)
    except ValueError as e:
        raise TextGridError('The file {} could not be parsed: {}'.format(tg_path, str(e))) from e
    # All parsers are built from the file itself, then tried in this order;
    # the first one that accepts the TextGrid decides the format.
    candidates = (
        ('labbcat', inspect_labbcat(tg_path)),
        ('mfa', inspect_mfa(tg_path)),
        ('fave', inspect_fave(tg_path)),
        ('maus', inspect_maus(tg_path)),
    )
    for format_name, parser in candidates:
        if parser._is_valid(tg):
            return format_name
    return None


def guess_textgrid_format(path):
    """
    Given a directory or a single TextGrid file, tries to guess what format
    the TextGrid files are in.

    For a directory, every ``.textgrid`` file below it (case-insensitive
    extension) is classified and the most common result is returned; ties
    go to the first of 'mfa', 'labbcat', 'fave', 'maus', None.

    Parameters
    ----------
    path : str
        The path of a TextGrid file or of a directory containing TextGrid
        files

    Returns
    -------
    str or None
        'labbcat', 'mfa', 'fave' or 'maus'; None if no format matches, if
        the file is not a TextGrid, or if the directory doesn't contain any
        TextGrid files

    Raises
    ------
    TextGridError
        If a TextGrid file could not be parsed
    """
    if os.path.isdir(path):
        counts = {'mfa': 0, 'labbcat': 0, 'fave': 0, 'maus': 0, None: 0}
        for root, _subdirs, files in os.walk(path):
            for f in files:
                if not f.lower().endswith('.textgrid'):
                    continue
                counts[_detect_single_textgrid_format(os.path.join(root, f))] += 1
        # With nothing counted, max() would report the first key ('mfa').
        if not any(counts.values()):
            return None
        return max(counts, key=counts.get)
    if path.lower().endswith('.textgrid'):
        return _detect_single_textgrid_format(path)
    return None