import hashlib
import logging
import operator
import os
import wave
from collections import Counter
from praatio import textgrid
from polyglotdb.exceptions import DelimiterError, TextGridError
# Annotation type labels that guess_type() tallies and chooses between.
# The list order matters: guess_type() resolves ties in favour of the label
# that appears first here.
ATT_TYPES = ["orthography", "transcription", "numeric", "morpheme", "tobi", "grouping"]
# A value made up solely of these characters is counted as a "tobi" label
# by guess_type().
tobi_characters = set("LH%-+!*")
# A value containing any of these characters is counted as a "morpheme"
# label by guess_type().
morph_delimiters = set("-=")
def get_n_channels(file_path):
    """
    Report how many audio channels a WAV file contains.

    Parameters
    ----------
    file_path : str
        Path to the audio file; it is opened with the standard ``wave`` module

    Returns
    -------
    int
        Number of channels in the file
    """
    with wave.open(file_path, "rb") as wav_file:
        return wav_file.getnchannels()
def normalize_values_for_neo4j(dictionary):
    """
    Build a copy of a dictionary whose values are safe to hand to neo4j.

    List values are flattened into a single string by joining the ``str()``
    of each element with ``"."``.  Afterwards every falsy value (``None``,
    ``""``, an empty list, but also ``0`` and ``False``) is replaced by the
    string ``'NULL'``.  The input dictionary is not modified.

    Parameters
    ----------
    dictionary : dict
        The dictionary to be sanitized

    Returns
    -------
    dict
        New dictionary with the same keys and sanitized values
    """

    def _sanitize(value):
        if isinstance(value, list):
            value = ".".join(str(item) for item in value)
        return value if value else "NULL"

    return {key: _sanitize(value) for key, value in dictionary.items()}
def guess_type(values, trans_delimiters=None):
    """
    Guess which annotation type a collection of string values represents.

    Each value votes for exactly one type, checked in this order:

    1. ``numeric`` if ``float(value)`` succeeds
    2. ``transcription`` if it contains any transcription delimiter
    3. ``grouping`` if it is the empty string
    4. ``tobi`` if it consists only of ToBI characters
    5. ``morpheme`` if it contains a morpheme delimiter
    6. ``orthography`` otherwise

    Parameters
    ----------
    values : iterable of str
        The values to inspect
    trans_delimiters : list
        List of transcription delimiters, optional; defaults to
        ``[".", " ", ";", ","]``

    Returns
    -------
    str
        The type with the most votes; ties go to the type listed first in
        ``ATT_TYPES``
    """
    if trans_delimiters is None:
        trans_delimiters = [".", " ", ";", ","]
    votes = dict.fromkeys(ATT_TYPES, 0)
    for value in values:
        try:
            float(value)
        except ValueError:
            pass
        else:
            votes["numeric"] += 1
            continue
        if any(delim in value for delim in trans_delimiters):
            label = "transcription"
        elif value == "":
            label = "grouping"
        elif set(value).issubset(tobi_characters):
            label = "tobi"
        elif set(value) & morph_delimiters:
            label = "morpheme"
        else:
            label = "orthography"
        votes[label] += 1
    # Empty labels are expected between real words, so as soon as any
    # orthographic value was seen "grouping" is no longer a candidate.
    if votes["orthography"] > 0:
        del votes["grouping"]
    return max(votes, key=votes.get)
def guess_trans_delimiter(values):
    """
    Guess which character delimits segments in a set of transcriptions.

    The candidates are ``"."``, ``" "``, ``";"`` and ``","``.  Each candidate
    scores one point for every value that contains it at least once.

    Parameters
    ----------
    values : iterable of str
        The transcription values to inspect

    Returns
    -------
    str
        The candidate delimiter found in the most values; ties (including
        the case of no matches at all) go to the earliest candidate in the
        order listed above
    """
    candidates = (".", " ", ";", ",")
    hits = dict.fromkeys(candidates, 0)
    for value in values:
        for candidate in candidates:
            if candidate in value:
                hits[candidate] += 1
    return max(candidates, key=hits.get)
def inspect_directory(directory):
    """
    Walk a directory tree and work out which corpus file type dominates it.

    Files are classified by their (case-insensitive) extension:
    ``.textgrid`` -> 'textgrid', ``.txt`` -> 'text', ``.words`` -> 'buckeye',
    ``.wrd`` -> 'timit' and ``.par,2`` -> 'partitur'.  Anything else is
    ignored.

    Parameters
    ----------
    directory : str
        Full path to the directory

    Returns
    -------
    str
        The type with the most files.  Ties (including an empty directory)
        are resolved in the order 'textgrid', 'buckeye', 'timit', 'text',
        'partitur'
    dict
        Maps each of the five types to the list of matching file names
        (base names only, without their directory)
    """
    extension_to_type = {
        ".textgrid": "textgrid",
        ".txt": "text",
        ".words": "buckeye",
        ".wrd": "timit",
        ".par,2": "partitur",
    }
    relevant_files = {
        "textgrid": [],
        "text": [],
        "buckeye": [],
        "timit": [],
        "partitur": [],
    }
    for _root, _subdirs, filenames in os.walk(directory):
        for filename in filenames:
            suffix = os.path.splitext(filename)[-1].lower()
            file_type = extension_to_type.get(suffix)
            if file_type is not None:
                relevant_files[file_type].append(filename)
    totals = {file_type: len(found) for file_type, found in relevant_files.items()}
    highest = max(totals.values())
    tie_break_order = ("textgrid", "buckeye", "timit", "text", "partitur")
    likely_type = next(t for t in tie_break_order if totals[t] == highest)
    return likely_type, relevant_files
def text_to_lines(path, delimiter=None):
    """
    Parse a text file into lines of tokens.

    The file is read as UTF-8 (a leading byte-order mark is ignored).  Blank
    and whitespace-only lines are dropped; every remaining line is stripped
    and split on ``delimiter``.

    Parameters
    ----------
    path : str
        Fully specified path to text file
    delimiter : str, optional
        String separating the tokens on a line.  The default, ``None``,
        splits on runs of whitespace

    Returns
    -------
    list
        One list of tokens for every non-empty line in the text file

    Raises
    ------
    DelimiterError
        If a delimiter is given but does not occur anywhere in the file
    """
    with open(path, encoding="utf-8-sig", mode="r") as f:
        text = f.read()
    # A delimiter that never occurs cannot split anything; report it instead
    # of silently returning one token per line.
    if delimiter is not None and delimiter not in text:
        raise DelimiterError(
            "The delimiter specified does not create multiple words. Please specify another delimiter."
        )
    stripped_lines = (raw.strip() for raw in text.splitlines())
    return [line.split(delimiter) for line in stripped_lines if line]
def most_frequent_value(dictionary):
    """
    Find the value that occurs most often among a dictionary's values.

    Parameters
    ----------
    dictionary : dict
        The dictionary to search through; its values must be hashable

    Returns
    -------
    object
        The most frequent value.  When several values are equally frequent,
        the one encountered first is returned

    Raises
    ------
    ValueError
        If the dictionary is empty
    """
    occurrences = Counter(dictionary.values())
    return max(occurrences, key=occurrences.get)
def calculate_lines_per_gloss(lines):
    """
    Estimate how many consecutive lines make up one gloss.

    Lines belonging to the same gloss are expected to have the same number
    of tokens, so the estimate is based on runs of consecutive lines with
    equal token counts:

    * If every line has the same token count (or there are fewer than two
      lines), the result is the largest of 2, 3, 4, 5 that evenly divides
      the number of lines, or 1 if none does.
    * Otherwise the result is the most common run length (the larger length
      wins a tie).  A result above 10 is rejected and replaced by the
      largest not-yet-rejected run length until one of at most 10 is found.

    Parameters
    ----------
    lines : list
        Lines in the corpus, as ``(line_number, tokens)`` pairs; only the
        length of the second element is used

    Returns
    -------
    int
        The estimated number of lines per gloss

    Raises
    ------
    ValueError
        If every run length is greater than 10
    """
    token_counts = [len(entry[1]) for entry in lines]
    # Positions where a line's token count differs from the previous line's,
    # i.e. where a new run starts.
    run_starts = [
        pos
        for pos in range(1, len(token_counts))
        if token_counts[pos] != token_counts[pos - 1]
    ]
    if not run_starts:
        # All lines happen to have the same length
        divisors = [d for d in range(2, 6) if len(lines) % d == 0]
        return divisors[-1] if divisors else 1

    boundaries = [0] + run_starts + [len(token_counts)]
    run_lengths = [end - start for start, end in zip(boundaries, boundaries[1:])]
    frequency = Counter(run_lengths)
    number = max(frequency, key=lambda length: (frequency[length], length))
    rejected = set()
    while number > 10:
        rejected.add(number)
        number = max(length for length in run_lengths if length not in rejected)
    return number
def ilg_text_to_lines(path, delimiter=None):
    """
    Parse an interlinear gloss (ILG) text file into numbered lines of tokens.

    The file is read as UTF-8 (a leading byte-order mark is ignored).  Blank
    and whitespace-only lines are dropped, but the numbering still counts
    them, so each line number is the line's zero-based position in the file.

    Parameters
    ----------
    path : str
        Path to the ILG file
    delimiter : str, optional
        String separating the tokens on a line.  The default, ``None``,
        splits on runs of whitespace

    Returns
    -------
    list
        ``(line_number, tokens)`` tuples, one for every non-empty line

    Raises
    ------
    DelimiterError
        If a delimiter is given but does not occur anywhere in the file
    """
    with open(path, encoding="utf-8-sig", mode="r") as f:
        text = f.read()
    # A delimiter that never occurs cannot split anything; report it instead
    # of silently returning one token per line.
    if delimiter is not None and delimiter not in text:
        raise DelimiterError(
            "The delimiter specified does not create multiple words. Please specify another delimiter."
        )
    numbered = ((number, raw.strip()) for number, raw in enumerate(text.splitlines()))
    return [(number, line.split(delimiter)) for number, line in numbered if line]
def find_wav_path(path):
    """
    Locate the sound file that accompanies an annotation file.

    The annotation file's extension is swapped for ``.wav`` and then, if
    that does not exist, for ``.WAV``.

    Parameters
    ----------
    path : str
        Full path to an annotation file

    Returns
    -------
    str or None
        Full path of the first candidate that exists, or None if neither does
    """
    stem = os.path.splitext(path)[0]
    for suffix in (".wav", ".WAV"):
        candidate = stem + suffix
        if os.path.exists(candidate):
            return candidate
    return None
def log_annotation_types(annotation_types):
    """
    Write a summary of annotation types to the root logger at INFO level.

    A three-line header is logged first, followed by one record per
    annotation type containing the result of its ``pretty_print()`` method.

    Parameters
    ----------
    annotation_types : list
        Annotation type objects for a corpus; each must provide
        ``pretty_print()``
    """
    for header_line in ("Annotation type info", "--------------------", ""):
        logging.info(header_line)
    for annotation_type in annotation_types:
        logging.info(annotation_type.pretty_print())
def make_type_id(type_values, corpus):
    """
    Derive a deterministic ID for a type from its values and its corpus.

    The ``str()`` of every type value and the corpus name are joined with
    single spaces, encoded, and hashed with SHA-1.

    Parameters
    ----------
    type_values : list
        List of type values
    corpus : str
        The corpus name

    Returns
    -------
    str
        Hexadecimal SHA-1 digest used as the type ID
    """
    joined_values = " ".join(str(value) for value in type_values)
    payload = joined_values + " " + corpus
    return hashlib.sha1(payload.encode()).hexdigest()
def _detect_textgrid_format(tg_path):
    """
    Classify a single TextGrid file as 'labbcat', 'mfa', 'fave' or 'maus'.

    Parameters
    ----------
    tg_path : str
        Path to one TextGrid file

    Returns
    -------
    str or None
        Name of the first format whose parser accepts the file, or None if
        no parser does

    Raises
    ------
    TextGridError
        If the file cannot be parsed as a TextGrid
    """
    # Imported here rather than at module level, as in the original code
    # (presumably to avoid a circular import -- confirm before hoisting).
    from .inspect import inspect_fave, inspect_labbcat, inspect_maus, inspect_mfa

    try:
        tg = textgrid.openTextgrid(tg_path, includeEmptyIntervals=True)
    except ValueError as e:
        raise TextGridError(
            "The file {} could not be parsed: {}".format(tg_path, str(e))
        ) from e
    # All parsers are built before any is queried, and they are queried in
    # this fixed order, so the first matching format wins.
    parsers = [
        ("labbcat", inspect_labbcat(tg_path)),
        ("mfa", inspect_mfa(tg_path)),
        ("fave", inspect_fave(tg_path)),
        ("maus", inspect_maus(tg_path)),
    ]
    for format_name, parser in parsers:
        if parser._is_valid(tg):
            return format_name
    return None


def guess_textgrid_format(path):
    """
    Guess which aligner/tool format a TextGrid file or directory is in.

    For a directory, every ``.textgrid`` file below it (case-insensitive
    extension) is classified and the most common classification is
    returned.  For a single ``.textgrid`` file, its own classification is
    returned.

    Parameters
    ----------
    path : str
        Path of a TextGrid file, or of a directory containing TextGrid files

    Returns
    -------
    str or None
        'mfa', 'labbcat', 'fave' or 'maus'; or None if the format is not
        recognised, the path is not a TextGrid file, or the directory
        contains no TextGrid files.  Ties between formats in a directory
        are resolved in the order 'mfa', 'labbcat', 'fave', 'maus', None

    Raises
    ------
    TextGridError
        If a TextGrid file cannot be parsed
    """
    if os.path.isdir(path):
        counts = {"mfa": 0, "labbcat": 0, "fave": 0, "maus": 0, None: 0}
        n_textgrids = 0
        for root, _subdirs, files in os.walk(path):
            for f in files:
                if not f.lower().endswith(".textgrid"):
                    continue
                # Classify the individual file, never the directory itself.
                counts[_detect_textgrid_format(os.path.join(root, f))] += 1
                n_textgrids += 1
        if n_textgrids == 0:
            # With nothing to count, max() would pick the first key ("mfa").
            return None
        return max(counts, key=counts.get)
    if path.lower().endswith(".textgrid"):
        return _detect_textgrid_format(path)
    return None