import os
from polyglotdb.io.helper import (
calculate_lines_per_gloss,
guess_trans_delimiter,
guess_type,
ilg_text_to_lines,
most_frequent_value,
)
from polyglotdb.io.parsers import IlgParser
from polyglotdb.io.types.parsing import (
TextMorphemeTier,
TextOrthographyTier,
TextTranscriptionTier,
)
def inspect_ilg(path, number=None):
    """
    Generate an :class:`~polyglotdb.io.parsers.ilg.IlgParser`
    for a specified text file for parsing it as an interlinear gloss text file

    If ``path`` is a directory, it is walked recursively and every file whose
    name ends in ``.txt`` (case-insensitive) is loaded.  The tier types are
    guessed from the last file discovered; the labels of all other files are
    then added to the matching tiers.

    Parameters
    ----------
    path : str
        Full path to text file, or to a directory containing text files
    number : int, optional
        Number of lines per gloss, if not supplied, it is auto-detected
        (for a directory, the most frequent per-file value is used)

    Returns
    -------
    :class:`~polyglotdb.io.parsers.ilg.IlgParser`
        Autodetected parser for the text file

    Raises
    ------
    ValueError
        If ``path`` is a directory that contains no ``.txt`` files
    NotImplementedError
        If a tier's guessed category is not supported (only the first tier
        may be orthography; the others must be transcription or morpheme)
    """
    trans_delimiters = [".", ";", ","]
    lines = {}
    if os.path.isdir(path):
        reference = None
        for root, _subdirs, files in os.walk(path):
            for filename in files:
                if not filename.lower().endswith(".txt"):
                    continue
                reference = os.path.join(root, filename)
                lines[reference] = ilg_text_to_lines(reference)
        if not lines:
            # Fail with a clear message instead of an unbound-variable error
            # further down when there is nothing to inspect.
            raise ValueError(
                "No .txt files were found in directory '{}'".format(path)
            )
        if number is None:
            # Honour an explicit ``number``; only auto-detect when it is absent.
            per_file = {
                file_path: calculate_lines_per_gloss(file_lines)
                for file_path, file_lines in lines.items()
            }
            number = most_frequent_value(per_file)
    else:
        reference = path
        lines[path] = ilg_text_to_lines(path)
        if number is None:
            number = calculate_lines_per_gloss(lines[path])

    # Guess one annotation tier per gloss line from the reference file.
    # The second element of each line entry holds the labels that are handed
    # to ``guess_type``.
    annotation_types = []
    reference_lines = lines[reference]
    for i in range(number):
        labels = reference_lines[i][1]
        cat = guess_type(labels, trans_delimiters)
        if i == 0 and cat == "orthography":
            tier = TextOrthographyTier("word", "word")
        elif cat == "transcription":
            tier = TextTranscriptionTier("transcription", "word")
            tier.trans_delimiter = guess_trans_delimiter(labels)
        elif cat == "morpheme":
            tier = TextMorphemeTier("morpheme", "word")
        else:
            raise NotImplementedError(
                "Unsupported tier category {!r} for line {} of the gloss".format(
                    cat, i
                )
            )
        annotation_types.append(tier)

    # Feed the labels of every other file into the tiers built above.
    for other_path, other_lines in lines.items():
        if other_path == reference:
            continue
        for i in range(number):
            labels = other_lines[i][1]
            annotation_types[i].add(
                ((x, j) for j, x in enumerate(labels)), save=False
            )
    return IlgParser(annotation_types)