# Source code for polyglotdb.io.inspect.ilg

import os

from polyglotdb.io.helper import (
    calculate_lines_per_gloss,
    guess_trans_delimiter,
    guess_type,
    ilg_text_to_lines,
    most_frequent_value,
)
from polyglotdb.io.parsers import IlgParser
from polyglotdb.io.types.parsing import (
    TextMorphemeTier,
    TextOrthographyTier,
    TextTranscriptionTier,
)


def inspect_ilg(path, number=None):
    """
    Generate an :class:`~polyglotdb.io.parsers.ilg.IlgParser`
    for a specified text file for parsing it as an interlinear gloss text file

    Parameters
    ----------
    path : str
        Full path to text file, or to a directory, in which case every
        file ending in ``.txt`` (case-insensitive) below it is inspected
    number : int, optional
        Number of lines per gloss, if not supplied, it is auto-detected
        (for a directory, the most frequent per-file value is used)

    Returns
    -------
    :class:`~polyglotdb.io.parsers.ilg.IlgParser`
        Autodetected parser for the text file

    Raises
    ------
    ValueError
        If ``path`` is a directory that contains no ``.txt`` files
    NotImplementedError
        If the guessed type of a gloss line is not one this function
        knows how to build a tier for
    """
    trans_delimiters = [".", ";", ","]
    # Only auto-detect when the caller did not supply a value; an explicit
    # ``number`` must be honoured rather than silently overwritten.
    auto_detect = number is None
    lines = {}
    template = None  # file whose first gloss is used to guess the tier types
    if os.path.isdir(path):
        numbers = {}
        for root, _subdirs, files in os.walk(path):
            for filename in files:
                if not filename.lower().endswith(".txt"):
                    continue
                file_path = os.path.join(root, filename)
                lines[file_path] = ilg_text_to_lines(file_path)
                if auto_detect:
                    numbers[file_path] = calculate_lines_per_gloss(lines[file_path])
                # The last discovered file serves as the template.
                template = file_path
        if template is None:
            # Fail early with a clear message instead of an incidental
            # error further down when nothing was found.
            raise ValueError(f"No .txt files found in directory: {path}")
        if auto_detect:
            number = most_frequent_value(numbers)
    else:
        lines[path] = ilg_text_to_lines(path)
        if auto_detect:
            number = calculate_lines_per_gloss(lines[path])
        template = path

    annotation_types = []
    for i in range(number):
        # NOTE(review): each parsed line appears to be a pair whose second
        # item holds the labels -- confirm against ilg_text_to_lines.
        labels = lines[template][i][1]
        cat = guess_type(labels, trans_delimiters)
        # Orthography is only accepted on the first line of a gloss.
        if i == 0 and cat == "orthography":
            tier = TextOrthographyTier("word", "word")
        elif cat == "transcription":
            tier = TextTranscriptionTier("transcription", "word")
            tier.trans_delimiter = guess_trans_delimiter(labels)
        elif cat == "morpheme":
            tier = TextMorphemeTier("morpheme", "word")
        else:
            raise NotImplementedError(
                f"Unsupported tier type {cat!r} for gloss line {i}"
            )
        annotation_types.append(tier)

    # Feed the labels of every other file into the tiers built from the template.
    for file_path, file_lines in lines.items():
        if file_path == template:
            continue
        for i, tier in enumerate(annotation_types):
            labels = file_lines[i][1]
            tier.add(((x, j) for j, x in enumerate(labels)), save=False)
    return IlgParser(annotation_types)