"""Base parser for polyglotdb corpus import; extend :class:`BaseParser` for new formats."""


from ..types.standardized import PGAnnotation, PGSubAnnotation, PGAnnotationType

from ..types.parsing import Tobi, BreakIndex

from ..discoursedata import DiscourseData

from ...exceptions import ParseError


class BaseParser(object):
    """
    Base parser, extend this class for new parsers.

    Parameters
    ----------
    annotation_tiers : list
        Annotation types of the files to parse
    hierarchy : :class:`~polyglotdb.structure.Hierarchy`
        Details of how linguistic types relate to one another
    make_transcription : bool, defaults to True
        If true, create a word attribute for transcription based on segments
        that are contained by the word
    make_label : bool, defaults to False
        If true, fill in missing word labels by joining the word's
        transcription
    stop_check : callable, optional
        Function to check whether to halt parsing
    call_back : callable, optional
        Function to output progress messages
    """
    # Acceptable file extensions; subclasses override this.
    _extensions = ['.txt']

    def __init__(self, annotation_tiers, hierarchy, make_transcription=True,
                 make_label=False, stop_check=None, call_back=None):
        self.speaker_parser = None
        self.annotation_tiers = annotation_tiers
        self.hierarchy = hierarchy
        self.make_transcription = make_transcription
        self.make_label = make_label
        self.stop_check = stop_check
        self.call_back = call_back

    def match_extension(self, filename):
        """
        Ensures that filename ends with an acceptable extension
        (case-insensitive).

        Parameters
        ----------
        filename : str
            the filename of the file being checked

        Returns
        -------
        boolean
            True if filename is acceptable extension, false otherwise
        """
        lowered = filename.lower()
        return any(lowered.endswith(ext) for ext in self._extensions)

    def _parse_annotations(self, types_only=False):
        """
        Convert the parser's input annotation tiers into standardized
        :class:`~polyglotdb.io.types.standardized.PGAnnotationType` objects,
        one per linguistic type in the hierarchy, linking tokens to their
        supertypes and attaching subannotations.

        Parameters
        ----------
        types_only : bool
            If True, only gather type-level information; skip token timing,
            subannotation and supertype linking

        Returns
        -------
        dict
            Mapping of linguistic type name to
            :class:`~polyglotdb.io.types.standardized.PGAnnotationType`

        Raises
        ------
        :class:`~polyglotdb.exceptions.ParseError`
            If annotations sharing a linguistic type and speaker do not have
            a consistent length
        """
        annotation_tiers = {}
        segment_type = None
        for k, v in self.hierarchy.items():
            annotation_tiers[k] = PGAnnotationType(k)
            annotation_tiers[k].supertype = v
            if 'word' in k:
                annotation_tiers[k].is_word = True  # FIXME?
                # NOTE(review): re-initializes the hierarchy's 'word'
                # properties for every tier whose name contains 'word' —
                # confirm this is intentional.
                self.hierarchy.type_properties['word'] = set()
                self.hierarchy.token_properties['word'] = set()
            # A tier that is nobody's supertype and is not a word tier is
            # treated as the segment tier for transcription building below.
            if k not in self.hierarchy.values() and not annotation_tiers[k].is_word:
                segment_type = k
        for k in annotation_tiers:
            # Group this linguistic type's input levels by speaker, checking
            # that parallel (non-subannotation) levels have matching lengths.
            relevant_levels = {}
            lengths = {}
            for inputlevel in self.annotation_tiers:
                if inputlevel.ignored:
                    continue
                if inputlevel.linguistic_type != k:
                    continue
                speaker = inputlevel.speaker
                relevant_levels.setdefault(speaker, []).append(inputlevel)
                lengths.setdefault(speaker, 0)
                if inputlevel.subannotation:
                    continue
                if lengths[speaker] == 0:
                    lengths[speaker] = len(inputlevel)
                elif lengths[speaker] != len(inputlevel):
                    raise ParseError(
                        'Annotations sharing a linguistic type and a speaker '
                        "don't have a consistent length.")
            for speaker, speaker_levels in relevant_levels.items():
                for i in range(lengths[speaker]):
                    type_properties = {}
                    token_properties = {}
                    label = None
                    begin = None
                    end = None
                    for rl in speaker_levels:
                        if types_only and not rl.type_property:
                            annotation_tiers[k].token_property_keys.add(rl.name)
                            continue
                        if rl.subannotation:
                            continue
                        # Interval annotations have begin/end; point
                        # annotations only have a time.
                        if begin is None:
                            try:
                                begin = rl[i].begin
                            except AttributeError:
                                try:
                                    begin = rl[i].time
                                except AttributeError:
                                    pass
                        if end is None:
                            try:
                                end = rl[i].end
                            except AttributeError:
                                try:
                                    end = rl[i].time
                                except AttributeError:
                                    pass
                        if rl.name == k or rl.name == 'label' or rl.label:
                            # Empty labels mark silence.
                            if rl[i].value == '':
                                label = '<SIL>'
                            elif rl[i].value is not None:
                                label = rl[i].value
                        elif rl.type_property:
                            type_properties[rl.name] = rl[i].value
                        else:
                            token_properties[rl.name] = rl[i].value
                    a = PGAnnotation(label, begin, end)
                    a.type_properties.update(type_properties)
                    a.token_properties.update(token_properties)
                    a.speaker = speaker
                    if i != 0:
                        # Chain each annotation to its predecessor on the tier.
                        a.previous_id = annotation_tiers[k][-1].id
                    annotation_tiers[k].add(a)
                if types_only:
                    continue
                for rl in speaker_levels:
                    if not rl.subannotation:
                        continue
                    for sub in rl:
                        # TODO: Maybe will cause VOTs to be under wrong phone.
                        annotation = annotation_tiers[k].lookup(sub.midpoint,
                                                               speaker=speaker)
                        if isinstance(sub, Tobi):
                            a = PGSubAnnotation(sub.label, 'tone', sub.begin, sub.end)
                        elif isinstance(sub, BreakIndex):
                            a = PGSubAnnotation(sub.value, 'break', sub.begin, sub.end)
                        else:
                            a = PGSubAnnotation(None, sub.label, sub.begin, sub.end)
                        annotation.subannotations.append(a)
                        if k not in self.hierarchy.subannotations:
                            self.hierarchy.subannotations[k] = set()
                        self.hierarchy.subannotations[k].add(a.type)
        for k, v in annotation_tiers.items():
            v.optimize_lookups()
            if not types_only:
                st = v.supertype
                if st is not None:
                    annotation_tiers[st].optimize_lookups()
                    for a in v:
                        super_annotation = annotation_tiers[st].lookup(
                            a.midpoint, speaker=a.speaker)
                        try:
                            a.super_id = super_annotation.id
                        except AttributeError:
                            # No enclosing supertype interval found; leave
                            # the annotation unlinked.
                            pass
            if self.make_transcription and segment_type is not None and v.is_word:
                # Build each word's transcription from the segments it spans.
                v.type_property_keys.add('transcription')
                annotation_tiers[segment_type].optimize_lookups()
                for a in v:
                    transcription = annotation_tiers[segment_type].lookup_range(
                        a.begin, a.end, speaker=a.speaker)
                    a.type_properties['transcription'] = [x.label for x in transcription]
                v.type_properties |= {('transcription', str)}
                self.hierarchy.type_properties['word'] |= {('transcription', str)}
            if self.make_label and 'transcription' in v.type_property_keys and v.is_word:
                # Fill in missing word labels from the joined transcription.
                for a in v:
                    if a.label is None:
                        a.label = ''.join(a.type_properties['transcription'])
                v.type_property_keys.add('label')
                v.token_property_keys.add('label')
        return annotation_tiers

    def parse_information(self, path, corpus_name):
        """
        Parses types out of a corpus

        Parameters
        ----------
        path : str
            a path to the corpus
        corpus_name : str
            name of the corpus

        Returns
        -------
        dict
            Type and token headers, subannotations and speakers for the
            parsed discourse
        """
        data = self.parse_discourse(path, types_only=True)
        return_dict = {}
        return_dict['types'], return_dict['type_headers'] = data.types(corpus_name)
        return_dict['token_headers'] = data.token_headers
        return_dict['subannotations'] = data.hierarchy.subannotations
        return_dict['speakers'] = data.speakers
        return return_dict

    def parse_discourse(self, name, types_only=False):
        """
        Parse annotations for later importing.

        Parameters
        ----------
        name : str
            Name of the discourse
        types_only : bool
            Flag for whether to only save type information, ignoring the
            token information

        Returns
        -------
        :class:`~polyglotdb.io.discoursedata.DiscourseData`
            Parsed data
        """
        pg_annotations = self._parse_annotations(types_only)
        data = DiscourseData(name, pg_annotations, self.hierarchy)
        return data