Source code for polyglotdb.io.discoursedata

class DiscourseData(object):
    """
    Class for collecting information about a discourse to be loaded

    Parameters
    ----------
    name : str
        Identifier for the discourse
    annotation_types : dict
        Mapping from annotation type name to :class:`BaseAnnotationType`
        object (it is iterated with ``.items()`` and indexed by name)
    hierarchy : :class:`~polyglotdb.structure.Hierarchy`
        Details of how linguistic types relate to one another; maps each
        annotation type to its parent type (``None`` for the top level)

    Attributes
    ----------
    name : str
        Identifier for the discourse
    data : dict
        Dictionary containing :class:`BaseAnnotationType` objects indexed by
        their name
    hierarchy : :class:`~polyglotdb.structure.Hierarchy`
        The hierarchy passed in, updated in place with the type and token
        properties of every annotation type
    speaker_channel_mapping : dict
        Initially empty; not filled in by this class
    segment_type : str or None
        Identifier of the segment linguistic annotation, if it exists
    wav_path : str or None
        Path to sound file if it exists
    """

    def __init__(self, name, annotation_types, hierarchy):
        self.name = name
        self.data = annotation_types
        self.speaker_channel_mapping = {}

        # The segment type is an annotation type that is nobody's parent in
        # the hierarchy and is not flagged as a word type.  If several
        # qualify, the last one iterated wins.
        self.segment_type = None
        for k, v in self.data.items():
            if k not in hierarchy.values() and not v.is_word:
                self.segment_type = k

        self.hierarchy = hierarchy
        self.wav_path = None

        for at in self.data.values():
            # NOTE: the hierarchy stores the annotation type's own set, so
            # the ``add`` calls below also mutate ``at.type_properties``.
            type_props = at.type_properties
            self.hierarchy.type_properties[at.name] = type_props
            type_props.add(('id', str))
            type_props.add(('label', str))

            if not at.token_properties:
                # No typed token properties recorded: fall back to the bare
                # keys, with NoneType as a placeholder for the unknown type.
                token_props = set(
                    (x, type(None)) for x in at.token_property_keys
                    if x not in ['id', 'label', 'begin', 'end'])
            else:
                # Shared with the annotation type, as for type properties.
                token_props = at.token_properties
            self.hierarchy.token_properties[at.name] = token_props

            # Every token carries these four core properties.
            token_props.add(('id', str))
            token_props.add(('label', str))
            token_props.add(('begin', float))
            token_props.add(('end', float))

    def __getitem__(self, key):
        return self.data[key]

    def __contains__(self, item):
        return item in self.data

    def highest_to_lowest(self):
        """
        Order the annotation types of the hierarchy from highest to lowest

        Starts at the first type whose parent is ``None`` and repeatedly
        steps down to the first type whose parent is the current one.

        Returns
        -------
        list
            Annotation type names, top level first.  If the hierarchy is not
            a single unbroken chain, only the part of the chain that can be
            followed from the top is returned (an empty list if there is no
            top-level type).
        """
        ordered = []
        for name, parent in self.hierarchy.items():
            if parent is None:
                ordered.append(name)
                break
        if not ordered:
            # No top-level type to start from.
            return ordered

        total = len(self.hierarchy.keys())
        while len(ordered) < total:
            for name, parent in self.hierarchy.items():
                if parent == ordered[-1]:
                    ordered.append(name)
                    break
            else:
                # Nothing hangs below the current lowest type: stop rather
                # than spin forever on a broken or branching hierarchy.
                break
        return ordered

    @property
    def token_headers(self):
        """
        Get the headers for the CSV file for importing annotation tokens

        Returns
        -------
        dict
            Token headers (list of column names) for each annotation type
        """
        core = ['label', 'begin', 'end', 'id']
        headers = {}
        for x in self.annotation_types:
            token_header = ['begin', 'end', 'type_id', 'id', 'previous_id',
                            'speaker', 'discourse', 'label']
            # Extra token properties follow the fixed columns, alphabetically.
            token_header += sorted(
                y[0] for y in self.hierarchy.token_properties[x]
                if y[0] not in core)
            supertype = self[x].supertype
            if supertype is not None:
                token_header.append(supertype)
            headers[x] = token_header
        return headers

    @property
    def speakers(self):
        """
        Get the speakers of all annotation types in the discourse

        Returns
        -------
        list
            Sorted list of unique speakers
        """
        speakers = set()
        for x in self.values():
            speakers.update(x.speakers)
        return sorted(speakers)

    @property
    def annotation_types(self):
        """ Returns the names of the annotation types (same as :meth:`keys`)"""
        return self.keys()

    def keys(self):
        """ Returns the keys (annotation type names) of the discourse data"""
        return self.data.keys()

    def values(self):
        """ Returns a generator over the annotation type objects"""
        return (self.data[x] for x in self.keys())

    def items(self):
        """ Returns a generator over (name, annotation type object) pairs"""
        return ((x, self.data[x]) for x in self.keys())

    def types(self, corpus_name):
        """
        Get all the types in the discourse and return them along with header
        information

        Parameters
        ----------
        corpus_name : str
            the name of the corpus

        Returns
        -------
        dict
            Type data: for each annotation type, a set of tuples made of the
            type's id followed by its type values
        dict
            Type headers: for each annotation type that has at least one
            annotation, ``['id']`` followed by the type keys of its first
            annotation
        """
        types = {}
        type_headers = {}
        for k, v in self.items():
            types[k] = set()
            for w in v:
                if k not in type_headers:
                    type_headers[k] = ['id'] + w.type_keys()
                type_id = w.sha(corpus_name)
                types[k].add(tuple([type_id] + list(w.type_values())))
        return types, type_headers