# Source code for polyglotdb.corpus.syllabic

from uuid import uuid1

import re
from ..io.importer import (syllables_data_to_csvs, import_syllable_csv,
                           nonsyls_data_to_csvs, import_nonsyl_csv,
                           create_syllabic_csvs, create_nonsyllabic_csvs,
                           syllables_enrichment_data_to_csvs, import_syllable_enrichment_csvs)

from ..io.helper import make_type_id

from ..syllabification.probabilistic import norm_count_dict, split_nonsyllabic_prob, split_ons_coda_prob
from ..syllabification.maxonset import split_nonsyllabic_maxonset, split_ons_coda_maxonset
from .utterance import UtteranceContext


def make_label_safe_for_cypher(label):
    """
    Wrap a subset name in backticks so it can be used as a Cypher label

    A backtick is only added on a side that does not already have one, so
    a name that is already quoted is returned unchanged.

    Parameters
    ----------
    label : str
        Subset name

    Returns
    -------
    str
        Cypher-safe name
    """
    tick = '`'
    opening = '' if label.startswith(tick) else tick
    quoted = opening + label
    # The closing check runs on the already-prefixed text, exactly as the
    # sequential form does (matters only for the empty string).
    closing = '' if quoted.endswith(tick) else tick
    return quoted + closing


class SyllabicContext(UtteranceContext):
    """
    Class that contains methods for dealing specifically with syllables
    """

    def _count_word_edge_clusters(self, statement, column):
        """
        Run a cluster-counting Cypher query once per speaker/discourse pair
        and accumulate the frequencies

        Parameters
        ----------
        statement : str
            Cypher query taking ``$speaker`` and ``$discourse`` parameters and
            returning ``column`` (a list of phone labels) together with ``freq``
        column : str
            Name of the returned column holding the cluster

        Returns
        -------
        collections.Counter
            Tuples of phone labels mapped to their summed frequencies
        """
        from collections import Counter

        counts = Counter()
        for speaker in self.speakers:
            for discourse in self.get_discourses_of_speaker(speaker):
                res = self.execute_cypher(statement, speaker=speaker, discourse=discourse)
                for r in res:
                    # Lists are not hashable, so clusters are keyed as tuples
                    counts[tuple(r[column])] += r['freq']
        return counts

    def find_onsets(self, syllabic_label='syllabic'):
        """
        Gets syllable onsets across the corpus

        For every word containing a syllabic phone, the onset is the sequence
        of phone labels from the word-initial phone up to (but excluding) the
        first syllabic phone. The path search is limited to 10 ``precedes`` hops.

        Parameters
        ----------
        syllabic_label : str
            Subset to use for syllabic segments (i.e., nuclei)

        Returns
        -------
        data : dict
            A dictionary with onset values as keys and frequency values as values
        """
        statement = '''match (w:{word_name}:{corpus_name})-[:spoken_by]->(s:Speaker:{corpus_name}),
                    (w)-[:spoken_in]->(d:Discourse:{corpus_name})
                    where (w)<-[:contained_by*1..2]-()-[:is_a]->(:{syllabic_name})
                    AND s.name = $speaker
                    AND d.name = $discourse
                    with w
                    match (n:{phone_name}:{corpus_name})-[:is_a]->(t:{corpus_name}:{syllabic_name}),
                    (n)-[:contained_by*1..2]->(w)
                    with w, n
                    order by n.begin
                    with w,collect(n)[0..1] as coll unwind coll as n
                    MATCH (pn:{phone_name}:{corpus_name})-[:contained_by*1..2]->(w)
                    where not (pn)<-[:precedes]-()-[:contained_by*1..2]->(w)
                    with w, n,pn
                    match p = shortestPath((pn)-[:precedes*0..10]->(n))
                    with [x in nodes(p)[0..-1]|x.label] as onset
                    return onset, count(onset) as freq'''.format(
            corpus_name=self.cypher_safe_name,
            word_name=self.word_name,
            syllabic_name=make_label_safe_for_cypher(syllabic_label),
            phone_name=self.phone_name)
        return self._count_word_edge_clusters(statement, 'onset')

    def find_codas(self, syllabic_label='syllabic'):
        """
        Gets syllable codas across the corpus

        For every word containing a syllabic phone, the coda is the sequence
        of phone labels after the last syllabic phone up to and including the
        word-final phone. The path search is limited to 10 ``precedes`` hops.

        Parameters
        ----------
        syllabic_label : str
            Subset to use for syllabic segments (i.e., nuclei)

        Returns
        -------
        data : dict
            A dictionary with coda values as keys and frequency values as values
        """
        statement = '''match (w:{word_name}:{corpus_name})-[:spoken_by]->(s:Speaker:{corpus_name}),
                    (w)-[:spoken_in]->(d:Discourse:{corpus_name})
                    where (w)<-[:contained_by*1..2]-()-[:is_a]->(:{syllabic_name})
                    AND s.name = $speaker
                    AND d.name = $discourse
                    with w
                    match (n:{phone_name}:{corpus_name})-[:is_a]->(t:{corpus_name}:{syllabic_name}),
                    (n)-[:contained_by*1..2]->(w)
                    with w, n
                    order by n.begin DESC
                    with w,collect(n)[0..1] as coll unwind coll as n
                    MATCH (pn:{phone_name}:{corpus_name})-[:contained_by*1..2]->(w)
                    where not (pn)-[:precedes]->()-[:contained_by*1..2]->(w)
                    with w, n,pn
                    match p = shortestPath((n)-[:precedes*0..10]->(pn))
                    with [x in nodes(p)[1..]|x.label] as coda
                    return coda, count(coda) as freq'''.format(
            corpus_name=self.cypher_safe_name,
            word_name=self.word_name,
            syllabic_name=make_label_safe_for_cypher(syllabic_label),
            phone_name=self.phone_name)
        return self._count_word_edge_clusters(statement, 'coda')

    def encode_syllabic_segments(self, phones):
        """
        Encode a list of phones as 'syllabic'

        Parameters
        ----------
        phones : list
            A list of vowels and syllabic consonants
        """
        self.encode_class(phones, 'syllabic')

    def reset_syllables(self, call_back=None, stop_check=None):
        """
        Resets syllables, removes syllable annotation, removes onset, coda, and nucleus labels

        Phones are re-attached directly to their words, syllable tokens are
        deleted in batches of 1000 per speaker/discourse pair, syllable types
        are deleted, and the hierarchy is updated.

        Parameters
        ----------
        call_back : callable
            Function to monitor progress
        stop_check : callable
            Function the check whether the process should terminate early
        """
        if call_back is not None:
            call_back('Resetting syllables...')
            # The total is only needed for progress reporting
            number = self.execute_cypher(
                '''MATCH (n:syllable:%s) return count(*) as number ''' % self.cypher_safe_name)[0]['number']
            call_back(0, number)

        # The statements do not depend on the speaker/discourse, only their
        # parameters do, so they are built once.
        phone_rel_statement = '''
        MATCH (p:{phone_name}:{corpus})-[:contained_by]->(s:syllable:{corpus}),
        (s)-[:contained_by]->(w:{word_name}:{corpus}),
        (s)-[:spoken_by]->(sp:Speaker:{corpus}),
        (s)-[:spoken_in]->(d:Discourse:{corpus})
        WHERE sp.name = $speaker_name
        AND d.name = $discourse_name
        with p,w
        CREATE (p)-[:contained_by]->(w)
        '''.format(corpus=self.cypher_safe_name, word_name=self.word_name, phone_name=self.phone_name)
        phone_label_statement = '''
        MATCH (p:{phone_name}:{corpus})-[:spoken_by]->(sp:Speaker:{corpus}),
        (p)-[:spoken_in]->(d:Discourse:{corpus})
        WHERE sp.name = $speaker_name
        AND d.name = $discourse_name
        with p
        REMOVE p:onset, p:nucleus, p:coda, p.syllable_position
        '''.format(corpus=self.cypher_safe_name, phone_name=self.phone_name)
        delete_statement = '''
        MATCH (s:syllable:{corpus})-[:spoken_by]->(sp:Speaker:{corpus}),
        (s)-[:spoken_in]->(d:Discourse:{corpus})
        WHERE sp.name = $speaker_name
        AND d.name = $discourse_name
        WITH s
        LIMIT 1000
        DETACH DELETE s
        RETURN count(s) as deleted_count
        '''.format(corpus=self.cypher_safe_name)

        # Cumulative across the whole corpus so that it is comparable with
        # the corpus-wide total reported above.
        num_deleted = 0
        for s in self.speakers:
            discourses = self.get_discourses_of_speaker(s)
            for d in discourses:
                self.execute_cypher(phone_rel_statement, speaker_name=s, discourse_name=d)
                self.execute_cypher(phone_label_statement, speaker_name=s, discourse_name=d)

                # Delete in batches until a batch comes back empty
                while True:
                    if stop_check is not None and stop_check():
                        break
                    deleted = self.execute_cypher(delete_statement, speaker_name=s,
                                                  discourse_name=d)[0]['deleted_count']
                    num_deleted += deleted
                    if call_back is not None:
                        call_back(num_deleted)
                    if deleted <= 0:
                        break

        statement = '''MATCH (st:syllable_type:{corpus})
                WITH st
                DETACH DELETE st'''.format(corpus=self.cypher_safe_name)
        self.execute_cypher(statement)

        try:
            self.hierarchy.remove_annotation_type('syllable')
            self.hierarchy.remove_token_subsets(self, self.phone_name, ['onset', 'coda', 'nucleus'])
            self.hierarchy.remove_token_properties(self, self.phone_name, ['syllable_position'])
            self.encode_hierarchy()
        except KeyError:
            # Deliberate best effort: a KeyError here is ignored, presumably
            # because syllables were never encoded in the hierarchy.
            pass

    @property
    def has_syllabics(self):
        """
        Check whether there is a phone subset named ``syllabic``

        Returns
        -------
        bool
            True if ``syllabic`` is found as a phone subset
        """
        return 'syllabic' in self.hierarchy.subset_types[self.phone_name]

    @property
    def has_syllables(self):
        """
        Check whether the corpus has syllables encoded

        Returns
        -------
        bool
            True if the syllables are in the Hierarchy
        """
        return 'syllable' in self.hierarchy.annotation_types

    def encode_syllables(self, algorithm='maxonset', syllabic_label='syllabic', call_back=None, stop_check=None):
        """
        Encodes syllables to a corpus

        Existing syllables are reset first. Words without any syllabic phone
        are written out as non-syllabic units; every other word gets one
        syllable per syllabic phone, with intervocalic consonant clusters
        divided by the chosen algorithm.

        Parameters
        ----------
        algorithm : str, defaults to 'maxonset'
            determines which algorithm will be used to encode syllables
            (``'maxonset'`` or ``'probabilistic'``)
        syllabic_label : str
            Subset to use for syllabic segments (i.e., nuclei)
        call_back : callable
            Function to monitor progress
        stop_check : callable
            Function the check whether the process should terminate early

        Raises
        ------
        NotImplementedError
            If ``algorithm`` is not a supported algorithm
        """
        # Validate before resetting: the reset is destructive, so an
        # unsupported algorithm must not wipe the existing syllables.
        if algorithm not in ('maxonset', 'probabilistic'):
            raise NotImplementedError(
                'Unsupported syllabification algorithm: {!r}'.format(algorithm))

        self.reset_syllables(call_back, stop_check)

        onsets = self.find_onsets(syllabic_label=syllabic_label)
        if algorithm == 'probabilistic':
            onsets = norm_count_dict(onsets, onset=True)
            codas = norm_count_dict(self.find_codas(syllabic_label=syllabic_label), onset=False)

            def split_cluster(cons):
                return split_ons_coda_prob(cons, onsets, codas)

            def split_vowelless(word_phones):
                return split_nonsyllabic_prob(word_phones, onsets, codas)
        else:
            onsets = set(onsets.keys())

            def split_cluster(cons):
                return split_ons_coda_maxonset(cons, onsets)

            def split_vowelless(word_phones):
                return split_nonsyllabic_maxonset(word_phones, onsets)

        statement = '''MATCH (n:{}:{}) return n.label as label'''.format(
            self.cypher_safe_name, make_label_safe_for_cypher(syllabic_label))
        res = self.execute_cypher(statement)
        syllabics = set(x['label'] for x in res)

        word_type = getattr(self, self.word_name)
        phone_type = getattr(word_type, self.phone_name)

        create_syllabic_csvs(self)
        create_nonsyllabic_csvs(self)

        speakers = self.speakers
        process_string = 'Processing speaker {} of {} ({})...'
        if call_back is not None:
            call_back(0, len(speakers))

        for speaker_ind, s in enumerate(speakers):
            if stop_check is not None and stop_check():
                break
            if call_back is not None:
                call_back(speaker_ind)
                # Human-readable message counts speakers from 1
                call_back(process_string.format(speaker_ind + 1, len(speakers), s))
            discourses = self.get_discourses_of_speaker(s)
            for d in discourses:
                syllables = []
                non_syllables = []

                q = self.query_graph(word_type)
                q = q.filter(word_type.speaker.name == s)
                q = q.filter(word_type.discourse.name == d)
                q = q.order_by(word_type.begin)
                q = q.columns(word_type.id.column_name('id'),
                              phone_type.id.column_name('phone_id'),
                              word_type.begin.column_name('begin'),
                              word_type.label.column_name('label'),
                              word_type.end.column_name('end'),
                              phone_type.label.column_name('phones'),
                              phone_type.begin.column_name('begins'),
                              phone_type.end.column_name('ends'))
                results = q.all()

                # Links each unit to the one before it within this discourse
                prev_id = None
                for w in results:
                    phones = w['phones']
                    phone_ids = w['phone_id']
                    if not phone_ids:
                        print('The word {} in file {} ({} to {}) did not have any phones.'.format(
                            w['label'], d, w['begin'], w['end']))
                        continue
                    phone_begins = w['begins']
                    phone_ends = w['ends']
                    vow_inds = [i for i, x in enumerate(phones) if x in syllabics]

                    if not vow_inds:
                        # No nucleus in the word: record it as one non-syllabic unit
                        cur_id = uuid1()
                        split = split_vowelless(phones)
                        label = '.'.join(phones)
                        row = {'id': cur_id, 'prev_id': prev_id,
                               'onset_id': phone_ids[0],
                               'break': split,
                               'coda_id': phone_ids[-1],
                               'begin': phone_begins[0],
                               'label': label,
                               'type_id': make_type_id([label], self.corpus_name),
                               'end': phone_ends[-1]}
                        non_syllables.append(row)
                        prev_id = cur_id
                        continue

                    last_phone_ind = len(phones) - 1
                    for j, i in enumerate(vow_inds):
                        cur_id = uuid1()
                        cur_vow_id = phone_ids[i]

                        # Left edge of the syllable
                        if j == 0:
                            # First nucleus: everything from the word start is the onset
                            begin_ind = 0
                            if i != 0:
                                cur_ons_id = phone_ids[begin_ind]
                            else:
                                cur_ons_id = None
                        else:
                            prev_vowel_ind = vow_inds[j - 1]
                            cons_string = phones[prev_vowel_ind + 1:i]
                            split = split_cluster(cons_string)
                            if split is None:
                                cur_ons_id = None
                                begin_ind = i
                            else:
                                # split is an offset into the cluster that
                                # follows the previous nucleus
                                begin_ind = prev_vowel_ind + 1 + split
                                cur_ons_id = phone_ids[begin_ind]

                        # Right edge of the syllable
                        if j == len(vow_inds) - 1:
                            # Last nucleus: everything to the word end is the coda
                            end_ind = last_phone_ind
                            if i != last_phone_ind:
                                cur_coda_id = phone_ids[end_ind]
                            else:
                                cur_coda_id = None
                        else:
                            foll_vowel_ind = vow_inds[j + 1]
                            cons_string = phones[i + 1:foll_vowel_ind]
                            split = split_cluster(cons_string)
                            if split is None:
                                cur_coda_id = None
                                end_ind = i
                            else:
                                end_ind = i + split
                                cur_coda_id = phone_ids[end_ind]

                        begin = phone_begins[begin_ind]
                        end = phone_ends[end_ind]
                        label = '.'.join(phones[begin_ind:end_ind + 1])
                        row = {'id': cur_id, 'prev_id': prev_id,
                               'vowel_id': cur_vow_id,
                               'onset_id': cur_ons_id,
                               'label': label,
                               'type_id': make_type_id([label], self.corpus_name),
                               'coda_id': cur_coda_id,
                               'begin': begin,
                               'end': end}
                        syllables.append(row)
                        prev_id = cur_id

                syllables_data_to_csvs(self, s, d, syllables)
                nonsyls_data_to_csvs(self, s, d, non_syllables)

        import_syllable_csv(self, call_back, stop_check)
        import_nonsyl_csv(self, call_back, stop_check)
        if stop_check is not None and stop_check():
            return

        if call_back is not None:
            call_back('Cleaning up...')
        # prev_id is only needed while importing; remove it from the syllables
        cleanup_statement = '''MATCH (s:{corpus_name}:Speaker)<-[:spoken_by]-(n:{corpus_name}:syllable)-[:spoken_in]->(d:{corpus_name}:Discourse)
                    where s.name = $speaker_name
                    AND d.name = $discourse_name
                    and n.prev_id is not Null
                    REMOVE n.prev_id'''.format(corpus_name=self.cypher_safe_name)
        for s in self.speakers:
            discourses = self.get_discourses_of_speaker(s)
            for d in discourses:
                self.execute_cypher(cleanup_statement, speaker_name=s, discourse_name=d)

        self.hierarchy.add_annotation_type('syllable', above=self.phone_name, below=self.word_name)
        self.hierarchy.add_token_subsets(self, self.phone_name, ['onset', 'coda', 'nucleus'])
        self.hierarchy.add_token_properties(self, self.phone_name, [('syllable_position', str)])
        self.encode_hierarchy()
        if call_back is not None:
            call_back('Finished!')
            call_back(1, 1)

    def enrich_syllables(self, syllable_data, type_data=None):
        """
        Sets the data type and syllable data, initializes importers for syllable data,
        adds features to hierarchy for a phone

        Parameters
        ----------
        syllable_data : dict
            the enrichment data
        type_data : dict
            By default None, in which case the property types are inferred
            from the first entry of ``syllable_data``
        """
        if type_data is None:
            if not syllable_data:
                # Nothing to import and no types to infer them from
                return
            first_entry = next(iter(syllable_data.values()))
            type_data = {k: type(v) for k, v in first_entry.items()}
        syllables_enrichment_data_to_csvs(self, syllable_data)
        import_syllable_enrichment_csvs(self, type_data)
        self.hierarchy.add_type_properties(self, 'syllable', type_data.items())
        self.encode_hierarchy()

    @staticmethod
    def _enrichment_from_labels(labels, pattern, property_name):
        """
        Build syllable enrichment data from a marker embedded in phone labels

        Each label is split on ``.`` into segments. The last segment matching
        ``pattern`` is treated as the nucleus; the matched text (with ``_``
        removed) becomes the property value and the key is the syllable label
        with the marker stripped from the nucleus. Labels without any match
        are skipped.

        Parameters
        ----------
        labels : iterable of str
            Syllable labels
        pattern : str
            Regular expression identifying the marker
        property_name : str
            Name of the property to store the marker under

        Returns
        -------
        dict
            Cleaned syllable labels mapped to ``{property_name: value}``
        """
        enrich_dict = {}
        for syl in labels:
            segments = syl.split('.')
            nucleus = segments[0]
            for seg in segments:
                if re.search(pattern, seg) is not None:
                    nucleus = seg
            match = re.search(pattern, nucleus)
            if match is None:
                continue
            value = match.group(0).replace("_", "")
            stripped = re.sub(pattern, "", nucleus)
            # The nucleus text is matched literally; only the marker part is
            # a regular expression.
            full_pattern = re.escape(stripped) + str(pattern).replace("$", "")
            cleaned = re.sub(full_pattern, lambda _m: stripped, syl)
            enrich_dict[cleaned] = {property_name: value}
        return enrich_dict

    def _generate_stress_enrichment(self, pattern):
        """
        Generate stress enrichment data from the labels of all syllables

        Parameters
        ----------
        pattern : str
            Regular expression identifying stress in phone labels

        Returns
        -------
        dict
            Cleaned syllable labels mapped to ``{'stress': value}``
        """
        syllable = self.syllable
        all_syls = self.query_graph(syllable).all()
        labels = (item['label'] for item in all_syls)
        return self._enrichment_from_labels(labels, pattern, 'stress')

    def _generate_tone_enrichment(self, pattern):
        """
        Generate tone enrichment data from the labels of all syllables

        Parameters
        ----------
        pattern : str
            Regular expression identifying tone in phone labels

        Returns
        -------
        dict
            Cleaned syllable labels mapped to ``{'tone': value}``
        """
        syllable = self.syllable
        all_syls = self.query_graph(syllable).all()
        # NOTE(review): this walks ``all_syls.cursors`` while the stress
        # variant iterates the results directly - confirm both access
        # patterns are supported by the query results object.
        labels = (item[0]['label'] for x in all_syls.cursors for item in x)
        return self._enrichment_from_labels(labels, pattern, 'tone')

    def encode_stress_to_syllables(self, regex=None, clean_phone_label=True):
        """
        Use numbers (0-9) in phone labels as stress property for syllables. If ``clean_phone_label`` is True,
        the numbers will be removed from the phone labels.

        Parameters
        ----------
        regex : str
            Regular expression character set for finding stress in the phone label
        clean_phone_label : bool
            Flag for removing regular expression from the phone labels
        """
        if regex is None:
            regex = '[0-9]'
        enrich_dict = self._generate_stress_enrichment(regex)
        if clean_phone_label:
            self.remove_pattern(regex)
        self.enrich_syllables(enrich_dict)
        self.encode_hierarchy()

    def encode_tone_to_syllables(self, regex=None, clean_phone_label=True):
        """
        Use numbers (0-9) in phone labels as tone property for syllables. If ``clean_phone_label`` is True,
        the numbers will be removed from the phone labels.

        Parameters
        ----------
        regex : str
            Regular expression character set for finding tone in the phone label
        clean_phone_label : bool
            Flag for removing regular expression from the phone labels
        """
        if regex is None:
            regex = '[0-9]'
        enrich_dict = self._generate_tone_enrichment(regex)
        if clean_phone_label:
            self.remove_pattern(regex)
        self.enrich_syllables(enrich_dict)
        self.encode_hierarchy()

    def encode_stress_from_word_property(self, word_property_name):
        """
        Use a property on words formatted like "0-1-0" to encode stress on syllables.

        The number of syllables and the position of syllables within a word will also be encoded as a result of
        this function. Stress is only set for words whose pattern has as many
        entries as the word has syllables.

        Parameters
        ----------
        word_property_name : str
            Property name of words that contains the stress pattern

        Raises
        ------
        Exception
            If syllables have not been encoded or the word types lack the property
        """
        if 'syllable' not in self.annotation_types:
            raise Exception('Syllables have not been encoded.')
        if not self.hierarchy.has_type_property(self.word_name, word_property_name):
            raise Exception('Word types do not have a property {}.'.format(word_property_name))
        if not self.hierarchy.has_type_property(self.word_name, 'num_syllables'):
            self.encode_count('word', 'syllable', 'num_syllables')
        if not self.hierarchy.has_type_property('syllable', 'position_in_word'):
            self.encode_position('word', 'syllable', 'position_in_word')

        # position_in_word is used 1-based here, hence the -1 when indexing
        statement = '''MATCH (s:syllable:{corpus_name})-[:spoken_by]->(speaker:Speaker:{corpus_name}),
                (s)-[:spoken_in]->(discourse:Discourse:{corpus_name}),
                (s)-[:contained_by]->(w:word:{corpus_name})-[:is_a]->(wt:word_type:{corpus_name})
                WHERE speaker.name = $speaker_name
                AND discourse.name = $discourse_name
                AND wt.{word_property_name} is not null
                WITH s, w, split(wt.{word_property_name}, '-') as stresses
                WHERE size(stresses) = w.num_syllables
                SET s.stress = stresses[s.position_in_word-1]'''.format(
            corpus_name=self.cypher_safe_name,
            word_property_name=word_property_name)
        for s in self.speakers:
            discourses = self.get_discourses_of_speaker(s)
            for d in discourses:
                self.execute_cypher(statement, speaker_name=s, discourse_name=d)
        self.hierarchy.add_token_properties(self, 'syllable', [('stress', str)])
        self.encode_hierarchy()