# Source code for polyglotdb.corpus.syllabic

from uuid import uuid1

import re
from ..io.importer import (syllables_data_to_csvs, import_syllable_csv,
                           nonsyls_data_to_csvs, import_nonsyl_csv,
                           create_syllabic_csvs, create_nonsyllabic_csvs,
                           syllables_enrichment_data_to_csvs, import_syllable_enrichment_csvs)

from ..io.helper import make_type_id

from ..syllabification.probabilistic import norm_count_dict, split_nonsyllabic_prob, split_ons_coda_prob
from ..syllabification.maxonset import split_nonsyllabic_maxonset, split_ons_coda_maxonset
from .utterance import UtteranceContext


def make_label_safe_for_cypher(label):
    """
    Make a given subset name safe for use in Cypher by wrapping it in backticks

    A missing opening and/or closing backtick is added; a label that is
    already wrapped in backticks is returned unchanged.  Backticks inside
    the label are not escaped.

    Parameters
    ----------
    label : str
        Subset name

    Returns
    -------
    str
        Cypher-safe name
    """
    if not label.startswith('`'):
        label = '`' + label
    # A single backtick (produced by an empty label, or passed in as-is) would
    # otherwise count as both the opening and the closing quote and be
    # returned unbalanced.
    if len(label) < 2 or not label.endswith('`'):
        label += '`'
    return label



# [docs]
class SyllabicContext(UtteranceContext):
    """
    Class that contains methods for dealing specifically with syllables
    """

    def find_onsets(self, syllabic_label='syllabic'):
        """
        Gets word-initial syllable onsets across the corpus

        For every word that contains a syllabic phone, the query collects the
        labels of the phones from the first phone of the word up to (but not
        including) the earliest syllabic phone.  Paths longer than 10
        ``precedes`` steps are not matched.

        Parameters
        ----------
        syllabic_label : str
            Subset to use for syllabic segments (i.e., nuclei)

        Returns
        -------
        data : collections.Counter
            Onsets (tuples of phone labels, possibly empty) as keys and
            frequency values as values
        """
        from collections import Counter

        # Speaker and discourse are passed as query parameters, so the query
        # text is the same for every iteration and is built only once.
        statement = '''match (w:{word_name}:{corpus_name})-[:spoken_by]->(s:Speaker:{corpus_name}),
                (w)-[:spoken_in]->(d:Discourse:{corpus_name})
        where (w)<-[:contained_by*1..2]-()-[:is_a]->(:{syllabic_name})
        AND s.name = $speaker
        AND d.name = $discourse
        with w
        match (n:{phone_name}:{corpus_name})-[:is_a]->(t:{corpus_name}:{syllabic_name}),
        (n)-[:contained_by*1..2]->(w)
        with w, n
        order by n.begin
        with w,collect(n)[0..1] as coll unwind coll as n
        
        MATCH (pn:{phone_name}:{corpus_name})-[:contained_by*1..2]->(w)
        where not (pn)<-[:precedes]-()-[:contained_by*1..2]->(w)
        with w, n,pn
        match p = shortestPath((pn)-[:precedes*0..10]->(n))
        with [x in nodes(p)[0..-1]|x.label] as onset
        return onset, count(onset) as freq'''.format(
            corpus_name=self.cypher_safe_name,
            word_name=self.word_name,
            syllabic_name=make_label_safe_for_cypher(syllabic_label),
            phone_name=self.phone_name)

        data = Counter()
        for s in self.speakers:
            for d in self.get_discourses_of_speaker(s):
                # Results are queried per speaker/discourse and summed here
                for r in self.execute_cypher(statement, speaker=s, discourse=d):
                    data[tuple(r['onset'])] += r['freq']
        return data

    def find_codas(self, syllabic_label='syllabic'):
        """
        Gets word-final syllable codas across the corpus

        For every word that contains a syllabic phone, the query collects the
        labels of the phones after the latest syllabic phone up to and
        including the last phone of the word.  Paths longer than 10
        ``precedes`` steps are not matched.

        Parameters
        ----------
        syllabic_label : str
            Subset to use for syllabic segments (i.e., nuclei)

        Returns
        -------
        data : collections.Counter
            Codas (tuples of phone labels, possibly empty) as keys and
            frequency values as values
        """
        from collections import Counter

        # Speaker and discourse are passed as query parameters, so the query
        # text is the same for every iteration and is built only once.
        statement = '''match (w:{word_name}:{corpus_name})-[:spoken_by]->(s:Speaker:{corpus_name}),
                (w)-[:spoken_in]->(d:Discourse:{corpus_name})
        where (w)<-[:contained_by*1..2]-()-[:is_a]->(:{syllabic_name})
        AND s.name = $speaker
        AND d.name = $discourse
        with w
        match (n:{phone_name}:{corpus_name})-[:is_a]->(t:{corpus_name}:{syllabic_name}),
        (n)-[:contained_by*1..2]->(w)
        with w, n
        order by n.begin DESC
        with w,collect(n)[0..1] as coll unwind coll as n
        
        MATCH (pn:{phone_name}:{corpus_name})-[:contained_by*1..2]->(w)
        where not (pn)-[:precedes]->()-[:contained_by*1..2]->(w)
        with w, n,pn
        match p = shortestPath((n)-[:precedes*0..10]->(pn))
        with [x in nodes(p)[1..]|x.label] as coda
        return coda, count(coda) as freq'''.format(
            corpus_name=self.cypher_safe_name,
            word_name=self.word_name,
            syllabic_name=make_label_safe_for_cypher(syllabic_label),
            phone_name=self.phone_name)

        data = Counter()
        for s in self.speakers:
            for d in self.get_discourses_of_speaker(s):
                # Results are queried per speaker/discourse and summed here
                for r in self.execute_cypher(statement, speaker=s, discourse=d):
                    data[tuple(r['coda'])] += r['freq']
        return data

    def encode_syllabic_segments(self, phones):
        """
        Encode a list of phones as 'syllabic'

        Parameters
        ----------
        phones : list
            A list of vowels and syllabic consonants
        """
        self.encode_class(phones, 'syllabic')

    def reset_syllables(self, call_back=None, stop_check=None):
        """
        Resets syllables, removes syllable annotation, removes onset, coda, and nucleus labels

        Parameters
        ----------
        call_back : callable
            Function to monitor progress
        stop_check : callable
            Function the check whether the process should terminate early
        """
        if call_back is not None:
            call_back('Resetting syllables...')
            number = self.execute_cypher(
                '''MATCH (n:syllable:%s) return count(*) as number ''' % self.cypher_safe_name)[0]['number']
            call_back(0, number)
        for s in self.speakers:
            discourses = self.get_discourses_of_speaker(s)
            for d in discourses:
                phone_rel_statement = '''
                        MATCH (p:{phone_name}:{corpus})-[:contained_by]->(s:syllable:{corpus}),
                        (s)-[:contained_by]->(w:{word_name}:{corpus}),
                        (s)-[:spoken_by]->(sp:Speaker:{corpus}),
                        (s)-[:spoken_in]->(d:Discourse:{corpus})
                        WHERE sp.name = $speaker_name
                        AND d.name = $discourse_name
                        with p,w
                        CREATE (p)-[:contained_by]->(w)
                '''.format(corpus=self.cypher_safe_name,
                           word_name=self.word_name,
                           phone_name=self.phone_name)
                self.execute_cypher(phone_rel_statement, speaker_name=s, discourse_name=d)

                phone_label_statement = '''
                        MATCH (p:{phone_name}:{corpus})-[:spoken_by]->(sp:Speaker:{corpus}),
                        (p)-[:spoken_in]->(d:Discourse:{corpus})
                        WHERE sp.name = $speaker_name
                        AND d.name = $discourse_name
                        with p
                        REMOVE p:onset, p:nucleus, p:coda, p.syllable_position
                '''.format(corpus=self.cypher_safe_name,
                           word_name=self.word_name,
                           phone_name=self.phone_name)
                self.execute_cypher(phone_label_statement, speaker_name=s, discourse_name=d)
                num_deleted = 0
                deleted = 1000
                delete_statement = '''
                MATCH (s:syllable:{corpus})-[:spoken_by]->(sp:Speaker:{corpus}),
                        (s)-[:spoken_in]->(d:Discourse:{corpus})
                        WHERE sp.name = $speaker_name
                        AND d.name = $discourse_name
                        WITH s
                        LIMIT 1000
                        DETACH DELETE s
                        RETURN count(s) as deleted_count
                '''.format(corpus=self.cypher_safe_name)
                while deleted > 0:
                    if stop_check is not None and stop_check():
                        break
                    deleted = self.execute_cypher(delete_statement, speaker_name=s, discourse_name=d)[0][
                        'deleted_count']

                    num_deleted += deleted
                    if call_back is not None:
                        call_back(num_deleted)

        statement = '''MATCH (st:syllable_type:{corpus})
                               WITH st
                               DETACH DELETE st'''.format(corpus=self.cypher_safe_name)
        self.execute_cypher(statement)
        try:
            self.hierarchy.remove_annotation_type('syllable')
            self.hierarchy.remove_token_subsets(self, self.phone_name, ['onset', 'coda', 'nucleus'])
            self.hierarchy.remove_token_properties(self, self.phone_name, ['syllable_position'])
            # self.reset_to_old_label()
            self.encode_hierarchy()
        except KeyError:
            pass

    @property
    def has_syllabics(self):
        """
        Check whether there is a phone subset named ``syllabic``

        Returns
        -------
        bool
            True if ``syllabic`` is found as a phone subset
        """
        return 'syllabic' in self.hierarchy.subset_types[self.phone_name]

    @property
    def has_syllables(self):
        """
        Check whether the corpus has syllables encoded

        Returns
        -------
        bool
            True if the syllables are in the Hierarchy
        """
        return 'syllable' in self.hierarchy.annotation_types

    def encode_syllables(self, algorithm='maxonset', syllabic_label='syllabic', call_back=None, stop_check=None):
        """
        Encodes syllables to a corpus

        Any existing syllable encoding is reset first.  Words are then queried per speaker and discourse,
        each word's phones are grouped around its syllabic phones, the resulting rows are written to CSV
        files and imported, and finally the hierarchy is updated with the ``syllable`` annotation type,
        the ``onset``/``coda``/``nucleus`` phone subsets and the ``syllable_position`` phone property.

        Parameters
        ----------
        algorithm : str, defaults to 'maxonset'
            determines which algorithm will be used to encode syllables, either ``'maxonset'`` or
            ``'probabilistic'``
        syllabic_label : str
            Subset to use for syllabic segments (i.e., nuclei)
        call_back : callable
            Function to monitor progress
        stop_check : callable
            Function to check whether the process should terminate early

        Raises
        ------
        NotImplementedError
            If ``algorithm`` is neither ``'probabilistic'`` nor ``'maxonset'``
        """

        # Remove any previously encoded syllables before encoding again
        self.reset_syllables(call_back, stop_check)

        # Onset counts are needed by both algorithms; coda counts only by the probabilistic one
        onsets = self.find_onsets(syllabic_label=syllabic_label)
        if algorithm == 'probabilistic':
            onsets = norm_count_dict(onsets, onset=True)
            codas = self.find_codas(syllabic_label=syllabic_label)
            codas = norm_count_dict(codas, onset=False)
        elif algorithm == 'maxonset':
            # Only the set of attested onsets is used, not their frequencies
            onsets = set(onsets.keys())
        else:
            raise NotImplementedError

        # Labels of all nodes carrying the syllabic subset label
        statement = '''MATCH (n:{}:{}) return n.label as label'''.format(self.cypher_safe_name,
                                                                         make_label_safe_for_cypher(syllabic_label))
        res = self.execute_cypher(statement)
        syllabics = set(x['label'] for x in res)

        word_type = getattr(self, self.word_name)
        phone_type = getattr(word_type, self.phone_name)

        create_syllabic_csvs(self)
        create_nonsyllabic_csvs(self)

        # NOTE(review): `splits` is never read in this method
        splits = self.speakers
        process_string = 'Processing speaker {} of {} ({})...'
        if call_back is not None:
            call_back(0, len(self.speakers))

        for speaker_ind, s in enumerate(self.speakers):
            if stop_check is not None and stop_check():
                break
            if call_back is not None:
                # NOTE(review): the speaker number reported in the message is zero-based
                call_back(speaker_ind)
                call_back(process_string.format(speaker_ind, len(self.speakers), s))
            discourses = self.get_discourses_of_speaker(s)
            for d in discourses:
                # Rows collected for this speaker/discourse pair
                syllables = []
                non_syllables = []
                # Words of this speaker and discourse in time order, each with the ids, labels and
                # begin/end times of its phones
                q = self.query_graph(word_type)
                q = q.filter(word_type.speaker.name == s)
                q = q.filter(word_type.discourse.name == d)
                q = q.order_by(word_type.begin)
                q = q.columns(word_type.id.column_name('id'), phone_type.id.column_name('phone_id'),
                              word_type.begin.column_name('begin'),
                              word_type.label.column_name('label'),
                              word_type.end.column_name('end'),
                              phone_type.label.column_name('phones'),
                              phone_type.begin.column_name('begins'),
                              phone_type.end.column_name('ends'))
                results = q.all()
                # Id of the most recently created row; carried over from word to word within a discourse
                prev_id = None
                for w in results:
                    phones = w['phones']
                    phone_ids = w['phone_id']

                    if not phone_ids:
                        # Words without phones are reported and skipped
                        print('The word {} in file {} ({} to {}) did not have any phones.'.format(w['label'], d,
                                                                                                  w['begin'], w['end']))
                        continue
                    phone_begins = w['begins']
                    phone_ends = w['ends']
                    # Positions of the syllabic phones (nuclei) within the word
                    vow_inds = [i for i, x in enumerate(phones) if x in syllabics]
                    if len(vow_inds) == 0:
                        # No syllabic phone: the whole word becomes a single non-syllabic row spanning
                        # its first to its last phone
                        cur_id = uuid1()
                        if algorithm == 'probabilistic':
                            split = split_nonsyllabic_prob(phones, onsets, codas)
                        else:
                            split = split_nonsyllabic_maxonset(phones, onsets)
                        label = '.'.join(phones)
                        row = {'id': cur_id, 'prev_id': prev_id,
                               'onset_id': phone_ids[0],
                               'break': split,
                               'coda_id': phone_ids[-1],
                               'begin': phone_begins[0],
                               'label': label,
                               'type_id': make_type_id([label], self.corpus_name),
                               'end': phone_ends[-1]}
                        non_syllables.append(row)
                        prev_id = cur_id
                        continue
                    # One syllable row per syllabic phone
                    for j, i in enumerate(vow_inds):
                        cur_id = uuid1()
                        cur_vow_id = phone_ids[i]
                        if j == 0:
                            # First nucleus of the word: the syllable starts at the word's first phone,
                            # and has an onset only if the nucleus is not itself word-initial
                            begin_ind = 0
                            if i != 0:
                                cur_ons_id = phone_ids[begin_ind]
                            else:
                                cur_ons_id = None
                        else:
                            # Phones between the previous nucleus and this one are divided between
                            # the previous syllable's coda and this syllable's onset.
                            # NOTE(review): `split` appears to be an offset into `cons_string` at which
                            # this syllable's onset begins — confirm against the syllabification helpers
                            prev_vowel_ind = vow_inds[j - 1]
                            cons_string = phones[prev_vowel_ind + 1:i]
                            if algorithm == 'probabilistic':
                                split = split_ons_coda_prob(cons_string, onsets, codas)
                            else:
                                split = split_ons_coda_maxonset(cons_string, onsets)
                            if split is None:
                                # No split point returned: the syllable starts at its nucleus
                                cur_ons_id = None
                                begin_ind = i
                            else:
                                begin_ind = prev_vowel_ind + 1 + split
                                cur_ons_id = phone_ids[begin_ind]

                        if j == len(vow_inds) - 1:
                            # Last nucleus of the word: the syllable runs to the word's last phone,
                            # and has a coda only if the nucleus is not itself word-final
                            end_ind = len(phones) - 1
                            if i != len(phones) - 1:
                                cur_coda_id = phone_ids[end_ind]
                            else:
                                cur_coda_id = None
                        else:
                            # Same split computed from this nucleus's side, over the phones up to the
                            # following nucleus
                            foll_vowel_ind = vow_inds[j + 1]
                            cons_string = phones[i + 1:foll_vowel_ind]
                            if algorithm == 'probabilistic':
                                split = split_ons_coda_prob(cons_string, onsets, codas)
                            else:
                                split = split_ons_coda_maxonset(cons_string, onsets)
                            if split is None:
                                # No split point returned: the syllable ends at its nucleus
                                cur_coda_id = None
                                end_ind = i
                            else:
                                # NOTE(review): when split == 0 this makes end_ind the nucleus itself, so
                                # coda_id is the nucleus phone's id — confirm the importer expects that
                                end_ind = i + split
                                cur_coda_id = phone_ids[end_ind]
                        begin = phone_begins[begin_ind]
                        end = phone_ends[end_ind]
                        label = '.'.join(phones[begin_ind:end_ind + 1])
                        row = {'id': cur_id, 'prev_id': prev_id,
                               'vowel_id': cur_vow_id, 'onset_id': cur_ons_id,
                               'label': label,
                               'type_id': make_type_id([label], self.corpus_name),
                               'coda_id': cur_coda_id, 'begin': begin, 'end': end}
                        syllables.append(row)
                        prev_id = cur_id
                syllables_data_to_csvs(self, s, d, syllables)
                nonsyls_data_to_csvs(self, s, d, non_syllables)
        import_syllable_csv(self, call_back, stop_check)
        import_nonsyl_csv(self, call_back, stop_check)
        if stop_check is not None and stop_check():
            # Stop requested: skip the clean-up and the hierarchy update below
            return

        if call_back is not None:
            call_back('Cleaning up...')
        # Remove the prev_id property from the imported syllable nodes
        for s in self.speakers:
            discourses = self.get_discourses_of_speaker(s)
            for d in discourses:
                self.execute_cypher(
                    '''MATCH (s:{corpus_name}:Speaker)<-[:spoken_by]-(n:{corpus_name}:syllable)-[:spoken_in]->(d:{corpus_name}:Discourse)
                    where s.name = $speaker_name 
                    AND d.name = $discourse_name and n.prev_id is not Null 
                    REMOVE n.prev_id'''.format(corpus_name=self.cypher_safe_name), speaker_name=s, discourse_name=d)

        # Register syllables between phones and words, plus the new phone subsets and property
        self.hierarchy.add_annotation_type('syllable', above=self.phone_name, below=self.word_name)
        self.hierarchy.add_token_subsets(self, self.phone_name, ['onset', 'coda', 'nucleus'])
        self.hierarchy.add_token_properties(self, self.phone_name, [('syllable_position', str)])
        self.encode_hierarchy()
        if call_back is not None:
            call_back('Finished!')
            call_back(1, 1)

    def enrich_syllables(self, syllable_data, type_data=None):
        """
        Sets the data type and syllable data, initializes importers for syllable data,
        adds features to hierarchy for a phone

        Parameters
        ----------
        syllable_data : dict
            the enrichment data
        type_data : dict
            By default None
        """
        if type_data is None:
            type_data = {k: type(v) for k, v in next(iter(syllable_data.values())).items()}
        syllables_enrichment_data_to_csvs(self, syllable_data)
        import_syllable_enrichment_csvs(self, type_data)
        self.hierarchy.add_type_properties(self, 'syllable', type_data.items())
        self.encode_hierarchy()

    def _generate_stress_enrichment(self, pattern):
        syllable = self.syllable
        all_syls = self.query_graph(syllable).all()
        enrich_dict = {}

        for item in all_syls:
            syl = item['label']
            splitsyl = syl.split('.')
            nucleus = splitsyl[0]
            for j, seg in enumerate(splitsyl):
                if re.search(pattern, seg) is not None:
                    nucleus = seg
            r = re.search(pattern, nucleus)
            if r is not None:
                end = nucleus[r.start(0):r.end(0)].replace("_", "")
                nucleus = re.sub(pattern, "", nucleus)
                fullpatt = str(nucleus) + str(pattern).replace("$", "")
                syl = re.sub(fullpatt, nucleus, syl)
                enrich_dict.update({syl: {'stress': end}})
        return enrich_dict

    def _generate_tone_enrichment(self, pattern):
        syllable = self.syllable
        all_syls = self.query_graph(syllable).all()
        enrich_dict = {}
        for x in all_syls.cursors:
            for item in x:
                syl = item[0]['label']
                splitsyl = syl.split('.')
                nucleus = splitsyl[0]
                for seg in splitsyl:
                    if re.search(pattern, seg) is not None:
                        nucleus = seg
                r = re.search(pattern, nucleus)
                if r is not None:
                    end = nucleus[r.start(0):r.end(0)].replace("_", "")
                    nucleus = re.sub(pattern, "", nucleus)
                    fullpatt = str(nucleus) + str(pattern).replace("$", "")
                    syl = re.sub(fullpatt, nucleus, syl)

                    enrich_dict.update({syl: {'tone': end}})
        return enrich_dict

    def encode_stress_to_syllables(self, regex=None, clean_phone_label=True):
        """
        Use numbers (0-9) in phone labels as stress property for syllables.  If ``clean_phone_label`` is True,
        the numbers will be removed from the phone labels.

        Parameters
        ----------
        regex : str
            Regular expression character set for finding stress in the phone label
        clean_phone_label : bool
            Flag for removing regular expression from the phone labels
        """
        if regex is None:
            regex = '[0-9]'

        enrich_dict = self._generate_stress_enrichment(regex)

        if clean_phone_label:
            self.remove_pattern(regex)
        self.enrich_syllables(enrich_dict)
        self.encode_hierarchy()

    def encode_tone_to_syllables(self, regex=None, clean_phone_label=True):
        """
        Use numbers (0-9) in phone labels as tone property for syllables.  If ``clean_phone_label`` is True, the numbers
        will be removed from the phone labels.

        Parameters
        ----------
        regex : str
            Regular expression character set for finding tone in the phone label
        clean_phone_label : bool
            Flag for removing regular expression from the phone labels
        """
        if regex is None:
            regex = '[0-9]'

        enrich_dict = self._generate_tone_enrichment(regex)

        if clean_phone_label:
            self.remove_pattern(regex)
        self.enrich_syllables(enrich_dict)
        self.encode_hierarchy()

    def encode_stress_from_word_property(self, word_property_name):
        """
        Use a property on words formatted like "0-1-0" to encode stress on syllables.

        The number of syllables and the position of syllables within a word will also be encoded
        as a result of this function.

        Parameters
        ----------
        word_property_name : str
            Property name of words that contains the stress pattern

        """
        if 'syllable' not in self.annotation_types:
            raise Exception('Syllables have not been encoded.')
        if not self.hierarchy.has_type_property(self.word_name, word_property_name):
            raise Exception('Word types do not have a property {}.'.format(word_property_name))
        if not self.hierarchy.has_type_property(self.word_name, 'num_syllables'):
            self.encode_count('word', 'syllable', 'num_syllables')
        if not self.hierarchy.has_type_property('syllable', 'position_in_word'):
            self.encode_position('word', 'syllable', 'position_in_word')

        for s in self.speakers:
            discourses = self.get_discourses_of_speaker(s)
            for d in discourses:
                statement = '''MATCH (s:syllable:{corpus_name})-[:spoken_by]->(speaker:Speaker:{corpus_name}),
                            (s)-[:spoken_in]->(discourse:Discourse:{corpus_name}),
                            (s)-[:contained_by]->(w:word:{corpus_name})-[:is_a]->(wt:word_type:{corpus_name})
                            WHERE speaker.name = $speaker_name
                            AND discourse.name = $discourse_name
                            AND wt.{word_property_name} is not null
                            WITH s, w, split(wt.{word_property_name}, '-') as stresses
                            WHERE size(stresses) = w.num_syllables
                            SET s.stress = stresses[s.position_in_word-1]'''.format(
                    corpus_name=self.cypher_safe_name, word_property_name=word_property_name)
                self.execute_cypher(statement, speaker_name=s, discourse_name=d)
        self.hierarchy.add_token_properties(self, 'syllable', [('stress', str)])
        self.encode_hierarchy()