Source code for polyglotdb.corpus.phonological

import re

from polyglotdb.corpus.lexical import LexicalContext
from polyglotdb.io.enrichment.features import enrich_features_from_csv, parse_file
from polyglotdb.io.importer import feature_data_to_csvs, import_feature_csvs



[docs]
class PhonologicalContext(LexicalContext):
    """
    Class that contains methods for dealing specifically with phones
    """

    def enrich_inventory_from_csv(self, path):
        """
        Enriches corpus from a csv file

        Parameters
        ----------
        path : str
            the path to the csv file
        """

        enrich_features_from_csv(self, path)

    def reset_inventory_csv(self, path):
        """
        Remove properties that were encoded via a CSV file

        Parameters
        ----------
        path : str
            CSV file to get property names from
        """
        data, type_data = parse_file(path, labels=[])

        property_names = [x for x in type_data.keys()]
        self.reset_features(property_names)

    def encode_class(self, phones, label):
        """
        encodes phone classes

        Parameters
        ----------
        phones : list
            a list of phones
        label : str
            the label for the class
        """
        self.encode_type_subset("phone", phones, label)

    def reset_class(self, label):
        """
        Reset and remove a subset

        Parameters
        ----------
        label : str
            Subset name to remove
        """
        self.reset_type_subset("phone", label)

    def encode_features(self, feature_dict):
        """
        gets the phone if it exists, queries for each phone and sets type to kwargs (?)

        Parameters
        ----------
        feature_dict : dict
            features to encode
        """
        phone = getattr(self, "lexicon_" + self.phone_name)
        for k, v in feature_dict.items():
            q = self.query_lexicon(phone).filter(phone.label == k)
            q.set_properties(**v)
        self.encode_hierarchy()

    def reset_features(self, feature_names):
        """
        resets features

        Parameters
        ----------
        feature_names : list
            list of names of features to remove
        """
        phone = getattr(self, "lexicon_" + self.phone_name)
        q = self.query_lexicon(phone)
        q.set_properties(**{x: None for x in feature_names})
        self.hierarchy.remove_type_properties(self, self.phone_name, feature_names)
        self.encode_hierarchy()

    def enrich_features(self, feature_data, type_data=None):
        """
        Sets the data type and feature data, initializes importers for feature data, adds features to hierarchy for a phone

        Parameters
        ----------
        feature_data : dict
            the enrichment data
        type_data : dict
            By default None
        """

        if type_data is None:
            type_data = {k: type(v) for k, v in next(iter(feature_data.values())).items()}
        labels = set(self.phones)
        feature_data = {k: v for k, v in feature_data.items() if k in labels}
        feature_data_to_csvs(self, feature_data)
        import_feature_csvs(self, type_data)
        self.hierarchy.add_type_properties(self, self.phone_name, type_data.items())
        self.encode_hierarchy()

    def remove_pattern(self, pattern="[0-2]"):
        """
        removes a stress or tone pattern from all phones

        Parameters
        ----------
        pattern : str
            the regular expression for the pattern to remove
            Defaults to '[0-2]'

        """
        phone = getattr(self, self.phone_name)
        if pattern == "":
            pattern = "[0-2]"
        q = self.query_graph(phone)
        results = q.all()
        oldphones = []
        length = 0
        newphones = []
        toAdd = {}
        for item in results:
            phone = item["label"]
            if re.search(pattern, phone) is not None:
                newphone = re.sub(pattern, "", phone)
                length = len(phone) - len(newphone)
                oldphones.append(phone)
                newphones.append(newphone)
                toAdd.update({"label": newphone})
        statement = """MATCH (n:{phone_name}{type}:{corpus_name}) WHERE n.label in $oldphones
        SET n.oldlabel = n.label
        SET n.label=substring(n.label,0,size(n.label)-{length})"""
        norm_statement = statement.format(
            phone_name=self.phone_name,
            type="",
            corpus_name=self.cypher_safe_name,
            length=length,
        )
        type_statement = statement.format(
            phone_name=self.phone_name,
            type="_type",
            corpus_name=self.cypher_safe_name,
            length=length,
        )
        self.execute_cypher(norm_statement, oldphones=oldphones)
        self.execute_cypher(type_statement, oldphones=oldphones)
        self.encode_syllabic_segments(newphones)
        self.encode_syllables("maxonset")

    def reset_to_old_label(self):
        """
        Reset phones back to their old labels which include stress and tone
        """
        phones = []
        getphone = f"""MATCH (n:{self.phone_name}_type:{self.cypher_safe_name})
        WHERE n.oldlabel IS NOT NULL
        RETURN n.oldlabel"""
        results = self.execute_cypher(getphone)
        for item in results:
            phones.append(item["n.oldlabel"])

        statement = f"""MATCH (n:{self.phone_name}{{type}}:{self.cypher_safe_name})
        WHERE n.oldlabel IS NOT NULL SET n.label = n.oldlabel"""
        norm_statement = statement.format(type="")
        type_statement = statement.format(type="_type")
        self.execute_cypher(norm_statement)
        self.execute_cypher(type_statement)
        self.encode_syllabic_segments(phones)
        self.encode_syllables("maxonset")