Source code for rfgb.utils

# -*- coding: utf-8 -*-

# Copyright © 2017-2019 rfgb Contributors
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program (at the base of this repository). If not,
# see <http://www.gnu.org/licenses/>

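"""
Utilities for rfgb: the ``Data`` container for relational examples and the
``Utils`` helpers for reading data files and manipulating literals.
"""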

from random import sample
from math import exp

import codecs
import json
import string


class Data(object):
    """
    Container for the relational data: facts, positive/negative examples
    (or regression examples), the target, and the literal/type information
    parsed from the background modes.
    """

    def __init__(
        self, regression=False, advice=False, softm=False, alpha=0.0, beta=0.0
    ):
        """
        Create an empty Data object.

        :param regression: When True, :meth:`getValue` reads from
                           ``examples`` instead of ``pos``/``neg``.
        :type regression: bool

        :param advice: Flag stored on the object as ``self.advice``.
        :type advice: bool

        :param softm: Stored as ``self.softm``; not interpreted in this class.
        :param alpha: Stored as ``self.alpha``; not interpreted in this class.
        :param beta: Stored as ``self.beta``; not interpreted in this class.

        Attributes initialised here:

        adviceClauses: dictionary of advice clauses.
        facts: list of strings representing facts.
        pos: dictionary mapping positive examples to their current value.
        neg: dictionary mapping negative examples to their current value.
        examples: dictionary of examples for regression.
        examplesTrueValue: true value for use during regression.
        target: Target to be learned or inferred (``None`` until setTarget).
        literals: list of ``[name, typeSpecification]`` pairs.
        literalTypes: dictionary mapping literal name to typeSpecification.
        variableType: dictionary mapping a variable to its type.
        """

        self.regression = regression
        self.advice = advice
        self.adviceClauses = {}
        self.facts = []
        self.pos = {}
        self.neg = {}
        self.examples = {}
        self.examplesTrueValue = {}
        self.target = None
        self.literals = []
        self.literalTypes = {}
        self.variableType = {}

        self.softm = softm
        self.alpha = alpha
        self.beta = beta

    def setFacts(self, facts):
        """
        Replace the facts stored in the data object.

        :param facts: List of strings representing the facts.
        :type facts: list.

        :returns: None
        """
        self.facts = facts

    def getFacts(self):
        """Return the facts stored in the data object."""
        return self.facts

    def setPos(self, pos, target):
        """
        Register positive examples from a list of strings.

        Only examples whose predicate name (the text before the first ``(``)
        equals ``target`` are stored.

        :param pos: Example strings such as ``'cancer(alice)'``.
        :type pos: list.

        :param target: Name of the target predicate.
        :type target: str.

        :returns: None
        """
        # Initial value for every positive: 1 - sigmoid(-1.8), about 0.858.
        initialValue = 1 - Utils.sigmoid(-1.8)
        for example in pos:
            if example.split("(")[0] == target:
                self.pos[example] = initialValue

    def setExamples(self, examples, target):
        """
        Register regression examples from a list of strings.

        Each line has the form ``'<predicate> <value>'`` separated by a single
        space. Blank lines are skipped. Only predicates whose name equals
        ``target`` are stored.

        :param examples: Lines such as ``'cost(a) 2.0'``.
        :type examples: list.

        :param target: Name of the target predicate.
        :type target: str.

        :returns: None
        :raises IndexError: If a non-blank line has no space-separated value.
        :raises ValueError: If the value cannot be converted to float.
        """
        for example in examples:
            # A trailing or stray empty line carries no example; skip it
            # instead of failing on the missing value field.
            if not example.strip():
                continue
            fields = example.split(" ")
            predicate = fields[0]
            # The value is parsed before the target check, so malformed
            # values are reported even on non-target lines.
            value = float(fields[1])
            if predicate.split("(")[0] == target:
                # The true value is kept separately from the working value.
                self.examplesTrueValue[predicate] = value
                self.examples[predicate] = value

    def setNeg(self, neg, target):
        """
        Register negative examples from a list of strings.

        Only examples whose predicate name (the text before the first ``(``)
        equals ``target`` are stored.

        :param neg: Example strings such as ``'cancer(earl)'``.
        :type neg: list.

        :param target: Name of the target predicate.
        :type target: str.

        :returns: None
        """
        # Initial value for every negative: -sigmoid(-1.8), about -0.142.
        initialValue = -Utils.sigmoid(-1.8)
        for example in neg:
            if example.split("(")[0] == target:
                self.neg[example] = initialValue

    def setTarget(self, bk, target):
        """
        Sets self.target as a target string.
        Sets self.variableType

        The mode line for the target is the first line of ``bk`` whose
        predicate name equals ``target``; if there is none, the first line
        that merely contains ``target`` is used.

        :param bk: List of strings representing modes.
        :type bk: list.

        :param target: Target relation or attribute.
        :type target: str.

        :returns: None
        :raises IndexError: If no line of ``bk`` mentions ``target``.

        Example:

        .. code-block:: python

                        from rfgb.utils import Data

                        data = Data(regression=False)
                        background = ['friends(+person,-person)',
                                      'friends(-person,+person)',
                                      'smokes(+person)',
                                      'cancer(-person)']
                        target = 'cancer'

                        data.setTarget(background, target)

                        print(data.target)
                        # e.g. 'cancer(C)' (the variable letter is random)
        """
        # Prefer an exact predicate-name match so that a longer predicate
        # containing the target as a substring is not picked by accident.
        candidates = [line for line in bk if line.split("(")[0] == target]
        if not candidates:
            candidates = [line for line in bk if target in line]
        if not candidates:
            raise IndexError(
                "target %r was not found in the background modes" % (target,)
            )

        # targetTypes are the types of variables in the target predicate.
        rawTypes = candidates[0][:-1].split("(")[1].split(",")
        targetTypes = [Utils.removeModeSymbols(rawType) for rawType in rawTypes]

        # random.sample needs a sequence: sampling directly from a set raises
        # TypeError on Python 3.11+, so the pool is sorted into a list first.
        variablePool = sorted(Utils.UniqueVariableCollection)
        targetVariables = sample(variablePool, len(targetTypes))

        for variable, variableType in zip(targetVariables, targetTypes):
            self.variableType[variable] = variableType
        self.target = target + "(" + ",".join(targetVariables) + ")"

    def getTarget(self):
        """
        Returns the target.
        """
        return self.target

    def getExampleTrueValue(self, example):
        """
        Returns true regression value of an example for regression learning.

        :raises KeyError: If the example was never registered by setExamples.
        """
        return self.examplesTrueValue[example]

    def getValue(self, example):
        """
        Returns the current value stored for an example.

        In regression mode the value comes from ``examples``; otherwise it
        comes from ``pos`` if the example is there, else from ``neg``.

        :raises KeyError: If the example is not stored in the dictionary
                          that is consulted.

        Example:

        .. code-block:: python

                        from rfgb.utils import Utils
                        from rfgb.utils import Data

                        trainingData = Utils.readTrainingData('cancer',
                                            path='testDomain/ToyCancer/train/')

                        x = trainingData.getValue('cancer(earl)')
                        # x is about -0.142 right after loading,
                        # since earl doesn't have cancer.

                        y = trainingData.getValue('cancer(alice)')
                        # y is about 0.858 right after loading,
                        # since alice does have cancer
        """
        if self.regression:
            return self.examples[example]

        if example in self.pos:
            return self.pos[example]
        return self.neg[example]

    def setBackground(self, bk):
        """
        Obtains the literals and their type specifications. Types can be
        either variable or a list of constants.

        Only lines containing a ``+`` or ``-`` are considered. For each one,
        the name is stored with its comma-separated argument specification
        in both ``self.literalTypes`` and ``self.literals``.

        :param bk: List of strings representing modes.
        :type bk: list.

        :returns: None
        """
        for mode in bk:
            if "+" not in mode and "-" not in mode:
                continue
            literalName = mode.split("(")[0]
            # Drop the final character (the closing parenthesis) before
            # splitting the argument list.
            typeSpecification = mode[:-1].split("(")[1].split(",")
            self.literalTypes[literalName] = typeSpecification
            self.literals.append([literalName, typeSpecification])

    def getLiterals(self):
        """Return the list of ``[name, typeSpecification]`` pairs."""
        return self.literals

    def variance(self, examples):
        """
        Calculates the (population) variance of the values returned by
        getValue over a subset of the data.

        :param examples: Examples to include.
        :type examples: list.

        :returns: 0 for an empty subset, otherwise the mean squared
                  deviation from the mean.
        """
        if not examples:
            return 0

        # Look each value up once and reuse it for the mean and the spread.
        values = [self.getValue(example) for example in examples]
        numberOfExamples = float(len(values))
        mean = sum(values) / numberOfExamples
        sumOfSquaredError = sum([(value - mean) ** 2 for value in values])

        return sumOfSquaredError / numberOfExamples


class Utils(object):
    """
    Class of utilities used by rfgb, such as reading files, removing mode
    symbols, calculating Cartesian Products, etc.
    """

    # Attribute to store data (facts, positives, negatives); it is replaced
    # on every call to readTrainingData.
    data = None
    # Pool of variable names: the uppercase ASCII letters.
    UniqueVariableCollection = set(string.ascii_uppercase)

    @staticmethod
    def sigmoid(x):
        """
        Numerically stable logistic function.

        :param x: Number to apply sigmoid to.
        :type x: int or float

        :returns: ``exp(x)/float(1+exp(x))``
        :rtype: float
        """
        if x >= 0:
            # Equivalent form that only evaluates exp() of a non-positive
            # number, so large positive x cannot raise OverflowError.
            return 1.0 / (1.0 + exp(-x))
        expX = exp(x)
        return expX / (1.0 + expX)

    @staticmethod
    def removeModeSymbols(inputString):
        """
        Returns a string with the mode symbols (+,-,#) removed.

        Example:

        .. code-block:: python

                        from rfgb.utils import Utils

                        Utils.removeModeSymbols('#city')
                        # == 'city'

                        i = ['+drinks', '-drink', '-city']
                        o = list(map(Utils.removeModeSymbols, i))
                        # o == ['drinks', 'drink', 'city']
        """
        return inputString.replace("+", "").replace("-", "").replace("#", "")

    @staticmethod
    def addVariableTypes(literal):
        """
        As literals are encountered, update Utils.data.variableType with the
        type of the variables encountered.

        Arguments whose type specification starts with ``[`` are skipped, and
        variables that already have a type keep it.

        :param literal: A literal of the form smokes(W) or friends(A,B)
        :type literal: str.

        :returns: None
        :raises KeyError: If the literal name is not in
                          ``Utils.data.literalTypes``.
        """
        literalName = literal.split("(")[0]

        # Background (mode) information for the literal.
        typeSpecification = Utils.data.literalTypes[literalName]

        # Drop the closing parenthesis, then split the argument list.
        arguments = literal[:-1].split("(")[1].split(",")

        variableType = Utils.data.variableType
        for position, argument in enumerate(arguments):
            specification = typeSpecification[position]
            if specification[0] != "[" and argument not in variableType:
                # The first character is the mode symbol; the rest is the type.
                variableType[argument] = specification[1:]

    @staticmethod
    def getleafValue(examples):
        """
        Returns the average of ``Utils.data.getValue`` over the examples,
        or 0 when there are no examples.
        """
        if not examples:
            return 0
        total = sum(Utils.data.getValue(example) for example in examples)
        return total / float(len(examples))

    @staticmethod
    def save(location, saveItem):
        """
        Dumps json version of saveItem (e.g. a learned decision tree) to
        location.

        :param location: Name of the file to write.
        :type location: str.

        :param saveItem: JSON-serialisable object to write.

        :returns: None.
        """
        with codecs.open(location, encoding="utf-8", mode="w") as f:
            json.dump(saveItem, f, indent=2)

    @staticmethod
    def load(location):
        """
        Loads json content (e.g. a learned decision tree) from location.

        :param location: Name of the file to load.
        :type location: str.

        :returns: The object decoded from the JSON file.
        """
        with codecs.open(location, encoding="utf-8", mode="r") as f:
            return json.load(f)

    @staticmethod
    def readTrainingData(
        target,
        path="train/",
        regression=False,
        advice=False,
        softm=False,
        alpha=0.0,
        beta=0.0,
    ):
        """
        Reads the training data from files.

        The file names are appended directly to ``path``, so ``path`` must
        end with a path separator. ``facts.txt`` and ``bk.txt`` are always
        read; ``examples.txt`` is read in regression mode, otherwise
        ``pos.txt`` and ``neg.txt``; ``advice.txt`` is read when ``advice``
        is True.

        As a side effect, ``Utils.data`` is replaced by the new Data object.

        :param target: The target predicate.
        :type target: str.

        :param path: Path to the training data.
        :type path: str.

        :param regression: Read from ``examples.txt`` instead of ``pos.txt``
                           and ``neg.txt``.
        :type regression: bool

        :param advice: Read advice from an advice file, which should be
                       contained in the same directory as the examples.
        :type advice: bool

        :param softm: Passed through to the Data constructor.
        :param alpha: Passed through to the Data constructor.
        :param beta: Passed through to the Data constructor.

        :default path: 'train/'
        :default regression: False
        :default advice: False

        :returns: A Data object representing the training data.
        :rtype: :py:class:`.utils.Data`
        """

        Utils.data = Data(
            regression=regression, advice=advice, softm=softm, alpha=alpha, beta=beta
        )

        if advice:
            with open(path + "advice.txt") as fp:
                for line in fp.read().splitlines():
                    # Three space-separated fields: the clause, the preferred
                    # targets and the non-preferred targets. The first and
                    # last character of each list field are stripped
                    # (presumably enclosing brackets -- confirm with the
                    # advice file format).
                    fields = line.split(" ")
                    adviceClause = fields[0]
                    preferredTargets = fields[1][1:-1].split(",")
                    nonPreferredTargets = fields[2][1:-1].split(",")

                    # Splitting an empty field yields [''], which stands for
                    # "no targets" and is stored as an empty list.
                    Utils.data.adviceClauses[adviceClause] = {
                        "preferred": (
                            preferredTargets if preferredTargets[0] else []
                        ),
                        "nonPreferred": (
                            nonPreferredTargets if nonPreferredTargets[0] else []
                        ),
                    }

        with open(path + "facts.txt") as fac:
            Utils.data.setFacts(fac.read().splitlines())

        if regression:
            with open(path + "examples.txt") as exam:
                Utils.data.setExamples(exam.read().splitlines(), target)
        else:
            with open(path + "pos.txt") as pos:
                Utils.data.setPos(pos.read().splitlines(), target)
            with open(path + "neg.txt") as neg:
                Utils.data.setNeg(neg.read().splitlines(), target)

        with open(path + "bk.txt") as fp:
            bk = fp.read().splitlines()

        Utils.data.setBackground(bk)
        Utils.data.setTarget(bk, target)

        return Utils.data

    @staticmethod
    def readTestData(target, path="test/", regression=False):
        """
        Reads the testing data from files.

        The file names are appended directly to ``path``, so ``path`` must
        end with a path separator. Only ``facts.txt`` and the example files
        are read; no background file is read here, and ``Utils.data`` is
        left untouched.

        :param target: The target predicate.
        :type target: str.

        :param path: Path to the testing data.
        :type path: str.

        :param regression: Read from ``examples.txt`` instead of ``pos.txt``
                           and ``neg.txt``.
        :type regression: bool

        :default path: 'test/'
        :default regression: False

        :returns: A Data object representing the testing data.
        :rtype: :py:class:`.utils.Data`
        """

        testData = Data(regression=regression)

        with open(path + "facts.txt") as facts:
            testData.setFacts(facts.read().splitlines())

        if regression:
            with open(path + "examples.txt") as exam:
                testData.setExamples(exam.read().splitlines(), target)
        else:
            # If we are not using regression, read from pos.txt and neg.txt
            with open(path + "pos.txt") as pos:
                testData.setPos(pos.read().splitlines(), target)
            with open(path + "neg.txt") as neg:
                testData.setNeg(neg.read().splitlines(), target)

        return testData

    @staticmethod
    def cartesianProduct(itemSets):
        """
        Returns the Cartesian Product of all sets contained in the item sets.

        The result is a list of lists, one per combination, ordered with the
        first item set varying slowest.

        :param itemSets: Collection of item collections.
        :type itemSets: list.

        :returns: List of combinations, each a list with one element per
                  item set.
        :rtype: list.

        :raises IndexError: If ``itemSets`` is empty.
        """
        # Materialise each item set once so it can be traversed repeatedly.
        pools = [list(itemSet) for itemSet in itemSets]

        # Seed with the first set, each element wrapped in its own list.
        combinations = [[element] for element in pools[0]]

        # Fold the remaining sets in, extending every partial combination.
        for pool in pools[1:]:
            combinations = [
                partial + [element] for partial in combinations for element in pool
            ]

        return combinations