Source code for rfgb.utils

# -*- coding: utf-8 -*-

# Copyright © 2017-2019 rfgb Contributors
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program (at the base of this repository). If not,
# see <http://www.gnu.org/licenses/>

"""
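Utilities for rfgb: the ``Data`` container holding facts, examples, and
background (mode) information, and the ``Utils`` helper class for reading
data files, saving/loading learned models, and small numeric helpers.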
"""

from random import sample
from math import exp

import codecs
import json
import string


class Data(object):
    """Container for the relational data used while learning or inferring."""

    def __init__(
        self, regression=False, advice=False, softm=False, alpha=0.0, beta=0.0
    ):
        """
        Create an empty RFGB Data object.

        :param regression: Store examples as regression values instead of
                           positive/negative examples.
        :type regression: bool
        :param advice: Whether advice clauses are in use.
        :type advice: bool
        :param softm: Stored as ``self.softm``; not used inside this class.
        :param alpha: Stored as ``self.alpha``; not used inside this class.
        :param beta: Stored as ``self.beta``; not used inside this class.

        Attributes initialised here:

        * adviceClauses: dictionary of advice clauses.
        * facts: list of strings representing facts.
        * pos: dictionary mapping positive examples to their stored value.
        * neg: dictionary mapping negative examples to their stored value.
        * examples: dictionary of examples for regression.
        * examplesTrueValue: true values for use during regression.
        * target: target string (for example ``'cancer(C)'``), or None.
        * literals: list of ``[name, typeSpecification]`` pairs.
        * literalTypes: dictionary from literal name to type specification.
        * variableType: dictionary from variable name to its type.
        """
        self.regression = regression
        self.advice = advice
        self.adviceClauses = {}
        self.facts = []
        self.pos = {}
        self.neg = {}
        self.examples = {}
        self.examplesTrueValue = {}
        self.target = None
        self.literals = []
        self.literalTypes = {}
        self.variableType = {}
        self.softm = softm
        self.alpha = alpha
        self.beta = beta

    def setFacts(self, facts):
        """
        Replace the facts stored in the data object.

        :param facts: List of strings representing the facts.
        :type facts: list.

        :returns: None
        """
        self.facts = facts

    def getFacts(self):
        """Return the facts stored in the data object."""
        return self.facts

    def setPos(self, pos, target):
        """
        Store the positive examples of ``target`` found in ``pos``.

        :param pos: List of example strings such as ``'cancer(alice)'``.
        :type pos: list.
        :param target: Name of the target predicate.
        :type target: str.

        :returns: None
        """
        for example in pos:
            if example.split("(")[0] == target:
                # Initial value for a positive example:
                # 1 - sigmoid(-1.8), roughly 0.858.
                self.pos[example] = 1 - Utils.sigmoid(-1.8)

    def setExamples(self, examples, target):
        """
        Store regression examples of ``target`` found in ``examples``.

        Each line is expected to look like ``'<predicate> <value>'``,
        separated by a single space.

        :param examples: List of example lines.
        :type examples: list.
        :param target: Name of the target predicate.
        :type target: str.

        :returns: None
        """
        for example in examples:
            fields = example.split(" ")
            predicate = fields[0]
            # The value is parsed for every line, even non-target ones.
            value = float(fields[1])
            if predicate.split("(")[0] == target:
                # Keep the true value, and use it as the initial value too.
                self.examplesTrueValue[predicate] = value
                self.examples[predicate] = value

    def setNeg(self, neg, target):
        """
        Store the negative examples of ``target`` found in ``neg``.

        :param neg: List of example strings such as ``'cancer(earl)'``.
        :type neg: list.
        :param target: Name of the target predicate.
        :type target: str.

        :returns: None
        """
        for example in neg:
            if example.split("(")[0] == target:
                # Initial value for a negative example:
                # -sigmoid(-1.8), roughly -0.142.
                self.neg[example] = -Utils.sigmoid(-1.8)

    def setTarget(self, bk, target):
        """
        Set ``self.target`` to a target string with fresh variables and
        record the type of each of those variables in ``self.variableType``.

        :param bk: List of strings representing modes.
        :type bk: list.
        :param target: Target relation or attribute.
        :type target: str.

        :returns: None
        :raises IndexError: If no line of ``bk`` mentions the target.

        Example:

        .. code-block:: python

                        from rfgb.utils import Data

                        data = Data(regression=False)
                        background = ['friends(+person,-person)',
                                      'friends(-person,+person)',
                                      'smokes(+person)',
                                      'cancer(-person)']
                        target = 'cancer'
                        data.setTarget(background, target)

                        print(data.target)
                        # 'cancer(C)' (the variable letter is chosen at random)
        """
        # Prefer the mode line whose predicate name is exactly the target, so
        # that lines such as 'friends(+cancer)' or 'cancer_type(+person)' are
        # not mistaken for the target 'cancer'.
        modeLines = [line for line in bk if line.split("(")[0].strip() == target]
        if not modeLines:
            # Fall back to the historical substring match so that inputs
            # which worked before keep working.
            modeLines = [line for line in bk if target in line]

        # Raises IndexError when the target has no mode line at all.
        modeLine = modeLines[0]

        # targetTypes are the types of variables in the target predicate.
        targetTypes = [
            Utils.removeModeSymbols(modeType)
            for modeType in modeLine[:-1].split("(")[1].split(",")
        ]

        # random.sample requires a sequence: sampling directly from a set is
        # rejected with TypeError on Python 3.11+, so sample a sorted list.
        targetVariables = sample(
            sorted(Utils.UniqueVariableCollection), len(targetTypes)
        )

        for variable, variableType in zip(targetVariables, targetTypes):
            self.variableType[variable] = variableType

        self.target = target + "(" + ",".join(targetVariables) + ")"

    def getTarget(self):
        """Return the target string set by ``setTarget`` (or None)."""
        return self.target

    def getExampleTrueValue(self, example):
        """
        Return the true regression value of an example.

        :raises KeyError: If the example was not stored by ``setExamples``.
        """
        return self.examplesTrueValue[example]

    def getValue(self, example):
        """
        Return the value currently stored for an example.

        With ``regression`` enabled the value comes from ``self.examples``;
        otherwise it comes from ``self.pos`` if the example is positive and
        from ``self.neg`` if it is not.

        :raises KeyError: If the example is not stored.

        Example:

        .. code-block:: python

                        from rfgb.utils import Utils
                        from rfgb.utils import Data

                        trainingData = Utils.readTrainingData('cancer', path='testDomain/ToyCancer/train/')

                        x = trainingData.getValue('cancer(earl)')
                        # x is negative (-sigmoid(-1.8) right after loading),
                        # since earl is a negative example.

                        y = trainingData.getValue('cancer(alice)')
                        # y is positive (1 - sigmoid(-1.8) right after loading),
                        # since alice is a positive example.
        """
        if self.regression:
            return self.examples[example]

        if example in self.pos:
            return self.pos[example]
        return self.neg[example]

    def setBackground(self, bk):
        """
        Obtain the literals and their type specifications from the modes.

        Only lines containing a mode symbol (``+`` or ``-``) are used. Types
        can be either variable or a list of constants.

        :param bk: List of strings representing modes.
        :type bk: list.

        :returns: None
        """
        for literalBk in bk:
            if "+" not in literalBk and "-" not in literalBk:
                continue
            # 'friends(+person,-person)' -> 'friends', ['+person', '-person']
            literalName = literalBk.split("(")[0]
            literalTypeSpecification = literalBk[:-1].split("(")[1].split(",")
            self.literalTypes[literalName] = literalTypeSpecification
            self.literals.append([literalName, literalTypeSpecification])

    def getLiterals(self):
        """Return the ``[name, typeSpecification]`` pairs from the background."""
        return self.literals

    def variance(self, examples):
        """
        Calculate the (population) variance of the values of a subset of
        the examples.

        :param examples: Examples whose values are looked up with ``getValue``.
        :returns: The variance, or 0 when ``examples`` is empty.
        """
        if not examples:
            return 0

        values = [self.getValue(example) for example in examples]
        numberOfExamples = float(len(values))
        mean = sum(values) / numberOfExamples
        sumOfSquaredError = sum([(value - mean) ** 2 for value in values])
        return sumOfSquaredError / numberOfExamples
class Utils(object):
    """
    Class of utilities used by rfgb, such as reading files, removing mode
    symbols, calculating Cartesian Products, etc.
    """

    # Shared Data object (facts, positives, negatives); it is replaced on
    # every call to readTrainingData.
    data = None

    # Names available for variables: the 26 upper-case ASCII letters.
    UniqueVariableCollection = set(string.ascii_uppercase)

    @staticmethod
    def sigmoid(x):
        """
        Apply the logistic sigmoid to a number.

        :param x: Number to apply sigmoid to.
        :type x: int or float

        :returns: ``exp(x)/float(1+exp(x))``
        :rtype: float
        """
        # Only ever exponentiate a non-positive number so that large |x|
        # cannot raise OverflowError; both forms are the same function.
        if x >= 0:
            return 1 / float(1 + exp(-x))
        expX = exp(x)
        return expX / float(1 + expX)

    @staticmethod
    def removeModeSymbols(inputString):
        """
        Return a string with the mode symbols (+,-,#) removed.

        Example:

        .. code-block:: python

                        from rfgb.utils import Utils

                        Utils.removeModeSymbols('#city')
                        # == 'city'

                        i = ['+drinks', '-drink', '-city']
                        o = list(map(Utils.removeModeSymbols, i))
                        # o == ['drinks', 'drink', 'city']
        """
        stripped = inputString
        for modeSymbol in ("+", "-", "#"):
            stripped = stripped.replace(modeSymbol, "")
        return stripped

    @staticmethod
    def addVariableTypes(literal):
        """
        As literals are encountered, update Utils.data.variableType
        with the type of the variables encountered.

        Variables that already have a type keep it. Arguments whose type
        specification starts with ``[`` are skipped.

        :param literal: A literal of the form smokes(W) or friends(A,B)
        :type literal: str.

        :returns: None
        """
        literalName = literal.split("(")[0]
        # Background (mode) information for the literal.
        literalTypeSpecification = Utils.data.literalTypes[literalName]
        literalArguments = literal[:-1].split("(")[1].split(",")

        for position, variable in enumerate(literalArguments):
            specification = literalTypeSpecification[position]
            if specification[0] == "[":
                continue
            # Drop the leading mode symbol; never overwrite a known type.
            Utils.data.variableType.setdefault(variable, specification[1:])

    @staticmethod
    def getleafValue(examples):
        """
        Return the average of the values of the examples, looked up through
        ``Utils.data.getValue``; 0 when ``examples`` is empty.
        """
        if not examples:
            return 0
        total = sum(Utils.data.getValue(example) for example in examples)
        return total / float(len(examples))

    @staticmethod
    def save(location, saveItem):
        """
        Dump a json version of saveItem (for example a learned decision
        tree) to location.

        :param location: Name of the file to write.
        :type location: str.
        :param saveItem: JSON-serialisable object to write.

        :returns: None.
        """
        with codecs.open(location, encoding="utf-8", mode="w") as f:
            json.dump(saveItem, f, indent=2)

    @staticmethod
    def load(location):
        """
        Load a json file (for example a learned decision tree) from location.

        :param location: Name of the file to load.
        :type location: str.

        :returns: The object decoded from the JSON file.
        """
        with codecs.open(location, encoding="utf-8", mode="r") as f:
            return json.load(f)

    @staticmethod
    def readTrainingData(
        target,
        path="train/",
        regression=False,
        advice=False,
        softm=False,
        alpha=0.0,
        beta=0.0,
    ):
        """
        Read the training data from files and store it in ``Utils.data``.

        File names are appended directly to ``path``, so ``path`` should end
        with a path separator.

        :param target: The target predicate.
        :type target: str.
        :param path: Path to the training data.
        :type path: str.
        :param regression: Read from ``examples.txt`` instead of
                           ``pos.txt`` and ``neg.txt``.
        :type regression: bool
        :param advice: Read advice from ``advice.txt``, which should be
                       contained in the same directory as the examples.
        :type advice: bool
        :param softm: Passed through to the Data object.
        :param alpha: Passed through to the Data object.
        :param beta: Passed through to the Data object.

        :default path: 'train/'
        :default regression: False
        :default advice: False

        :returns: A Data object representing the training data.
        :rtype: :py:class:`.utils.Data`
        """
        Utils.data = Data(
            regression=regression, advice=advice, softm=softm, alpha=alpha, beta=beta
        )

        if advice:
            with open(path + "advice.txt") as fp:
                adviceFileLines = fp.read().splitlines()
            for line in adviceFileLines:
                # Expected shape: '<clause> <preferred> <nonPreferred>', where
                # the two lists are comma separated and wrapped in one
                # character on each side.
                fields = line.split(" ")
                adviceClause = fields[0]
                Utils.data.adviceClauses[adviceClause] = {}

                preferredTargets = fields[1][1:-1].split(",")
                if preferredTargets[0]:
                    Utils.data.adviceClauses[adviceClause][
                        "preferred"
                    ] = preferredTargets
                else:
                    # An empty list splits into [''].
                    Utils.data.adviceClauses[adviceClause]["preferred"] = []

                nonPreferredTargets = fields[2][1:-1].split(",")
                if nonPreferredTargets[0]:
                    Utils.data.adviceClauses[adviceClause][
                        "nonPreferred"
                    ] = nonPreferredTargets
                else:
                    Utils.data.adviceClauses[adviceClause]["nonPreferred"] = []

        with open(path + "facts.txt") as fac:
            Utils.data.setFacts(fac.read().splitlines())

        if regression:
            with open(path + "examples.txt") as exam:
                Utils.data.setExamples(exam.read().splitlines(), target)
        else:
            with open(path + "pos.txt") as pos:
                Utils.data.setPos(pos.read().splitlines(), target)
            with open(path + "neg.txt") as neg:
                Utils.data.setNeg(neg.read().splitlines(), target)

        with open(path + "bk.txt") as fp:
            bk = fp.read().splitlines()
        Utils.data.setBackground(bk)
        Utils.data.setTarget(bk, target)

        return Utils.data

    @staticmethod
    def readTestData(target, path="test/", regression=False):
        """
        Read the testing data from files.

        Unlike ``readTrainingData`` this builds a fresh Data object and does
        not touch ``Utils.data``; background and target are not set.

        :param target: The target predicate.
        :type target: str.
        :param path: Path to the testing data.
        :type path: str.
        :param regression: Read from ``examples.txt`` instead of
                           ``pos.txt`` and ``neg.txt``.
        :type regression: bool

        :default path: 'test/'
        :default regression: False

        :returns: A Data object representing the testing data.
        :rtype: :py:class:`.utils.Data`
        """
        testData = Data()
        testData.regression = regression

        with open(path + "facts.txt") as facts:
            testData.setFacts(facts.read().splitlines())

        if regression:
            with open(path + "examples.txt") as exam:
                examples = exam.read().splitlines()
                testData.setExamples(examples, target)
        else:
            # If we are not using regression, read from pos.txt and neg.txt
            with open(path + "pos.txt") as pos:
                testData.setPos(pos.read().splitlines(), target)
            with open(path + "neg.txt") as neg:
                testData.setNeg(neg.read().splitlines(), target)

        return testData

    @staticmethod
    def cartesianProduct(itemSets):
        """
        Return the Cartesian Product of all sets contained in the item sets,
        as a list of lists. The elements of the first set vary slowest.

        :param itemSets: Iterable of iterables of elements.

        :returns: List of lists, one per combination.
        :raises IndexError: If ``itemSets`` is empty.
        """
        # Wrap every element in its own list so combinations can be
        # concatenated.
        wrappedItemSets = [[[element] for element in itemSet] for itemSet in itemSets]

        # Fold the sets from left to right, extending each partial
        # combination with every element of the next set.
        products = wrappedItemSets[0]
        for nextSet in wrappedItemSets[1:]:
            products = [prefix + item for prefix in products for item in nextSet]
        return products