# CaliGraph/py/gp.py

import numpy as np

from node2vec import Node2Vec
from sklearn.gaussian_process.kernels import Kernel, Hyperparameter
from sklearn.gaussian_process.kernels import GenericKernelMixin
from sklearn.gaussian_process import GaussianProcessRegressor
#from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.base import clone

class BookKernel(GenericKernelMixin, Kernel):
    """Gaussian-process kernel over graph nodes, backed by embedding similarity.

    The kernel value for two nodes is ``wv.similarity(a, b)**2 * 0.99 + 0.01``.
    Squaring makes every value non-negative and the constant offset keeps it
    at or above 0.01.

    The class defines no ``hyperparameter_*`` attributes, so there is nothing
    for the GP optimizer to tune.
    """

    def __init__(self, wv):
        """Store ``wv``, an object exposing ``similarity(a, b)``."""
        # Kept under the constructor argument's name so that scikit-learn's
        # get_params()/clone() can rebuild the kernel.
        self.wv = wv

    def _f(self, s1, s2):
        """Return the kernel value between the two nodes ``s1`` and ``s2``."""
        # similarity**2 >= 0, hence the result can never drop below 0.01.
        return self.wv.similarity(s1, s2) ** 2 * 0.99 + 0.01

    def __call__(self, X, Y=None, eval_gradient=False):
        """Evaluate the kernel matrix ``K[i, j] = k(X[i], Y[j])``.

        Args:
            X: Sequence of nodes forming the rows of the result.
            Y: Sequence of nodes forming the columns; defaults to ``X``.
            eval_gradient: When true, additionally return the gradient of the
                kernel with respect to its hyperparameters.

        Returns:
            The kernel matrix, or the tuple ``(K, K_gradient)`` when
            ``eval_gradient`` is true.  As the kernel has no hyperparameters,
            ``K_gradient`` has shape ``(len(X), len(Y), 0)``.
        """
        if Y is None:
            Y = X

        K = np.array([[self._f(x, y) for y in Y] for x in X])
        if eval_gradient:
            # The Kernel API requires a (K, gradient) pair here; returning K
            # alone would break any caller that unpacks the result.
            return K, np.empty((len(X), len(Y), 0))
        return K

    def diag(self, X):
        """Return ``k(x, x)`` for every ``x`` in ``X`` as a 1-D array.

        Only the diagonal is computed, so this costs ``len(X)`` similarity
        look-ups rather than the ``len(X)**2`` of a full kernel matrix.
        """
        return np.array([self._f(x, x) for x in X])

    def is_stationary(self):
        """Report that the kernel is not stationary."""
        return False

    def clone_with_theta(self, theta):
        """Return a clone of this kernel with its ``theta`` set to ``theta``."""
        cloned = clone(self)
        cloned.theta = theta
        return cloned


def genGprScores(G, scoreName='gpr_score', stdName='gpr_std'):
    """Predict ratings for the unrated nodes of ``G`` with a Gaussian process.

    Nodes whose ``'rating'`` attribute is present and not ``None`` form the
    training set.  Every other node gets two attributes written in place: the
    predicted mean under ``scoreName`` and the predictive standard deviation
    under ``stdName``.  The GP uses a ``BookKernel`` built on node2vec
    embeddings of ``G``.

    Args:
        G: Graph whose ``G.nodes`` maps each node to its attribute dict; it is
            also handed to ``Node2Vec``.
        scoreName: Attribute key for the predicted rating.
        stdName: Attribute key for the predictive standard deviation.

    Raises:
        ValueError: If there are unrated nodes but no rated node to learn from.
    """
    # Partition the nodes once; iteration order is reused below so that
    # predictions line up with the nodes they belong to.
    rated, ratings, unrated = [], [], []
    for n in G.nodes:
        rating = G.nodes[n].get('rating')
        if rating is None:
            unrated.append(n)
        else:
            rated.append(n)
            ratings.append(rating)

    if not unrated:
        # Nothing to score; skip the expensive embedding and GP fit.
        return
    if not rated:
        raise ValueError('genGprScores needs at least one node with a rating')

    print('[\\] Constructing Feature-Space-Projector')
    node2vec = Node2Vec(G, dimensions=32, walk_length=16, num_walks=128, workers=8)
    print('[\\] Fitting Embeddings for Kernel')
    model = node2vec.fit(window=8, min_count=1, batch_words=4)
    print('[\\] Constructing Kernel')
    kernel = BookKernel(model.wv)
    print('[\\] Fitting GP')
    gpr = GaussianProcessRegressor(kernel=kernel, random_state=3141, alpha=1e-8).fit(rated, ratings)
    print('[\\] Inferencing GP')
    means, stds = gpr.predict(unrated, return_std=True)
    # BookKernel.diag yields a 1-D diagonal, so ``stds`` holds one value per
    # predicted node and can be paired with the nodes directly.
    for n, mean, std in zip(unrated, means, stds):
        node = G.nodes[n]
        node[scoreName], node[stdName] = float(mean), float(std)