import json
import random
from collections import defaultdict
# Iterable moved to collections.abc in Python 3.3 and was removed from
# `collections` in Python 3.10.
from collections.abc import Iterable
from io import open
from logging import getLogger
from time import time

import numpy as np
from gensim.models import Word2Vec

from libcity.model.abstract_traffic_tradition_model import AbstractTraditionModel
# Reference: https://github.com/phanein/deepwalk
class Graph(defaultdict):
    """Adjacency-list graph mapping each node to a list of its neighbors.

    Ported from the reference DeepWalk implementation
    (https://github.com/phanein/deepwalk). Because this is a
    ``defaultdict(list)``, merely *reading* ``self[v]`` inserts ``v`` with an
    empty neighbor list — several methods below rely on (or tolerate) that.
    """

    def __init__(self):
        super(Graph, self).__init__(list)
        self._logger = getLogger()

    def nodes(self):
        """Return a view of all nodes in the graph."""
        return self.keys()

    def adjacency_iter(self):
        """Iterate over ``(node, neighbor_list)`` pairs.

        Fixed: the original called ``self.iteritems()``, which is Python 2
        only and raises ``AttributeError`` on Python 3.
        """
        return self.items()

    def subgraph(self, nodes={}):
        """Return the subgraph induced by `nodes`.

        Only edges whose both endpoints are in `nodes` are kept. The mutable
        default is never mutated, so it is safe (kept for API compatibility).
        """
        subgraph = Graph()
        for n in nodes:
            if n in self:
                subgraph[n] = [x for x in self[n] if x in nodes]
        return subgraph

    def make_undirected(self):
        """Add the reverse of every edge, then normalize the adjacency lists.

        Returns:
            Graph: self, for chaining.
        """
        t0 = time()
        for v in list(self):
            for other in self[v]:
                if v != other:
                    self[other].append(v)
        t1 = time()
        # Fixed log tag: the original said 'make_directed' although this
        # method makes the graph undirected.
        self._logger.info('make_undirected: added missing edges {}s'.format(t1 - t0))
        self.make_consistent()
        return self

    def make_consistent(self):
        """Sort and deduplicate every adjacency list, then drop self-loops.

        Returns:
            Graph: self, for chaining.
        """
        t0 = time()
        for k in self.keys():
            self[k] = list(sorted(set(self[k])))
        t1 = time()
        self._logger.info('make_consistent: made consistent in {}s'.format(t1 - t0))
        self.remove_self_loops()
        return self

    def remove_self_loops(self):
        """Remove every edge that starts and ends at the same node.

        Returns:
            Graph: self, for chaining.
        """
        removed = 0
        t0 = time()
        for x in self:
            if x in self[x]:
                self[x].remove(x)
                removed += 1
        t1 = time()
        self._logger.info('remove_self_loops: removed {} loops in {}s'.format(removed, (t1 - t0)))
        return self

    def check_self_loops(self):
        """Return True if any node has an edge to itself."""
        for x in self:
            for y in self[x]:
                if x == y:
                    return True
        return False

    def has_edge(self, v1, v2):
        """Return True if an edge exists between v1 and v2 in either direction.

        Note: probing ``self[v1]`` / ``self[v2]`` inserts missing keys
        (defaultdict behavior, preserved from the original).
        """
        return v2 in self[v1] or v1 in self[v2]

    def degree(self, nodes=None):
        """Return the degree of a single node, or ``{node: degree}`` for an iterable."""
        if isinstance(nodes, Iterable):
            return {v: len(self[v]) for v in nodes}
        else:
            return len(self[nodes])

    def order(self):
        """Return the number of nodes in the graph."""
        return len(self)

    def number_of_edges(self):
        """Return the number of undirected edges (sum of degrees halved)."""
        # Fixed docstring: the original wrongly said "number of nodes".
        return sum(self.degree(x) for x in self.keys()) / 2

    def number_of_nodes(self):
        """Return the number of nodes in the graph."""
        return self.order()

    def random_walk(self, path_length, alpha=0, rand=random.Random(), start=None):
        """Return a truncated random walk as a list of node-id strings.

        Args:
            path_length: maximum length of the walk.
            alpha: probability of restarting from the walk's first node.
            rand: random source (note: the default instance is shared across
                calls, matching the reference implementation).
            start: start node of the walk; a uniformly random node is chosen
                when falsy (so ``start=0`` also triggers random selection —
                preserved from the original).
        """
        G = self
        if start:
            path = [start]
        else:
            # Sampling is uniform w.r.t. V, and not w.r.t. E.
            path = [rand.choice(list(G.keys()))]
        while len(path) < path_length:
            cur = path[-1]
            if len(G[cur]) > 0:
                if rand.random() >= alpha:
                    path.append(rand.choice(G[cur]))
                else:
                    # Restart: jump back to the walk's origin.
                    path.append(path[0])
            else:
                # Dead end (no outgoing edges): truncate the walk.
                break
        return [str(node) for node in path]
def build_deepwalk_corpus(G, num_paths, path_length, alpha=0, rand=random.Random(0)):
    """Sample `num_paths` truncated random walks starting from every node of G.

    Args:
        G: graph exposing ``nodes()`` and ``random_walk(...)``.
        num_paths: number of walk passes over the node set.
        path_length: maximum length of each walk.
        alpha: restart probability forwarded to ``random_walk``.
        rand: random source used both for shuffling node order and for the
            walks themselves (default instance is shared across calls,
            matching the reference implementation).

    Returns:
        list[list[str]]: every sampled walk, as lists of node-id strings.
    """
    corpus = []
    node_order = list(G.nodes())
    logger = getLogger()
    logger.info('Walk iteration:')
    for iteration in range(num_paths):
        logger.info(str(iteration + 1) + '/' + str(num_paths))
        # Visit the nodes in a fresh random order on every pass.
        rand.shuffle(node_order)
        corpus.extend(
            G.random_walk(path_length, rand=rand, alpha=alpha, start=source)
            for source in node_order
        )
    return corpus
def from_numpy(x, directed=False):
    """Build a ``Graph`` from a dense (numpy) adjacency matrix.

    Args:
        x: 2-D array-like; a truthy ``x[i][j]`` denotes an edge i -> j.
        directed: when False, reverse edges are added so the graph becomes
            undirected.

    Returns:
        Graph: normalized (sorted, deduplicated, self-loop-free) graph.
    """
    graph = Graph()
    # Register every row index as a node, even isolated ones.
    for node in range(x.shape[0]):
        graph[node] = []
    for src in range(x.shape[0]):
        for dst in range(x.shape[1]):
            if x[src][dst]:
                graph[src].append(dst)
    if not directed:
        graph.make_undirected()
    graph.make_consistent()
    return graph
def learn_embeddings(walks, dimensions, window_size, workers, iters, min_count=0, sg=1, hs=0):
    """Train a gensim Word2Vec model on the random-walk corpus.

    Args:
        walks: iterable of walks; tokens are coerced to strings.
        dimensions: embedding size (``vector_size``).
        window_size: skip-gram context window.
        workers: number of training threads.
        iters: number of training epochs.
        min_count: minimum token frequency (0 keeps every node).
        sg: 1 for skip-gram, 0 for CBOW.
        hs: 1 for hierarchical softmax, 0 for negative sampling.

    Returns:
        gensim.models.Word2Vec: the trained model.
    """
    sentences = [[str(token) for token in walk] for walk in walks]
    return Word2Vec(
        sentences, vector_size=dimensions, window=window_size, min_count=min_count,
        sg=sg, hs=hs, workers=workers, epochs=iters)
class DeepWalk(AbstractTraditionModel):
    """DeepWalk network embedding (Perozzi et al., KDD 2014).

    Builds a graph from the adjacency matrix, samples truncated random
    walks, trains a skip-gram Word2Vec model on them, and caches the
    resulting node embeddings (txt / gensim model / npy) plus the
    node-index <-> geo-id mappings.
    """

    def __init__(self, config, data_feature):
        super().__init__(config, data_feature)
        # Graph description supplied by the dataset.
        self.adj_mx = data_feature.get('adj_mx')
        self.num_nodes = data_feature.get('num_nodes', 1)
        self.geo_to_ind = data_feature.get('geo_to_ind', None)
        self.ind_to_geo = data_feature.get('ind_to_geo', None)
        self._logger = getLogger()
        # Hyper-parameters.
        self.output_dim = config.get('output_dim', 64)
        self.is_directed = config.get('is_directed', False)
        self.num_walks = config.get('num_walks', 100)
        self.walk_length = config.get('walk_length', 80)
        self.window_size = config.get('window_size', 10)
        self.num_workers = config.get('num_workers', 10)
        self.iter = config.get('max_epoch', 1000)
        self.alpha = config.get('alpha', 0.0)  # restart probability of the walks
        self.seed = config.get('seed', 0)
        self.model = config.get('model', '')
        self.dataset = config.get('dataset', '')
        self.exp_id = config.get('exp_id', None)
        # Cache locations for the learned embeddings.
        self.txt_cache_file = './libcity/cache/{}/evaluate_cache/embedding_{}_{}_{}.txt'.\
            format(self.exp_id, self.model, self.dataset, self.output_dim)
        self.model_cache_file = './libcity/cache/{}/model_cache/embedding_{}_{}_{}.m'.\
            format(self.exp_id, self.model, self.dataset, self.output_dim)
        self.npy_cache_file = './libcity/cache/{}/evaluate_cache/embedding_{}_{}_{}.npy'.\
            format(self.exp_id, self.model, self.dataset, self.output_dim)

    def run(self, data=None):
        """Train the embedding and persist all artifacts.

        Args:
            data: unused; kept for interface compatibility.
        """
        g = from_numpy(self.adj_mx, self.is_directed)
        walks = build_deepwalk_corpus(g, num_paths=self.num_walks, path_length=self.walk_length,
                                      alpha=self.alpha, rand=random.Random(self.seed))
        model = learn_embeddings(walks=walks, dimensions=self.output_dim,
                                 window_size=self.window_size, workers=self.num_workers,
                                 iters=self.iter, hs=1)
        model.wv.save_word2vec_format(self.txt_cache_file)
        model.save(self.model_cache_file)
        assert len(model.wv) == self.num_nodes
        assert len(model.wv[0]) == self.output_dim
        # Re-read the word2vec txt dump to build an index-aligned matrix:
        # the txt file orders vectors by corpus frequency, not by node index.
        node_embedding = np.zeros(shape=(self.num_nodes, self.output_dim), dtype=np.float32)
        # Fixed: close the file deterministically (the original leaked the handle).
        with open(self.txt_cache_file, mode='r') as f:
            lines = f.readlines()
        for line in lines[1:]:  # first line is the "<count> <dim>" header
            temp = line.split(' ')
            index = int(temp[0])
            node_embedding[index] = temp[1:]
        np.save(self.npy_cache_file, node_embedding)
        self._logger.info('词向量和模型保存完成')
        self._logger.info('词向量维度:(' + str(len(model.wv)) + ',' + str(len(model.wv[0])) + ')')
        # Fixed: the original passed open(..., 'w') directly into json.dump,
        # leaking the handles and relying on GC to flush the files.
        with open('./libcity/cache/{}/evaluate_cache/ind_to_geo_{}.json'.format(
                self.exp_id, self.dataset), 'w') as f:
            json.dump(self.ind_to_geo, f)
        with open('./libcity/cache/{}/evaluate_cache/geo_to_ind_{}.json'.format(
                self.exp_id, self.dataset), 'w') as f:
            json.dump(self.geo_to_ind, f)