import os
import pandas as pd
import numpy as np
import math
from libcity.data.dataset.trajectory_encoder.abstract_trajectory_encoder import AbstractTrajectoryEncoder
from libcity.utils import parse_time
from libcity.utils.dataset import parse_coordinate
from collections import defaultdict
# Config keys whose values (when present in the config) are concatenated into the
# dataset cache file name built by LstpmEncoder.__init__.
parameter_list = [
    'dataset',
    'min_session_len',
    'min_sessions',
    'traj_encoder',
    'window_size',
    'min_checkins',
    'max_session_len',
]
def geodistance(lat1, lng1, lat2, lng2):
    """Return the haversine (great-circle) distance between two points in kilometres.

    Args:
        lat1: latitude of the first point in degrees (anything ``float()`` accepts).
        lng1: longitude of the first point in degrees.
        lat2: latitude of the second point in degrees.
        lng2: longitude of the second point in degrees.

    Returns:
        float: the distance in kilometres, rounded to 3 decimal places. NaN inputs
        propagate to a NaN result.
    """
    lng1, lat1, lng2, lat2 = map(math.radians, [float(lng1), float(lat1), float(lng2), float(lat2)])
    dlon = lng2 - lng1
    dlat = lat2 - lat1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    # Floating-point rounding can push the haversine term marginally outside [0, 1]
    # (e.g. for near-antipodal points), which would make sqrt/asin raise ValueError.
    # Explicit comparisons (rather than min/max) leave NaN untouched.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0
    # 6371 is the mean Earth radius in km; the value is computed in metres first and
    # converted back so the rounding matches the historical results exactly.
    distance = 2 * math.asin(math.sqrt(a)) * 6371 * 1000
    return round(distance / 1000, 3)
class LstpmEncoder(AbstractTrajectoryEncoder):
    """Encode raw trajectories into the features listed in ``feature_dict``.

    Raw location ids are remapped to consecutive integer ids, timestamps are mapped to
    one of 48 time slots (hour of day, plus 24 on Saturday/Sunday), and every proper
    prefix of every non-first trajectory passed to ``encode`` yields one sample.
    """

    def __init__(self, config):
        """Set up the id maps, feature description, cache file name and POI table.

        Args:
            config: configuration mapping; ``config['evaluate_method']`` is read here,
                and ``self.config`` (expected to be provided by the base-class
                initializer) is read for ``'dataset'`` and the cache-name keys.
        """
        super().__init__(config)
        self.uid = 0
        # Some location ids of the raw dataset are never used, so locations are
        # re-encoded with consecutive ids as they are encountered.
        self.location2id = {}
        self.id2location = {}
        self.loc_id = 0
        self.tim_max = 47  # LSTPM works with 48 time slots
        self.feature_dict = {'history_loc': 'array of int', 'history_tim': 'array of int',
                             'current_loc': 'int', 'current_tim': 'int', 'dilated_rnn_input_index': 'no_pad_int',
                             'history_avg_distance': 'no_pad_float',
                             'target': 'int', 'uid': 'int'}
        # Work on a copy: appending to the module-level list would leak 'neg_samples'
        # into the cache name of every encoder constructed afterwards.
        cache_keys = list(parameter_list)
        if config['evaluate_method'] == 'sample':
            self.feature_dict['neg_loc'] = 'int'
            cache_keys.append('neg_samples')
        parameters_str = ''.join('_' + str(self.config[key]) for key in cache_keys if key in self.config)
        self.cache_file_name = os.path.join(
            './libcity/cache/dataset_cache/', 'trajectory_{}.json'.format(parameters_str))
        self.poi_profile = pd.read_csv('./raw_data/{}/{}.geo'.format(self.config['dataset'], self.config['dataset']))
        # time slot -> set of encoded location ids checked in during that slot
        self.time_checkin_set = defaultdict(set)
        # encoded location id -> parsed coordinate, filled lazily by _get_coordinate
        self._coordinate_cache = {}

    def encode(self, uid, trajectories, negative_sample=None):
        """Encode the trajectories of one user into model samples.

        Args:
            uid: ignored; the encoder assigns its own consecutive user id per call.
            trajectories: sequence of trajectories. Each trajectory is a sequence of
                points where ``point[4]`` is the raw location id and ``point[2]`` is a
                time value accepted by ``parse_time``.
            negative_sample: optional container indexed by trajectory index that yields
                the raw location ids to use as negative samples for that trajectory.

        Returns:
            list: one entry per sample, each a list ordered like ``feature_dict``:
            history locations, history time slots, current locations, current time
            slots, dilated RNN input indices, distances to the history centroids,
            target location, uid and, when ``negative_sample`` is given, the encoded
            negative locations.
        """
        # The incoming uid is replaced by a consecutive one.
        uid = self.uid
        self.uid += 1
        encoded_trajectories = []
        history_loc = []
        history_loc_central = []
        history_tim = []
        for index, traj in enumerate(trajectories):
            current_loc = []
            current_tim = []
            for point in traj:
                loc = point[4]
                now_time = parse_time(point[2])
                loc_code = self._encode_location(loc)
                time_code = self._time_encode(now_time)
                current_loc.append(loc_code)
                current_tim.append(time_code)
                self.time_checkin_set[time_code].add(loc_code)
            # The first trajectory only seeds the history: samples need history features.
            if index > 0:
                # One trajectory yields one sample per proper prefix.
                for i in range(len(current_loc) - 1):
                    loc_prefix = current_loc[:i + 1]
                    trace = [
                        history_loc.copy(),
                        history_tim.copy(),
                        loc_prefix,
                        current_tim[:i + 1],
                        self._create_dilated_rnn_input(loc_prefix),
                        self._gen_distance_matrix(loc_prefix, history_loc_central),
                        current_loc[i + 1],
                        uid,
                    ]
                    if negative_sample is not None:
                        # _encode_location also records the reverse mapping, so a negative
                        # location that is visited later can still be resolved to its POI.
                        trace.append([self._encode_location(neg) for neg in negative_sample[index]])
                    encoded_trajectories.append(trace)
            history_loc.append(current_loc)
            history_tim.append(current_tim)
            history_loc_central.append(self._trajectory_centroid(current_loc))
        return encoded_trajectories

    def gen_data_feature(self):
        """Populate ``self.pad_item`` and ``self.data_feature``.

        ``data_feature['tim_sim_matrix']`` holds, for every pair of time slots, the
        Jaccard similarity of the location sets checked in during those slots (1 on
        the diagonal, 0 when both sets are empty). The padding slot is not included.
        """
        loc_pad = self.loc_id
        tim_pad = self.tim_max + 1
        self.pad_item = {
            'current_loc': loc_pad,
            'current_tim': tim_pad
        }
        slot_count = self.tim_max + 1
        sim_matrix = np.eye(slot_count)
        for i in range(slot_count):
            set_i = self.time_checkin_set[i]
            for j in range(i + 1, slot_count):
                set_j = self.time_checkin_set[j]
                union = set_i | set_j
                if union:
                    jaccard_ij = len(set_i & set_j) / len(union)
                    sim_matrix[i][j] = jaccard_ij
                    sim_matrix[j][i] = jaccard_ij
        self.data_feature = {
            'loc_size': self.loc_id + 1,
            'tim_size': self.tim_max + 2,
            'uid_size': self.uid,
            'loc_pad': loc_pad,
            'tim_pad': tim_pad,
            'tim_sim_matrix': sim_matrix.tolist()
        }

    def _encode_location(self, loc):
        """Return the consecutive id of raw location ``loc``, registering it if new."""
        if loc not in self.location2id:
            self.location2id[loc] = self.loc_id
            self.id2location[self.loc_id] = loc
            self.loc_id += 1
        return self.location2id[loc]

    def _get_coordinate(self, loc_id):
        """Return the parsed ``coordinates`` of the POI behind encoded id ``loc_id``.

        The first lookup scans ``poi_profile`` for the matching ``geo_id``; the parsed
        result is cached because the same POIs are requested many times.
        """
        coordinate = self._coordinate_cache.get(loc_id)
        if coordinate is None:
            rows = self.poi_profile.loc[self.poi_profile['geo_id'] == self.id2location[loc_id]]
            coordinate = parse_coordinate(rows.iloc[0]['coordinates'])
            self._coordinate_cache[loc_id] = coordinate
        return coordinate

    def _trajectory_centroid(self, loc_ids):
        """Return ``(mean latitude, mean longitude)`` of the given encoded locations."""
        lon = []
        lat = []
        for poi in loc_ids:
            lon_cur, lat_cur = self._get_coordinate(poi)
            lon.append(lon_cur)
            lat.append(lat_cur)
        return (np.mean(lat), np.mean(lon))

    def _create_dilated_rnn_input(self, current_loc):
        """For each position, return the index of the geographically closest earlier POI.

        Args:
            current_loc: encoded location ids of the current (partial) trajectory; the
                list is not modified.

        Returns:
            list of int: entry ``p`` is the index (< p) of the earlier location nearest
            to ``current_loc[p]``; ties go to the most recent one. Entry 0 is 0.
        """
        sequence_length = len(current_loc)
        session_dilated_rnn_input_index = [0] * sequence_length
        for position in range(1, sequence_length):
            lon_cur, lat_cur = self._get_coordinate(current_loc[position])
            # Earlier POIs ordered from most recent to oldest, so that argmin (which
            # returns the first minimum) prefers the most recent one on ties.
            distance_row_explicit = []
            for target in current_loc[position - 1::-1]:
                lon, lat = self._get_coordinate(target)
                distance_row_explicit.append(geodistance(lat_cur, lon_cur, lat, lon))
            index_closest = np.argmin(distance_row_explicit).item()
            session_dilated_rnn_input_index[position] = position - 1 - index_closest
        return session_dilated_rnn_input_index

    def _gen_distance_matrix(self, current_loc, history_loc_central):
        """Return the distance from the latest location to every history centroid.

        Args:
            current_loc: encoded location ids; only the last one is used.
            history_loc_central: list of ``(lat, lon)`` centroids, one per history
                trajectory.

        Returns:
            list: one distance per history trajectory, with values below 1 raised to 1.
        """
        history_avg_distance = []  # one entry per history session
        lon_cur, lat_cur = self._get_coordinate(current_loc[-1])
        for central in history_loc_central:
            dis = geodistance(central[0], central[1], lat_cur, lon_cur)
            if dis < 1:
                dis = 1
            history_avg_distance.append(dis)
        return history_avg_distance

    def _time_encode(self, time):
        """Map a datetime to a slot: hour (0-23) on Mon-Fri, hour + 24 on Sat/Sun."""
        if time.weekday() < 5:
            return time.hour
        return time.hour + 24