Source code for libcity.data.dataset.trajectory_encoder.lstpm_encoder

import os
import pandas as pd
import numpy as np
import math
from libcity.data.dataset.trajectory_encoder.abstract_trajectory_encoder import AbstractTrajectoryEncoder
from libcity.utils import parse_time
from libcity.utils.dataset import parse_coordinate
from collections import defaultdict

parameter_list = ['dataset', 'min_session_len', 'min_sessions', 'traj_encoder', 'window_size', 'min_checkins',
                  'max_session_len']


[docs]def geodistance(lat1, lng1, lat2, lng2):
    lng1, lat1, lng2, lat2 = map(math.radians, [float(lng1), float(lat1), float(lng2), float(lat2)])
    dlon = lng2-lng1
    dlat = lat2-lat1
    a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2
    distance = 2*math.asin(math.sqrt(a))*6371*1000
    distance = round(distance/1000, 3)
    return distance


[docs]class LstpmEncoder(AbstractTrajectoryEncoder):

    def __init__(self, config):
        super().__init__(config)
        self.uid = 0
        self.location2id = {}  # 因为原始数据集中的部分 loc id 不会被使用到因此这里需要重新编码一下
        self.id2location = {}
        self.loc_id = 0
        self.tim_max = 47  # LSTPM 做的是 48 个 time slot
        self.feature_dict = {'history_loc': 'array of int', 'history_tim': 'array of int',
                             'current_loc': 'int', 'current_tim': 'int', 'dilated_rnn_input_index': 'no_pad_int',
                             'history_avg_distance': 'no_pad_float',
                             'target': 'int', 'uid': 'int'}
        if config['evaluate_method'] == 'sample':
            self.feature_dict['neg_loc'] = 'int'
            parameter_list.append('neg_samples')
        parameters_str = ''
        for key in parameter_list:
            if key in self.config:
                parameters_str += '_' + str(self.config[key])
        self.cache_file_name = os.path.join(
            './libcity/cache/dataset_cache/', 'trajectory_{}.json'.format(parameters_str))
        self.poi_profile = pd.read_csv('./raw_data/{}/{}.geo'.format(self.config['dataset'], self.config['dataset']))
        self.time_checkin_set = defaultdict(set)

[docs]    def encode(self, uid, trajectories, negative_sample=None):
        """standard encoder use the same method as DeepMove

        Recode poi id. Encode timestamp with its hour.

        Args:
            uid ([type]): same as AbstractTrajectoryEncoder
            trajectories ([type]): same as AbstractTrajectoryEncoder
                trajectory1 = [
                    (location ID, timestamp, timezone_offset_in_minutes),
                    (location ID, timestamp, timezone_offset_in_minutes),
                    .....
                ]
        """
        # 直接对 uid 进行重编码
        uid = self.uid
        self.uid += 1
        encoded_trajectories = []
        history_loc = []
        history_loc_central = []
        history_tim = []
        for index, traj in enumerate(trajectories):
            current_loc = []
            current_tim = []
            for point in traj:
                loc = point[4]
                now_time = parse_time(point[2])
                if loc not in self.location2id:
                    self.location2id[loc] = self.loc_id
                    self.id2location[self.loc_id] = loc
                    self.loc_id += 1
                current_loc.append(self.location2id[loc])
                time_code = self._time_encode(now_time)
                current_tim.append(time_code)
                if time_code not in self.time_checkin_set:
                    self.time_checkin_set[time_code] = set()
                self.time_checkin_set[time_code].add(self.location2id[loc])
            # 完成当前轨迹的编码，下面进行输入的形成
            if index == 0:
                # 因为要历史轨迹特征，所以第一条轨迹是不能构成模型输入的
                history_loc.append(current_loc)
                history_tim.append(current_tim)
                lon = []
                lat = []
                for poi in current_loc:
                    lon_cur, lat_cur = parse_coordinate(self.poi_profile.loc[self.poi_profile['geo_id']
                                                        == self.id2location[poi]].iloc[0]['coordinates'])
                    lon.append(lon_cur)
                    lat.append(lat_cur)
                history_loc_central.append((np.mean(lat), np.mean(lon)))
                continue
            # 一条轨迹可以生成多个数据点
            for i in range(len(current_loc) - 1):
                trace = []
                target = current_loc[i+1]
                dilated_rnn_input_index = self._create_dilated_rnn_input(current_loc[:i+1])
                history_avg_distance = self._gen_distance_matrix(current_loc[:i+1], history_loc_central)
                trace.append(history_loc.copy())
                trace.append(history_tim.copy())
                trace.append(current_loc[:i+1])
                trace.append(current_tim[:i+1])
                trace.append(dilated_rnn_input_index)
                trace.append(history_avg_distance)
                trace.append(target)
                trace.append(uid)
                if negative_sample is not None:
                    neg_loc = []
                    for neg in negative_sample[index]:
                        if neg not in self.location2id:
                            self.location2id[neg] = self.loc_id
                            self.loc_id += 1
                        neg_loc.append(self.location2id[neg])
                    trace.append(neg_loc)
                encoded_trajectories.append(trace)
            history_loc.append(current_loc)
            history_tim.append(current_tim)
            # calculate current_loc
            lon = []
            lat = []
            for poi in current_loc:
                lon_cur, lat_cur = parse_coordinate(self.poi_profile.loc[self.poi_profile['geo_id']
                                                    == self.id2location[poi]].iloc[0]['coordinates'])
                lon.append(lon_cur)
                lat.append(lat_cur)
            history_loc_central.append((np.mean(lat), np.mean(lon)))
        return encoded_trajectories

[docs]    def gen_data_feature(self):
        loc_pad = self.loc_id
        tim_pad = self.tim_max + 1
        self.pad_item = {
            'current_loc': loc_pad,
            'current_tim': tim_pad
        }
        # generate time_sim_matrix
        # the pad time will not appear here
        sim_matrix = np.zeros((self.tim_max+1, self.tim_max+1))
        for i in range(self.tim_max+1):
            sim_matrix[i][i] = 1
            for j in range(i+1, self.tim_max+1):
                set_i = self.time_checkin_set[i]
                set_j = self.time_checkin_set[j]
                if len(set_i | set_j) != 0:
                    jaccard_ij = len(set_i & set_j) / len(set_i | set_j)
                    sim_matrix[i][j] = jaccard_ij
                    sim_matrix[j][i] = jaccard_ij
        self.data_feature = {
            'loc_size': self.loc_id + 1,
            'tim_size': self.tim_max + 2,
            'uid_size': self.uid,
            'loc_pad': loc_pad,
            'tim_pad': tim_pad,
            'tim_sim_matrix': sim_matrix.tolist()
        }

    def _create_dilated_rnn_input(self, current_loc):
        current_loc.reverse()
        sequence_length = len(current_loc)
        session_dilated_rnn_input_index = [0] * sequence_length
        for i in range(sequence_length - 1):
            current_poi = current_loc[i]
            poi_before = current_loc[i + 1:]
            current_poi_profile = self.poi_profile.loc[self.poi_profile['geo_id']
                                                       == self.id2location[current_poi]].iloc[0]
            lon_cur, lat_cur = parse_coordinate(current_poi_profile['coordinates'])
            distance_row_explicit = []
            for target in poi_before:
                lon, lat = parse_coordinate(self.poi_profile.loc[self.poi_profile['geo_id']
                                            == self.id2location[target]].iloc[0]['coordinates'])
                distance_row_explicit.append(geodistance(lat_cur, lon_cur, lat, lon))
            index_closet = np.argmin(distance_row_explicit).item()
            # reverse back
            session_dilated_rnn_input_index[sequence_length - i - 1] = sequence_length - 2 - index_closet - i
        current_loc.reverse()
        return session_dilated_rnn_input_index

    def _gen_distance_matrix(self, current_loc, history_loc_central):
        # 使用 profile 计算当前位置与历史轨迹中心点之间的距离
        history_avg_distance = []  # history_session_count
        now_loc = current_loc[-1]
        lon_cur, lat_cur = parse_coordinate(self.poi_profile.loc[self.poi_profile['geo_id']
                                            == self.id2location[now_loc]].iloc[0]['coordinates'])
        for central in history_loc_central:
            dis = geodistance(central[0], central[1], lat_cur, lon_cur)
            if dis < 1:
                dis = 1
            history_avg_distance.append(dis)
        return history_avg_distance

    def _time_encode(self, time):
        if time.weekday() in [0, 1, 2, 3, 4]:
            return time.hour
        else:
            return time.hour + 24