From 282a8dbee073e66e0413524df4bd5020664ecc79 Mon Sep 17 00:00:00 2001
From: wangcheng
Date: Thu, 13 Dec 2018 17:02:32 +0800
Subject: [PATCH] Add the data-processing module
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 data/TaxiBJ/README.md | 102 +++++
 data/TaxiBJ/TaxiBJ.py | 366 ++++++++++++++++++
 .../preprocessing/MaxMinNormalization.py | 42 ++
 data/TaxiBJ/preprocessing/STMatrix.py | 127 ++++++
 data/TaxiBJ/preprocessing/timestamp.py | 69 ++++
 5 files changed, 706 insertions(+)
 create mode 100644 data/TaxiBJ/README.md
 create mode 100644 data/TaxiBJ/TaxiBJ.py
 create mode 100644 data/TaxiBJ/preprocessing/MaxMinNormalization.py
 create mode 100644 data/TaxiBJ/preprocessing/STMatrix.py
 create mode 100644 data/TaxiBJ/preprocessing/timestamp.py

diff --git a/data/TaxiBJ/README.md b/data/TaxiBJ/README.md
new file mode 100644
index 0000000..cd4cc40
--- /dev/null
+++ b/data/TaxiBJ/README.md
@@ -0,0 +1,102 @@
TaxiBJ: InFlow/OutFlow, Meteorology and Holidays at Beijing
===========================================================

**If you use the data, please cite the following paper.**

`Junbo Zhang, Yu Zheng, Dekang Qi. Deep Spatio-Temporal Residual Networks for Citywide Crowd Flows Prediction. In AAAI 2017.`

Download the data from [OneDrive](https://1drv.ms/f/s!Akh6N7xv3uVmhOhDKwx3bm5zpHkDOQ) or [BaiduYun](http://pan.baidu.com/s/1qYq7ja8).

Please check the data with the `md5sum` command:
```
md5sum -c md5sum.txt
```

**TaxiBJ** consists of the following **SIX** datasets:

* BJ16_M32x32_T30_InOut.h5
* BJ15_M32x32_T30_InOut.h5
* BJ14_M32x32_T30_InOut.h5
* BJ13_M32x32_T30_InOut.h5
* BJ_Meteorology.h5
* BJ_Holiday.txt

The first four files contain the *crowd flows* in Beijing from 2013 to 2016, `BJ_Meteorology.h5` holds the meteorological data, and `BJ_Holiday.txt` lists the holidays (and adjacent weekends) of Beijing.

Note: each `*.h5` file is an `HDF5` file; you can use the following code to inspect the data:

```
import h5py
f = h5py.File('BJ16_M32x32_T30_InOut.h5', 'r')
for ke in f.keys():
    print(ke, f[ke].shape)
```

## Flows of Crowds

File names: `BJ[YEAR]_M32x32_T30_InOut.h5`, where

* YEAR: one of {13, 14, 15, 16}
* M32x32: Beijing is divided into a 32 x 32 grid map
* T30: a timeslot (a.k.a. time interval) is equal to 30 minutes, so there are 48 timeslots per day
* InOut: inflow/outflow, as defined in the following paper [1]

[1] Junbo Zhang, Yu Zheng, Dekang Qi. Deep Spatio-Temporal Residual Networks for Citywide Crowd Flows Prediction. In AAAI 2017.

Each `*.h5` file has the following two subsets:

* `date`: a list of timeslots, which is associated with the **data** below.
* `data`: a 4D tensor of shape (number_of_timeslots, 2, 32, 32), of which `data[i]` is a 3D tensor of shape (2, 32, 32) at the timeslot `date[i]`; `data[i][0]` is a `32x32` inflow matrix and `data[i][1]` is a `32x32` outflow matrix.
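Since `date` and `data` are aligned by index, the flow maps for a particular timeslot can be looked up as in the following minimal sketch (the timeslot value is illustrative, based on the BJ16 date range shown in the stat output below):

```
import h5py

with h5py.File('BJ16_M32x32_T30_InOut.h5', 'r') as f:
    dates = list(f['date'][:])   # e.g. b'2015110101' = Nov 1, 2015, first slot
    data = f['data'][:]          # shape: (number_of_timeslots, 2, 32, 32)

i = dates.index(b'2015110101')             # position of the timeslot of interest
inflow, outflow = data[i][0], data[i][1]   # two 32 x 32 matrices
print(inflow.shape, outflow.shape, inflow.max())
```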
### Example

You can get the data info with the following command:
```
python -c "from deepst.datasets import stat; stat('BJ16_M32x32_T30_InOut.h5')"
```

The output looks like:
```
=====stat=====
data shape: (7220, 2, 32, 32)
# of days: 162, from 2015-11-01 to 2016-04-10
# of timeslots: 7776
# of timeslots (available): 7220
missing ratio of timeslots: 7.2%
max: 1250.000, min: 0.000
=====stat=====
```

## Meteorology

File name: `BJ_Meteorology.h5`, which has the following four subsets:

* `date`: a list of timeslots, which is associated with the following kinds of data.
* `Temperature`: a list of continuous values, of which the `i^{th}` value is the `temperature` at the timeslot `date[i]`.
* `WindSpeed`: a list of continuous values, of which the `i^{th}` value is the `wind speed` at the timeslot `date[i]`.
* `Weather`: a 2D matrix, each row of which is a one-hot vector (`dim=17`) encoding one of the following weather types:
```
Sunny = 0,
Cloudy = 1,
Overcast = 2,
Rainy = 3,
Sprinkle = 4,
ModerateRain = 5,
HeavyRain = 6,
Rainstorm = 7,
Thunderstorm = 8,
FreezingRain = 9,
Snowy = 10,
LightSnow = 11,
ModerateSnow = 12,
HeavySnow = 13,
Foggy = 14,
Sandstorm = 15,
Dusty = 16,
```

## Holiday

File name: `BJ_Holiday.txt`, which includes a list of the holidays (and adjacent weekends) of Beijing.

Each line is a holiday in the format [yyyy][mm][dd]. For example, `20150601` is `June 1st, 2015`.
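A minimal sketch of using the holiday list (this mirrors the lookup done in `TaxiBJ.py`; the example date is the one above):

```
holidays = set(line.strip() for line in open('BJ_Holiday.txt'))
print('20150601' in holidays)  # True if June 1st, 2015 is a listed holiday
```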
diff --git a/data/TaxiBJ/TaxiBJ.py b/data/TaxiBJ/TaxiBJ.py
new file mode 100644
index 0000000..19055ab
--- /dev/null
+++ b/data/TaxiBJ/TaxiBJ.py
@@ -0,0 +1,366 @@
# -*- coding: utf-8 -*-
"""
Load the TaxiBJ data from multiple sources: crowd flows, meteorology and holidays.
"""
from __future__ import print_function

import os, sys

sys.path.append('../../')
import time
import pickle
from copy import copy
import numpy as np
import h5py

from data.TaxiBJ.preprocessing.STMatrix import STMatrix
from data.TaxiBJ.preprocessing.timestamp import timestamp2vec
from data.TaxiBJ.preprocessing.MaxMinNormalization import MinMaxNormalization

# parameters
DATAPATH = os.path.dirname(os.path.abspath(__file__))
CACHEPATH = os.path.join(DATAPATH, 'CACHE')


def load_holiday(timeslots, fname=os.path.join(DATAPATH, 'BJ_Holiday.txt')):
    """
    Load the holiday data.
    :param timeslots: list of timeslot strings, e.g. b'2015060101'
    :param fname: path to the holiday file
    :return: a column vector such as [[1], [1], [0], [0], [0], ...],
             where an entry is 1 if the corresponding timeslot falls on a holiday
    """
    with open(fname, 'r') as f:
        holidays = set(h.strip() for h in f.readlines())
    H = np.zeros(len(timeslots))
    for i, slot in enumerate(timeslots):
        # h5py timeslots are bytes while the file holds str; decode before the lookup
        key = slot[:8].decode() if isinstance(slot, bytes) else slot[:8]
        if key in holidays:
            H[i] = 1
    return H[:, None]  # reshape to 2-D: (len(timeslots), 1)


def load_meteorol(timeslots, fname=os.path.join(DATAPATH, 'BJ_Meteorology.h5')):
    '''
    Load the meteorological features for the predicted timeslots.
    In the real world we do not have the meteorological data at the predicted
    timeslot, so we use the data from the previous timeslot instead, i.e.,
    slot = predicted_slot - 1 (predicted meteorological data could be used as well).
    '''
    f = h5py.File(fname, 'r')
    Timeslot = f['date'].value
    WindSpeed = f['WindSpeed'].value
    Weather = f['Weather'].value
    Temperature = f['Temperature'].value
    f.close()

    M = dict()  # map timeslot to index
    for i, slot in enumerate(Timeslot):
        M[slot] = i

    WS = []  # WindSpeed
    WR = []  # Weather
    TE = []  # Temperature
    for slot in timeslots:
        predicted_id = M[slot]
        cur_id = predicted_id - 1
        WS.append(WindSpeed[cur_id])
        WR.append(Weather[cur_id])
        TE.append(Temperature[cur_id])

    WS = np.asarray(WS)
    WR = np.asarray(WR)
    TE = np.asarray(TE)

    # 0-1 scale
    WS = 1. * (WS - WS.min()) / (WS.max() - WS.min())
    TE = 1. * (TE - TE.min()) / (TE.max() - TE.min())

    print("meteorol shape: ", WS.shape, WR.shape, TE.shape)

    # concatenate all these attributes
    merge_data = np.hstack([WR, WS[:, None], TE[:, None]])
    return merge_data
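# A hypothetical usage sketch (assumes the BJ_* files sit next to this script
# and that the listed timeslots exist in the meteorology file):
#
#   holiday = load_holiday([b'2015060101', b'2015060201'])   # -> shape (2, 1)
#   meteo = load_meteorol([b'2015060101', b'2015060201'])    # -> shape (2, 19)
#
# Each meteorology row is 17 one-hot weather entries + scaled wind speed + scaled temperature.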
def load_stdata(fname):
    """
    Load the data and the dates (timestamps) from an h5 file.
    :param fname: path to the h5 file
    :return: (data, timestamps)
    """
    f = h5py.File(fname, 'r')
    data = f['data'].value
    timestamps = f['date'].value
    f.close()
    return data, timestamps


def stat(fname):
    """
    Report the number of valid timeslots in the given file. The output looks like:

    ==========stat==========
    data shape: (7220, 2, 32, 32)
    # of days: 162, from 2015-11-01 to 2016-04-10
    # of timeslots: 7776
    # of timeslots (available): 7220
    missing ratio of timeslots: 7.2%
    max: 1250.000, min: 0.000
    ==========stat==========
    """

    def get_nb_timeslot(f):
        """Count the total number of timeslots spanned by the given file."""
        s = f['date'][0]
        e = f['date'][-1]
        year, month, day = map(int, [s[:4], s[4:6], s[6:8]])
        ts = time.strptime("%04i-%02i-%02i" % (year, month, day), "%Y-%m-%d")
        year, month, day = map(int, [e[:4], e[4:6], e[6:8]])
        te = time.strptime("%04i-%02i-%02i" % (year, month, day), "%Y-%m-%d")
        nb_timeslot = (time.mktime(te) - time.mktime(ts)) / (0.5 * 3600) + 48
        time_s_str, time_e_str = time.strftime("%Y-%m-%d", ts), time.strftime("%Y-%m-%d", te)
        return nb_timeslot, time_s_str, time_e_str

    with h5py.File(fname, 'r') as f:
        nb_timeslot, time_s_str, time_e_str = get_nb_timeslot(f)
        nb_day = int(nb_timeslot / 48)
        mmax = f['data'].value.max()
        mmin = f['data'].value.min()
        stat = '=' * 10 + 'stat' + '=' * 10 + '\n' + \
               'data shape: %s\n' % str(f['data'].shape) + \
               '# of days: %i, from %s to %s\n' % (nb_day, time_s_str, time_e_str) + \
               '# of timeslots: %i\n' % int(nb_timeslot) + \
               '# of timeslots (available): %i\n' % f['date'].shape[0] + \
               'missing ratio of timeslots: %.1f%%\n' % ((1. - float(f['date'].shape[0] / nb_timeslot)) * 100) + \
               'max: %.3f, min: %.3f\n' % (mmax, mmin) + \
               '=' * 10 + 'stat' + '=' * 10
        print(stat)


def remove_incomplete_days(data, timestamps, T=48):
    """
    Remove any day that does not have all T (48) timestamps.
    :param data: 4D flow tensor
    :param timestamps: list of timeslot strings aligned with `data`
    :param T: number of timeslots per day
    :return: (data, timestamps) with incomplete days dropped
    """

    days = []  # complete days; some days only contain a subset of the slots
    days_incomplete = []
    i = 0
    while i < len(timestamps):
        if int(timestamps[i][8:]) != 1:
            i += 1
        elif i + T - 1 < len(timestamps) and int(timestamps[i + T - 1][8:]) == T:
            days.append(timestamps[i][:8])
            i += T
        else:
            days_incomplete.append(timestamps[i][:8])
            i += 1
    print("incomplete days: ", days_incomplete)
    days = set(days)
    idx = []
    for i, t in enumerate(timestamps):
        if t[:8] in days:
            idx.append(i)

    data = data[idx]
    timestamps = [timestamps[i] for i in idx]
    return data, timestamps
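# A hypothetical usage sketch (assumes the BJ16 file is present locally):
#
#   data, timestamps = load_stdata(os.path.join(DATAPATH, 'BJ16_M32x32_T30_InOut.h5'))
#   data, timestamps = remove_incomplete_days(data, timestamps, T=48)
#   # afterwards len(timestamps) is a multiple of 48 and data.shape[0] == len(timestamps)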
def load_dataset(T=48, nb_flow=2, len_closeness=None, len_period=None, len_trend=None,
                 len_test=None, preprocess_name='preprocessing.pkl',
                 meta_data=True, meteorol_data=True, holiday_data=True):
    """
    Load and assemble the processed dataset.
    :param T: number of timeslots per day
    :param nb_flow: number of flow channels (inflow/outflow)
    :param len_closeness: length of the closeness sequence
    :param len_period: length of the period sequence
    :param len_trend: length of the trend sequence
    :param len_test: number of timeslots reserved for the test set
    :param preprocess_name: file name for the pickled scaler
    :param meta_data: include the time (day-of-week/weekday) feature
    :param meteorol_data: include the meteorological feature
    :param holiday_data: include the holiday feature
    :return: X_train, Y_train, X_test, Y_test, mmn, metadata_dim, timestamp_train, timestamp_test
    """
    assert (len_closeness + len_period + len_trend > 0)
    # load data for the years 13-16
    data_all = []
    timestamps_all = list()
    for year in range(13, 17):
        fname = os.path.join(
            DATAPATH, 'BJ{}_M32x32_T30_InOut.h5'.format(year))
        print("file name: ", fname)
        stat(fname)
        data, timestamps = load_stdata(fname)
        # remove days that do not have all 48 timestamps
        data, timestamps = remove_incomplete_days(data, timestamps, T)
        data = data[:, :nb_flow]
        data[data < 0] = 0.
        data_all.append(data)
        timestamps_all.append(timestamps)
        print("\n")

    # min-max scale, fitted on the training portion only
    data_train = np.vstack(copy(data_all))[:-len_test]
    print('train_data shape: ', data_train.shape)

    mmn = MinMaxNormalization()
    mmn.fit(data_train)
    data_all_mmn = [mmn.transform(d) for d in data_all]
    fpkl = open(os.path.join(CACHEPATH, preprocess_name), 'wb')
    for obj in [mmn]:
        pickle.dump(obj, fpkl)  # persist the fitted [-1, 1] scaler
    fpkl.close()

    print(timestamps_all[0][:10])
    XC, XP, XT = [], [], []
    Y = []
    timestamps_Y = []
    for data, timestamps in zip(data_all_mmn, timestamps_all):
        # instance-based dataset --> sequences in (X, Y) format, where X is
        # a sequence of images and Y is an image.
        st = STMatrix(data, timestamps, T, CheckComplete=False)
        _XC, _XP, _XT, _Y, _timestamps_Y = st.create_dataset(
            len_closeness=len_closeness, len_period=len_period, len_trend=len_trend)
        XC.append(_XC)
        XP.append(_XP)
        XT.append(_XT)
        Y.append(_Y)
        timestamps_Y += _timestamps_Y  # e.g. [b'2013102232', b'2013102233', b'2013102234', ...]

    meta_feature = []
    if meta_data:
        # load the time feature
        time_feature = timestamp2vec(timestamps_Y)  # array of shape (?, 8)
        meta_feature.append(time_feature)
    if holiday_data:
        # load the holiday feature
        holiday_feature = load_holiday(timestamps_Y)
        meta_feature.append(holiday_feature)
    if meteorol_data:
        # load the meteorological feature
        meteorol_feature = load_meteorol(timestamps_Y)
        meta_feature.append(meteorol_feature)

    meta_feature = np.hstack(meta_feature) if len(
        meta_feature) > 0 else np.asarray(meta_feature)
    metadata_dim = meta_feature.shape[1] if len(
        meta_feature.shape) > 1 else None
    if metadata_dim is not None and metadata_dim < 1:  # guard: None cannot be compared with 1
        metadata_dim = None
    if meta_data and holiday_data and meteorol_data:
        print('time feature:', time_feature.shape, 'holiday feature:', holiday_feature.shape,
              'meteorol feature: ', meteorol_feature.shape, 'meta feature: ', meta_feature.shape)

    XC = np.vstack(XC)  # e.g. shape = [15072, 6, 32, 32]
    XP = np.vstack(XP)  # e.g. shape = [15072, 2, 32, 32]
    XT = np.vstack(XT)  # e.g. shape = [15072, 2, 32, 32]
    Y = np.vstack(Y)    # e.g. shape = [15072, 2, 32, 32]
    # reshape to (samples, 32, 32, channels)
    XC = XC.reshape(XC.shape[0], 32, 32, -1)
    XP = XP.reshape(XP.shape[0], 32, 32, -1)
    XT = XT.reshape(XT.shape[0], 32, 32, -1)
    Y = Y.reshape(Y.shape[0], 32, 32, -1)
    print("XC shape: ", XC.shape, "XP shape: ", XP.shape, "XT shape: ", XT.shape, "Y shape:", Y.shape)

    XC_train, XP_train, XT_train, Y_train = XC[:-len_test], XP[:-len_test], XT[:-len_test], Y[:-len_test]
    XC_test, XP_test, XT_test, Y_test = XC[-len_test:], XP[-len_test:], XT[-len_test:], Y[-len_test:]
    timestamp_train, timestamp_test = timestamps_Y[:-len_test], timestamps_Y[-len_test:]
    X_train = []
    X_test = []
    for l, X_ in zip([len_closeness, len_period, len_trend], [XC_train, XP_train, XT_train]):
        if l > 0:
            X_train.append(X_)
    for l, X_ in zip([len_closeness, len_period, len_trend], [XC_test, XP_test, XT_test]):
        if l > 0:
            X_test.append(X_)
    print('XC_train shape:', XC_train.shape, Y_train.shape, 'XC_test shape: ', XC_test.shape, Y_test.shape)

    if metadata_dim is not None:
        meta_feature_train, meta_feature_test = meta_feature[:-len_test], meta_feature[-len_test:]
        X_train.append(meta_feature_train)
        X_test.append(meta_feature_test)

    for _X in X_train:
        print(_X.shape, )
    print()
    for _X in X_test:
        print(_X.shape, )
    print()
    return X_train, Y_train, X_test, Y_test, mmn, metadata_dim, timestamp_train, timestamp_test
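# A hypothetical call sketch (mirrors the __main__ below; the sample count
# depends on the data actually present):
#
#   X_train, Y_train, X_test, Y_test, mmn, metadata_dim, ts_train, ts_test = \
#       load_dataset(len_closeness=3, len_period=1, len_trend=1, len_test=28 * 48)
#
# X_train is then the list [XC, XP, XT, meta_feature] with shapes
# (?, 32, 32, 6), (?, 32, 32, 2), (?, 32, 32, 2) and (?, metadata_dim).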
def cache(fname, X_train, Y_train, X_test, Y_test, external_dim, timestamp_train, timestamp_test):
    """Write the prepared dataset to an h5 cache file (see `read_cache`)."""
    h5 = h5py.File(fname, 'w')
    h5.create_dataset('num', data=len(X_train))

    for i, data in enumerate(X_train):
        h5.create_dataset('X_train_%i' % i, data=data)
    for i, data in enumerate(X_test):
        h5.create_dataset('X_test_%i' % i, data=data)
    h5.create_dataset('Y_train', data=Y_train)
    h5.create_dataset('Y_test', data=Y_test)
    external_dim = -1 if external_dim is None else int(external_dim)  # -1 is the sentinel for "no metadata"
    h5.create_dataset('external_dim', data=external_dim)
    h5.create_dataset('T_train', data=timestamp_train)
    h5.create_dataset('T_test', data=timestamp_test)
    h5.close()


def read_cache(fname):
    """Read a previously cached dataset (see `cache`)."""
    with open(os.path.join(CACHEPATH, 'preprocessing.pkl'), 'rb') as fpkl:
        mmn = pickle.load(fpkl)
    f = h5py.File(fname, 'r')
    num = int(f['num'].value)
    X_train, Y_train, X_test, Y_test = [], [], [], []
    for i in range(num):
        X_train.append(f['X_train_%i' % i].value)
        X_test.append(f['X_test_%i' % i].value)
    Y_train = f['Y_train'].value
    Y_test = f['Y_test'].value
    external_dim = f['external_dim'].value
    external_dim = None if external_dim == -1 else int(external_dim)  # undo the -1 sentinel used in `cache`
    timestamp_train = f['T_train'].value
    timestamp_test = f['T_test'].value
    f.close()
    return X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test


def load_data(len_closeness, len_period, len_trend, len_test, meta_data=True, meteorol_data=True, holiday_data=True):
    fname = os.path.join(CACHEPATH, 'TaxiBJ_C{}_P{}_T{}.h5'.format(len_closeness, len_period, len_trend))
    if os.path.exists(fname):
        X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test = read_cache(
            fname)
        print("load %s successfully" % fname)
    else:
        if not os.path.isdir(CACHEPATH):
            os.mkdir(CACHEPATH)
        X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test = \
            load_dataset(len_closeness=len_closeness, len_period=len_period, len_trend=len_trend,
                         len_test=len_test, meta_data=meta_data, meteorol_data=meteorol_data,
                         holiday_data=holiday_data)
        cache(fname, X_train, Y_train, X_test, Y_test, external_dim, timestamp_train, timestamp_test)
    return X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test


if __name__ == "__main__":
    X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test = \
        load_data(len_closeness=3, len_period=1, len_trend=1, len_test=28 * 48)
diff --git a/data/TaxiBJ/preprocessing/MaxMinNormalization.py b/data/TaxiBJ/preprocessing/MaxMinNormalization.py
new file mode 100644
index 0000000..05bc9dc
--- /dev/null
+++ b/data/TaxiBJ/preprocessing/MaxMinNormalization.py
@@ -0,0 +1,42 @@
# @Time    : 2018/12/4 14:01
# @Email   : wangchengo@126.com
# @File    : MaxMinNormalization.py
# package version:
#   python 3.6
#   sklearn 0.20.0
#   numpy 1.15.2
#   tensorflow 1.5.0
import numpy as np

np.random.seed(1337)  # for reproducibility


class MinMaxNormalization(object):
    """
    Feature scaling to [-1, 1] (min-max normalization):
        x = (x - min) / (max - min)
        x = x * 2 - 1
    """

    def __init__(self):
        pass

    def fit(self, X):
        self._min = X.min()
        self._max = X.max()
        print("min:", self._min, "max:", self._max)

    def transform(self, X):
        X = 1. * (X - self._min) / (self._max - self._min)
        X = X * 2. - 1.
        return X

    def fit_transform(self, X):
        self.fit(X)
        return self.transform(X)

    def inverse_transform(self, X):
        X = (X + 1.) / 2.
        X = 1. * X * (self._max - self._min) + self._min
        return X
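
# A minimal self-check sketch (synthetic data, not part of the original pipeline):
if __name__ == '__main__':
    X = np.arange(12, dtype=float).reshape(3, 4)
    mmn = MinMaxNormalization()
    Xn = mmn.fit_transform(X)                    # values now lie in [-1, 1]
    assert np.allclose(mmn.inverse_transform(Xn), X)
    print(Xn.min(), Xn.max())                    # -1.0 1.0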
diff --git a/data/TaxiBJ/preprocessing/STMatrix.py b/data/TaxiBJ/preprocessing/STMatrix.py
new file mode 100644
index 0000000..78b932a
--- /dev/null
+++ b/data/TaxiBJ/preprocessing/STMatrix.py
@@ -0,0 +1,127 @@
# @Time    : 2018/12/4 14:57
# @Email   : wangchengo@126.com
# @File    : STMatrix.py
# package version:
#   python 3.6
#   sklearn 0.20.0
#   numpy 1.15.2
#   tensorflow 1.5.0
import numpy as np
import pandas as pd
from .timestamp import string2timestamp


class STMatrix(object):
    """docstring for STMatrix"""

    def __init__(self, data, timestamps, T=48, CheckComplete=True):
        super(STMatrix, self).__init__()
        assert len(data) == len(timestamps)
        self.data = data
        self.timestamps = timestamps  # e.g. [b'2013070101', b'2013070102']
        self.T = T
        self.pd_timestamps = string2timestamp(timestamps, T=self.T)
        if CheckComplete:
            self.check_complete()
        # build an index: map each timestamp to its position in the series
        self.make_index()

    def make_index(self):
        self.get_index = dict()
        for i, ts in enumerate(self.pd_timestamps):
            self.get_index[ts] = i

    def check_complete(self):
        missing_timestamps = []
        offset = pd.DateOffset(minutes=24 * 60 // self.T)
        pd_timestamps = self.pd_timestamps
        i = 1
        while i < len(pd_timestamps):
            if pd_timestamps[i - 1] + offset != pd_timestamps[i]:
                missing_timestamps.append("(%s -- %s)" % (pd_timestamps[i - 1], pd_timestamps[i]))
            i += 1
        for v in missing_timestamps:
            print(v)
        assert len(missing_timestamps) == 0

    def get_matrix(self, timestamp):
        # return the data matrix corresponding to the given timestamp
        return self.data[self.get_index[timestamp]]

    def save(self, fname):
        pass

    def check_it(self, depends):
        for d in depends:
            if d not in self.get_index.keys():
                return False
        return True

    def create_dataset(self, len_closeness=3, len_trend=3, TrendInterval=7, len_period=3, PeriodInterval=1):
        """
        Turn the flow series into supervised (X, Y) samples, where X consists of
        closeness, period and trend sequences and Y is the flow map to predict.
        """
        offset_frame = pd.DateOffset(minutes=24 * 60 // self.T)  # one-slot offset (30 minutes when T=48)
        XC = []
        XP = []
        XT = []
        Y = []
        timestamps_Y = []
        depends = [range(1, len_closeness + 1),
                   [PeriodInterval * self.T * j for j in range(1, len_period + 1)],
                   [TrendInterval * self.T * j for j in range(1, len_trend + 1)]]
        # with the defaults, depends = [range(1, 4), [48, 96, 144], [336, 672, 1008]]
        i = max(self.T * TrendInterval * len_trend, self.T * PeriodInterval * len_period, len_closeness)
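        # Worked example (an illustration, not extra logic): with T=48 and the
        # defaults above, the first usable index is i = max(48*7*3, 48*1*3, 3) = 1008,
        # i.e. the first sample sits three full weeks into the series so that all
        # trend dependencies exist.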
        while i < len(self.pd_timestamps):
            Flag = True
            for depend in depends:
                if Flag is False:
                    break
                Flag = self.check_it([self.pd_timestamps[i] - j * offset_frame for j in depend])

            if Flag is False:
                i += 1
                continue
            x_c = [self.get_matrix(self.pd_timestamps[i] - j * offset_frame) for j in depends[0]]
            # "Closeness": the len_closeness timeslots immediately before the current one.
            # For example, if the current time is Timestamp('2013-07-01 00:00:00'), the
            # sequence is the in-out flow at
            # [Timestamp('2013-06-30 23:30:00'), Timestamp('2013-06-30 23:00:00'), Timestamp('2013-06-30 22:30:00')].
            x_p = [self.get_matrix(self.pd_timestamps[i] - j * offset_frame) for j in depends[1]]
            # "Period": the in-out flow at the same timeslot 1*PeriodInterval, 2*PeriodInterval, ...,
            # len_period*PeriodInterval days earlier (by default: 1, 2 and 3 days before).
            x_t = [self.get_matrix(self.pd_timestamps[i] - j * offset_frame) for j in depends[2]]
            # "Trend": the in-out flow at the same timeslot 1*TrendInterval, 2*TrendInterval, ...,
            # len_trend*TrendInterval days earlier (by default: 7, 14 and 21 days before).
            y = self.get_matrix(self.pd_timestamps[i])
            if len_closeness > 0:
                XC.append(np.vstack(x_c))
                # e.g. a.shape = [2, 32, 32] and b.shape = [2, 32, 32]
                # --> np.vstack((a, b)).shape = [4, 32, 32]
            if len_period > 0:
                XP.append(np.vstack(x_p))
            if len_trend > 0:
                XT.append(np.vstack(x_t))
            Y.append(y)
            timestamps_Y.append(self.timestamps[i])
            i += 1
        XC = np.asarray(XC)  # "closeness" data, shape [?, 2*len_closeness, 32, 32]
        XP = np.asarray(XP)  # "period" data (day-scale), shape [?, 2*len_period, 32, 32]
        XT = np.asarray(XT)  # "trend" data (week-scale), shape [?, 2*len_trend, 32, 32]
        Y = np.asarray(Y)    # shape [?, 2, 32, 32]
        print("XC shape: ", XC.shape, "XP shape: ", XP.shape, "XT shape: ", XT.shape, "Y shape:", Y.shape)
        return XC, XP, XT, Y, timestamps_Y


if __name__ == '__main__':
    # Note: because of the relative import above, run this as a module, e.g.
    #   python -m data.TaxiBJ.preprocessing.STMatrix
    ts_str = ['2013070101']  # renamed to avoid shadowing the builtin `str`
    t = string2timestamp(ts_str)
    offset_frame = pd.DateOffset(minutes=24 * 60 // 48)  # one-slot offset: 30 minutes
    print(t)
    o = [t[0] - j * offset_frame for j in range(1, 4)]
    print(o)
diff --git a/data/TaxiBJ/preprocessing/timestamp.py b/data/TaxiBJ/preprocessing/timestamp.py
new file mode 100644
index 0000000..a1f4f88
--- /dev/null
+++ b/data/TaxiBJ/preprocessing/timestamp.py
@@ -0,0 +1,69 @@
# @Time    : 2018/12/5 9:28
# @Email   : wangchengo@126.com
# @File    : timestamp.py
# package version:
#   python 3.6
#   sklearn 0.20.0
#   numpy 1.15.2
#   tensorflow 1.5.0

import time
import pandas as pd
import numpy as np
from datetime import datetime


def string2timestamp(strings, T=48):
    """
    Convert string-typed times into pandas Timestamps.
    :param strings: e.g. [b'2013070101', b'2013070102']
    :param T: number of timeslots per day
    :return: e.g. [Timestamp('2013-07-01 00:00:00'), Timestamp('2013-07-01 00:30:00')]
    """
    timestamps = []

    time_per_slot = 24.0 / T
    num_per_T = T // 24
    for t in strings:
        year, month, day, slot = int(t[:4]), int(t[4:6]), int(t[6:8]), int(t[8:]) - 1
        timestamps.append(pd.Timestamp(datetime(year, month, day, hour=int(slot * time_per_slot),
                                                minute=(slot % num_per_T) * int(60.0 * time_per_slot))))

    return timestamps


def timestamp2vec(timestamps):
    """
    Convert timestamps into vectors encoding the day of week and whether it is a weekday.
    :param timestamps: e.g. [b'2018120505', b'2018120106']
    :return:
        [[0 0 1 0 0 0 0 1]   # a Wednesday, and a weekday
         [0 0 0 0 0 1 0 0]]  # a Saturday, and a weekend day
    """
    # tm_wday is in the range [0, 6]; Monday is 0
    vec = [time.strptime(str(t[:8], encoding='utf-8'), '%Y%m%d').tm_wday for t in timestamps]  # python3
    # vec = [time.strptime(t[:8], '%Y%m%d').tm_wday for t in timestamps]  # python2
    ret = []
    for i in vec:
        v = [0 for _ in range(7)]
        v[i] = 1
        if i >= 5:
            v.append(0)  # weekend
        else:
            v.append(1)  # weekday
        ret.append(v)
    return np.asarray(ret)


if __name__ == "__main__":
    t = [b'2018120505', b'2018120106']
    print(timestamp2vec(t))
    print([0 for _ in range(7)])
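    # An extra sanity check (expected output taken from the string2timestamp docstring):
    print(string2timestamp([b'2013070101', b'2013070102']))
    # -> [Timestamp('2013-07-01 00:00:00'), Timestamp('2013-07-01 00:30:00')]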