From 282a8dbee073e66e0413524df4bd5020664ecc79 Mon Sep 17 00:00:00 2001
From: wangcheng
Date: Thu, 13 Dec 2018 17:02:32 +0800
Subject: [PATCH] Add the data-processing module
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 data/TaxiBJ/README.md | 102 +++++
 data/TaxiBJ/TaxiBJ.py | 366 ++++++++++++++++++
 .../preprocessing/MaxMinNormalization.py | 42 ++
 data/TaxiBJ/preprocessing/STMatrix.py | 127 ++++++
 data/TaxiBJ/preprocessing/timestamp.py | 69 ++++
 5 files changed, 706 insertions(+)
 create mode 100644 data/TaxiBJ/README.md
 create mode 100644 data/TaxiBJ/TaxiBJ.py
 create mode 100644 data/TaxiBJ/preprocessing/MaxMinNormalization.py
 create mode 100644 data/TaxiBJ/preprocessing/STMatrix.py
 create mode 100644 data/TaxiBJ/preprocessing/timestamp.py

diff --git a/data/TaxiBJ/README.md b/data/TaxiBJ/README.md
new file mode 100644
index 0000000..cd4cc40
--- /dev/null
+++ b/data/TaxiBJ/README.md
@@ -0,0 +1,102 @@
TaxiBJ: InFlow/OutFlow, Meteorology and Holidays at Beijing
===========================================================

**If you use the data, please cite the following paper.**

`Junbo Zhang, Yu Zheng, Dekang Qi. Deep Spatio-Temporal Residual Networks for Citywide Crowd Flows Prediction. In AAAI 2017.`

Download the data from [OneDrive](https://1drv.ms/f/s!Akh6N7xv3uVmhOhDKwx3bm5zpHkDOQ) or [BaiduYun](http://pan.baidu.com/s/1qYq7ja8).

Please check the data with the `md5sum` command:
```
md5sum -c md5sum.txt
```

**TaxiBJ** consists of the following **SIX** datasets:

* BJ16_M32x32_T30_InOut.h5
* BJ15_M32x32_T30_InOut.h5
* BJ14_M32x32_T30_InOut.h5
* BJ13_M32x32_T30_InOut.h5
* BJ_Meteorology.h5
* BJ_Holiday.txt

The first four files contain the *crowd flows* in Beijing from 2013 to 2016, `BJ_Meteorology.h5` holds the meteorological data, and `BJ_Holiday.txt` lists the holidays (and adjacent weekends) of Beijing.

Note: each `*.h5` file is an `HDF5` file; you can use the following code to inspect the data:

```
import h5py
f = h5py.File('BJ16_M32x32_T30_InOut.h5', 'r')
for ke in f.keys():
    print(ke, f[ke].shape)
```

## Flows of Crowds

File names: `BJ[YEAR]_M32x32_T30_InOut.h5`, where

* YEAR: one of {13, 14, 15, 16}
* M32x32: Beijing is divided into a 32 x 32 grid map
* T30: a timeslot (a.k.a. time interval) is equal to 30 minutes, so there are 48 timeslots per day
* InOut: inflow/outflow, as defined in the following paper [1]

[1] Junbo Zhang, Yu Zheng, Dekang Qi. Deep Spatio-Temporal Residual Networks for Citywide Crowd Flows Prediction. In AAAI 2017.

Each `*.h5` file has the following two subsets:

* `date`: a list of timeslots, which is associated with the **data** below.
* `data`: a 4D tensor of shape (number_of_timeslots, 2, 32, 32), of which `data[i]` is a 3D tensor of shape (2, 32, 32) at the timeslot `date[i]`; `data[i][0]` is a `32x32` inflow matrix and `data[i][1]` is a `32x32` outflow matrix.
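Since `date` and `data` are aligned by index, the flow maps for a particular timeslot can be looked up as in the following minimal sketch (the timeslot value is illustrative, based on the BJ16 date range shown in the stat output below):

```
import h5py

with h5py.File('BJ16_M32x32_T30_InOut.h5', 'r') as f:
    dates = list(f['date'][:])   # e.g. b'2015110101' = Nov 1, 2015, first slot
    data = f['data'][:]          # shape: (number_of_timeslots, 2, 32, 32)

i = dates.index(b'2015110101')             # position of the timeslot of interest
inflow, outflow = data[i][0], data[i][1]   # two 32 x 32 matrices
print(inflow.shape, outflow.shape, inflow.max())
```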
### Example

You can get the data info with the following command:
```
python -c "from deepst.datasets import stat; stat('BJ16_M32x32_T30_InOut.h5')"
```

The output looks like:
```
=====stat=====
data shape: (7220, 2, 32, 32)
# of days: 162, from 2015-11-01 to 2016-04-10
# of timeslots: 7776
# of timeslots (available): 7220
missing ratio of timeslots: 7.2%
max: 1250.000, min: 0.000
=====stat=====
```

## Meteorology

File name: `BJ_Meteorology.h5`, which has the following four subsets:

* `date`: a list of timeslots, which is associated with the following kinds of data.
* `Temperature`: a list of continuous values, of which the `i^{th}` value is the `temperature` at the timeslot `date[i]`.
* `WindSpeed`: a list of continuous values, of which the `i^{th}` value is the `wind speed` at the timeslot `date[i]`.
* `Weather`: a 2D matrix, each row of which is a one-hot vector (`dim=17`) encoding one of the following weather types:
```
Sunny = 0,
Cloudy = 1,
Overcast = 2,
Rainy = 3,
Sprinkle = 4,
ModerateRain = 5,
HeavyRain = 6,
Rainstorm = 7,
Thunderstorm = 8,
FreezingRain = 9,
Snowy = 10,
LightSnow = 11,
ModerateSnow = 12,
HeavySnow = 13,
Foggy = 14,
Sandstorm = 15,
Dusty = 16,
```

## Holiday

File name: `BJ_Holiday.txt`, which includes a list of the holidays (and adjacent weekends) of Beijing.

Each line is a holiday in the format [yyyy][mm][dd]. For example, `20150601` is `June 1st, 2015`.
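A minimal sketch of using the holiday list (this mirrors the lookup done in `TaxiBJ.py`; the example date is the one above):

```
holidays = set(line.strip() for line in open('BJ_Holiday.txt'))
print('20150601' in holidays)  # True if June 1st, 2015 is a listed holiday
```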
diff --git a/data/TaxiBJ/TaxiBJ.py b/data/TaxiBJ/TaxiBJ.py
new file mode 100644
index 0000000..19055ab
--- /dev/null
+++ b/data/TaxiBJ/TaxiBJ.py
@@ -0,0 +1,366 @@
# -*- coding: utf-8 -*-
"""
Load the TaxiBJ data from multiple sources: crowd flows, meteorology and holidays.
"""
from __future__ import print_function

import os, sys

sys.path.append('../../')
import time
import pickle
from copy import copy
import numpy as np
import h5py

from data.TaxiBJ.preprocessing.STMatrix import STMatrix
from data.TaxiBJ.preprocessing.timestamp import timestamp2vec
from data.TaxiBJ.preprocessing.MaxMinNormalization import MinMaxNormalization

# parameters
DATAPATH = os.path.dirname(os.path.abspath(__file__))
CACHEPATH = os.path.join(DATAPATH, 'CACHE')


def load_holiday(timeslots, fname=os.path.join(DATAPATH, 'BJ_Holiday.txt')):
    """
    Load the holiday data.
    :param timeslots: list of timeslot strings, e.g. b'2015060101'
    :param fname: path to the holiday file
    :return: a column vector such as [[1], [1], [0], [0], [0], ...],
             where an entry is 1 if the corresponding timeslot falls on a holiday
    """
    with open(fname, 'r') as f:
        holidays = set(h.strip() for h in f.readlines())
    H = np.zeros(len(timeslots))
    for i, slot in enumerate(timeslots):
        # h5py timeslots are bytes while the file holds str; decode before the lookup
        key = slot[:8].decode() if isinstance(slot, bytes) else slot[:8]
        if key in holidays:
            H[i] = 1
    return H[:, None]  # reshape to 2-D: (len(timeslots), 1)


def load_meteorol(timeslots, fname=os.path.join(DATAPATH, 'BJ_Meteorology.h5')):
    '''
    Load the meteorological features for the predicted timeslots.
    In the real world we do not have the meteorological data at the predicted
    timeslot, so we use the data from the previous timeslot instead, i.e.,
    slot = predicted_slot - 1 (predicted meteorological data could be used as well).
    '''
    f = h5py.File(fname, 'r')
    Timeslot = f['date'].value
    WindSpeed = f['WindSpeed'].value
    Weather = f['Weather'].value
    Temperature = f['Temperature'].value
    f.close()

    M = dict()  # map timeslot to index
    for i, slot in enumerate(Timeslot):
        M[slot] = i

    WS = []  # WindSpeed
    WR = []  # Weather
    TE = []  # Temperature
    for slot in timeslots:
        predicted_id = M[slot]
        cur_id = predicted_id - 1
        WS.append(WindSpeed[cur_id])
        WR.append(Weather[cur_id])
        TE.append(Temperature[cur_id])

    WS = np.asarray(WS)
    WR = np.asarray(WR)
    TE = np.asarray(TE)

    # 0-1 scale
    WS = 1. * (WS - WS.min()) / (WS.max() - WS.min())
    TE = 1. * (TE - TE.min()) / (TE.max() - TE.min())

    print("meteorol shape: ", WS.shape, WR.shape, TE.shape)

    # concatenate all these attributes
    merge_data = np.hstack([WR, WS[:, None], TE[:, None]])
    return merge_data
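# A hypothetical usage sketch (assumes the BJ_* files sit next to this script
# and that the listed timeslots exist in the meteorology file):
#
#   holiday = load_holiday([b'2015060101', b'2015060201'])   # -> shape (2, 1)
#   meteo = load_meteorol([b'2015060101', b'2015060201'])    # -> shape (2, 19)
#
# Each meteorology row is 17 one-hot weather entries + scaled wind speed + scaled temperature.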
def load_stdata(fname):
    """
    Load the data and the dates (timestamps) from an h5 file.
    :param fname: path to the h5 file
    :return: (data, timestamps)
    """
    f = h5py.File(fname, 'r')
    data = f['data'].value
    timestamps = f['date'].value
    f.close()
    return data, timestamps


def stat(fname):
    """
    Report the number of valid timeslots in the given file. The output looks like:

    ==========stat==========
    data shape: (7220, 2, 32, 32)
    # of days: 162, from 2015-11-01 to 2016-04-10
    # of timeslots: 7776
    # of timeslots (available): 7220
    missing ratio of timeslots: 7.2%
    max: 1250.000, min: 0.000
    ==========stat==========
    """

    def get_nb_timeslot(f):
        """Count the total number of timeslots spanned by the given file."""
        s = f['date'][0]
        e = f['date'][-1]
        year, month, day = map(int, [s[:4], s[4:6], s[6:8]])
        ts = time.strptime("%04i-%02i-%02i" % (year, month, day), "%Y-%m-%d")
        year, month, day = map(int, [e[:4], e[4:6], e[6:8]])
        te = time.strptime("%04i-%02i-%02i" % (year, month, day), "%Y-%m-%d")
        nb_timeslot = (time.mktime(te) - time.mktime(ts)) / (0.5 * 3600) + 48
        time_s_str, time_e_str = time.strftime("%Y-%m-%d", ts), time.strftime("%Y-%m-%d", te)
        return nb_timeslot, time_s_str, time_e_str

    with h5py.File(fname, 'r') as f:
        nb_timeslot, time_s_str, time_e_str = get_nb_timeslot(f)
        nb_day = int(nb_timeslot / 48)
        mmax = f['data'].value.max()
        mmin = f['data'].value.min()
        stat = '=' * 10 + 'stat' + '=' * 10 + '\n' + \
               'data shape: %s\n' % str(f['data'].shape) + \
               '# of days: %i, from %s to %s\n' % (nb_day, time_s_str, time_e_str) + \
               '# of timeslots: %i\n' % int(nb_timeslot) + \
               '# of timeslots (available): %i\n' % f['date'].shape[0] + \
               'missing ratio of timeslots: %.1f%%\n' % ((1. - float(f['date'].shape[0] / nb_timeslot)) * 100) + \
               'max: %.3f, min: %.3f\n' % (mmax, mmin) + \
               '=' * 10 + 'stat' + '=' * 10
        print(stat)


def remove_incomplete_days(data, timestamps, T=48):
    """
    Remove any day that does not have all T (48) timestamps.
    :param data: 4D flow tensor
    :param timestamps: list of timeslot strings aligned with `data`
    :param T: number of timeslots per day
    :return: (data, timestamps) with incomplete days dropped
    """

    days = []  # complete days; some days only contain a subset of the slots
    days_incomplete = []
    i = 0
    while i < len(timestamps):
        if int(timestamps[i][8:]) != 1:
            i += 1
        elif i + T - 1 < len(timestamps) and int(timestamps[i + T - 1][8:]) == T:
            days.append(timestamps[i][:8])
            i += T
        else:
            days_incomplete.append(timestamps[i][:8])
            i += 1
    print("incomplete days: ", days_incomplete)
    days = set(days)
    idx = []
    for i, t in enumerate(timestamps):
        if t[:8] in days:
            idx.append(i)

    data = data[idx]
    timestamps = [timestamps[i] for i in idx]
    return data, timestamps
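# A hypothetical usage sketch (assumes the BJ16 file is present locally):
#
#   data, timestamps = load_stdata(os.path.join(DATAPATH, 'BJ16_M32x32_T30_InOut.h5'))
#   data, timestamps = remove_incomplete_days(data, timestamps, T=48)
#   # afterwards len(timestamps) is a multiple of 48 and data.shape[0] == len(timestamps)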
def load_dataset(T=48, nb_flow=2, len_closeness=None, len_period=None, len_trend=None,
                 len_test=None, preprocess_name='preprocessing.pkl',
                 meta_data=True, meteorol_data=True, holiday_data=True):
    """
    Load and assemble the processed dataset.
    :param T: number of timeslots per day
    :param nb_flow: number of flow channels (inflow/outflow)
    :param len_closeness: length of the closeness sequence
    :param len_period: length of the period sequence
    :param len_trend: length of the trend sequence
    :param len_test: number of timeslots reserved for the test set
    :param preprocess_name: file name for the pickled scaler
    :param meta_data: include the time (day-of-week/weekday) feature
    :param meteorol_data: include the meteorological feature
    :param holiday_data: include the holiday feature
    :return: X_train, Y_train, X_test, Y_test, mmn, metadata_dim, timestamp_train, timestamp_test
    """
    assert (len_closeness + len_period + len_trend > 0)
    # load data for the years 13-16
    data_all = []
    timestamps_all = list()
    for year in range(13, 17):
        fname = os.path.join(
            DATAPATH, 'BJ{}_M32x32_T30_InOut.h5'.format(year))
        print("file name: ", fname)
        stat(fname)
        data, timestamps = load_stdata(fname)
        # remove days that do not have all 48 timestamps
        data, timestamps = remove_incomplete_days(data, timestamps, T)
        data = data[:, :nb_flow]
        data[data < 0] = 0.
        data_all.append(data)
        timestamps_all.append(timestamps)
        print("\n")

    # min-max scale, fitted on the training portion only
    data_train = np.vstack(copy(data_all))[:-len_test]
    print('train_data shape: ', data_train.shape)

    mmn = MinMaxNormalization()
    mmn.fit(data_train)
    data_all_mmn = [mmn.transform(d) for d in data_all]
    fpkl = open(os.path.join(CACHEPATH, preprocess_name), 'wb')
    for obj in [mmn]:
        pickle.dump(obj, fpkl)  # persist the fitted [-1, 1] scaler
    fpkl.close()

    print(timestamps_all[0][:10])
    XC, XP, XT = [], [], []
    Y = []
    timestamps_Y = []
    for data, timestamps in zip(data_all_mmn, timestamps_all):
        # instance-based dataset --> sequences in (X, Y) format, where X is
        # a sequence of images and Y is an image.
        st = STMatrix(data, timestamps, T, CheckComplete=False)
        _XC, _XP, _XT, _Y, _timestamps_Y = st.create_dataset(
            len_closeness=len_closeness, len_period=len_period, len_trend=len_trend)
        XC.append(_XC)
        XP.append(_XP)
        XT.append(_XT)
        Y.append(_Y)
        timestamps_Y += _timestamps_Y  # e.g. [b'2013102232', b'2013102233', b'2013102234', ...]

    meta_feature = []
    if meta_data:
        # load the time feature
        time_feature = timestamp2vec(timestamps_Y)  # array of shape (?, 8)
        meta_feature.append(time_feature)
    if holiday_data:
        # load the holiday feature
        holiday_feature = load_holiday(timestamps_Y)
        meta_feature.append(holiday_feature)
    if meteorol_data:
        # load the meteorological feature
        meteorol_feature = load_meteorol(timestamps_Y)
        meta_feature.append(meteorol_feature)

    meta_feature = np.hstack(meta_feature) if len(
        meta_feature) > 0 else np.asarray(meta_feature)
    metadata_dim = meta_feature.shape[1] if len(
        meta_feature.shape) > 1 else None
    if metadata_dim is not None and metadata_dim < 1:  # guard: None cannot be compared with 1
        metadata_dim = None
    if meta_data and holiday_data and meteorol_data:
        print('time feature:', time_feature.shape, 'holiday feature:', holiday_feature.shape,
              'meteorol feature: ', meteorol_feature.shape, 'meta feature: ', meta_feature.shape)

    XC = np.vstack(XC)  # e.g. shape = [15072, 6, 32, 32]
    XP = np.vstack(XP)  # e.g. shape = [15072, 2, 32, 32]
    XT = np.vstack(XT)  # e.g. shape = [15072, 2, 32, 32]
    Y = np.vstack(Y)    # e.g. shape = [15072, 2, 32, 32]
    # reshape to (samples, 32, 32, channels)
    XC = XC.reshape(XC.shape[0], 32, 32, -1)
    XP = XP.reshape(XP.shape[0], 32, 32, -1)
    XT = XT.reshape(XT.shape[0], 32, 32, -1)
    Y = Y.reshape(Y.shape[0], 32, 32, -1)
    print("XC shape: ", XC.shape, "XP shape: ", XP.shape, "XT shape: ", XT.shape, "Y shape:", Y.shape)

    XC_train, XP_train, XT_train, Y_train = XC[:-len_test], XP[:-len_test], XT[:-len_test], Y[:-len_test]
    XC_test, XP_test, XT_test, Y_test = XC[-len_test:], XP[-len_test:], XT[-len_test:], Y[-len_test:]
    timestamp_train, timestamp_test = timestamps_Y[:-len_test], timestamps_Y[-len_test:]
    X_train = []
    X_test = []
    for l, X_ in zip([len_closeness, len_period, len_trend], [XC_train, XP_train, XT_train]):
        if l > 0:
            X_train.append(X_)
    for l, X_ in zip([len_closeness, len_period, len_trend], [XC_test, XP_test, XT_test]):
        if l > 0:
            X_test.append(X_)
    print('XC_train shape:', XC_train.shape, Y_train.shape, 'XC_test shape: ', XC_test.shape, Y_test.shape)

    if metadata_dim is not None:
        meta_feature_train, meta_feature_test = meta_feature[:-len_test], meta_feature[-len_test:]
        X_train.append(meta_feature_train)
        X_test.append(meta_feature_test)

    for _X in X_train:
        print(_X.shape, )
    print()
    for _X in X_test:
        print(_X.shape, )
    print()
    return X_train, Y_train, X_test, Y_test, mmn, metadata_dim, timestamp_train, timestamp_test
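# A hypothetical call sketch (mirrors the __main__ below; the sample count
# depends on the data actually present):
#
#   X_train, Y_train, X_test, Y_test, mmn, metadata_dim, ts_train, ts_test = \
#       load_dataset(len_closeness=3, len_period=1, len_trend=1, len_test=28 * 48)
#
# X_train is then the list [XC, XP, XT, meta_feature] with shapes
# (?, 32, 32, 6), (?, 32, 32, 2), (?, 32, 32, 2) and (?, metadata_dim).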
def cache(fname, X_train, Y_train, X_test, Y_test, external_dim, timestamp_train, timestamp_test):
    """Write the prepared dataset to an h5 cache file (see `read_cache`)."""
    h5 = h5py.File(fname, 'w')
    h5.create_dataset('num', data=len(X_train))

    for i, data in enumerate(X_train):
        h5.create_dataset('X_train_%i' % i, data=data)
    for i, data in enumerate(X_test):
        h5.create_dataset('X_test_%i' % i, data=data)
    h5.create_dataset('Y_train', data=Y_train)
    h5.create_dataset('Y_test', data=Y_test)
    external_dim = -1 if external_dim is None else int(external_dim)  # -1 is the sentinel for "no metadata"
    h5.create_dataset('external_dim', data=external_dim)
    h5.create_dataset('T_train', data=timestamp_train)
    h5.create_dataset('T_test', data=timestamp_test)
    h5.close()


def read_cache(fname):
    """Read a previously cached dataset (see `cache`)."""
    with open(os.path.join(CACHEPATH, 'preprocessing.pkl'), 'rb') as fpkl:
        mmn = pickle.load(fpkl)
    f = h5py.File(fname, 'r')
    num = int(f['num'].value)
    X_train, Y_train, X_test, Y_test = [], [], [], []
    for i in range(num):
        X_train.append(f['X_train_%i' % i].value)
        X_test.append(f['X_test_%i' % i].value)
    Y_train = f['Y_train'].value
    Y_test = f['Y_test'].value
    external_dim = f['external_dim'].value
    external_dim = None if external_dim == -1 else int(external_dim)  # undo the -1 sentinel used in `cache`
    timestamp_train = f['T_train'].value
    timestamp_test = f['T_test'].value
    f.close()
    return X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test


def load_data(len_closeness, len_period, len_trend, len_test, meta_data=True, meteorol_data=True, holiday_data=True):
    fname = os.path.join(CACHEPATH, 'TaxiBJ_C{}_P{}_T{}.h5'.format(len_closeness, len_period, len_trend))
    if os.path.exists(fname):
        X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test = read_cache(
            fname)
        print("load %s successfully" % fname)
    else:
        if not os.path.isdir(CACHEPATH):
            os.mkdir(CACHEPATH)
        X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test = \
            load_dataset(len_closeness=len_closeness, len_period=len_period, len_trend=len_trend,
                         len_test=len_test, meta_data=meta_data, meteorol_data=meteorol_data,
                         holiday_data=holiday_data)
        cache(fname, X_train, Y_train, X_test, Y_test, external_dim, timestamp_train, timestamp_test)
    return X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test


if __name__ == "__main__":
    X_train, Y_train, X_test, Y_test, mmn, external_dim, timestamp_train, timestamp_test = \
        load_data(len_closeness=3, len_period=1, len_trend=1, len_test=28 * 48)
diff --git a/data/TaxiBJ/preprocessing/MaxMinNormalization.py b/data/TaxiBJ/preprocessing/MaxMinNormalization.py
new file mode 100644
index 0000000..05bc9dc
--- /dev/null
+++ b/data/TaxiBJ/preprocessing/MaxMinNormalization.py
@@ -0,0 +1,42 @@
# @Time    : 2018/12/4 14:01
# @Email   : wangchengo@126.com
# @File    : MaxMinNormalization.py
# package version:
#   python 3.6
#   sklearn 0.20.0
#   numpy 1.15.2
#   tensorflow 1.5.0
import numpy as np

np.random.seed(1337)  # for reproducibility


class MinMaxNormalization(object):
    """
    Feature scaling to [-1, 1] (min-max normalization):
        x = (x - min) / (max - min)
        x = x * 2 - 1
    """

    def __init__(self):
        pass

    def fit(self, X):
        self._min = X.min()
        self._max = X.max()
        print("min:", self._min, "max:", self._max)

    def transform(self, X):
        X = 1. * (X - self._min) / (self._max - self._min)
        X = X * 2. - 1.
        return X

    def fit_transform(self, X):
        self.fit(X)
        return self.transform(X)

    def inverse_transform(self, X):
        X = (X + 1.) / 2.
        X = 1. * X * (self._max - self._min) + self._min
        return X
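
# A minimal self-check sketch (synthetic data, not part of the original pipeline):
if __name__ == '__main__':
    X = np.arange(12, dtype=float).reshape(3, 4)
    mmn = MinMaxNormalization()
    Xn = mmn.fit_transform(X)                    # values now lie in [-1, 1]
    assert np.allclose(mmn.inverse_transform(Xn), X)
    print(Xn.min(), Xn.max())                    # -1.0 1.0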
diff --git a/data/TaxiBJ/preprocessing/STMatrix.py b/data/TaxiBJ/preprocessing/STMatrix.py
new file mode 100644
index 0000000..78b932a
--- /dev/null
+++ b/data/TaxiBJ/preprocessing/STMatrix.py
@@ -0,0 +1,127 @@
# @Time    : 2018/12/4 14:57
# @Email   : wangchengo@126.com
# @File    : STMatrix.py
# package version:
#   python 3.6
#   sklearn 0.20.0
#   numpy 1.15.2
#   tensorflow 1.5.0
import numpy as np
import pandas as pd
from .timestamp import string2timestamp


class STMatrix(object):
    """docstring for STMatrix"""

    def __init__(self, data, timestamps, T=48, CheckComplete=True):
        super(STMatrix, self).__init__()
        assert len(data) == len(timestamps)
        self.data = data
        self.timestamps = timestamps  # e.g. [b'2013070101', b'2013070102']
        self.T = T
        self.pd_timestamps = string2timestamp(timestamps, T=self.T)
        if CheckComplete:
            self.check_complete()
        # build an index: map each timestamp to its position in the series
        self.make_index()

    def make_index(self):
        self.get_index = dict()
        for i, ts in enumerate(self.pd_timestamps):
            self.get_index[ts] = i

    def check_complete(self):
        missing_timestamps = []
        offset = pd.DateOffset(minutes=24 * 60 // self.T)
        pd_timestamps = self.pd_timestamps
        i = 1
        while i < len(pd_timestamps):
            if pd_timestamps[i - 1] + offset != pd_timestamps[i]:
                missing_timestamps.append("(%s -- %s)" % (pd_timestamps[i - 1], pd_timestamps[i]))
            i += 1
        for v in missing_timestamps:
            print(v)
        assert len(missing_timestamps) == 0

    def get_matrix(self, timestamp):
        # return the data matrix corresponding to the given timestamp
        return self.data[self.get_index[timestamp]]

    def save(self, fname):
        pass

    def check_it(self, depends):
        for d in depends:
            if d not in self.get_index.keys():
                return False
        return True

    def create_dataset(self, len_closeness=3, len_trend=3, TrendInterval=7, len_period=3, PeriodInterval=1):
        """
        Turn the flow series into supervised (X, Y) samples, where X consists of
        closeness, period and trend sequences and Y is the flow map to predict.
        """
        offset_frame = pd.DateOffset(minutes=24 * 60 // self.T)  # one-slot offset (30 minutes when T=48)
        XC = []
        XP = []
        XT = []
        Y = []
        timestamps_Y = []
        depends = [range(1, len_closeness + 1),
                   [PeriodInterval * self.T * j for j in range(1, len_period + 1)],
                   [TrendInterval * self.T * j for j in range(1, len_trend + 1)]]
        # with the defaults, depends = [range(1, 4), [48, 96, 144], [336, 672, 1008]]
        i = max(self.T * TrendInterval * len_trend, self.T * PeriodInterval * len_period, len_closeness)
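        # Worked example (an illustration, not extra logic): with T=48 and the
        # defaults above, the first usable index is i = max(48*7*3, 48*1*3, 3) = 1008,
        # i.e. the first sample sits three full weeks into the series so that all
        # trend dependencies exist.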
        while i < len(self.pd_timestamps):
            Flag = True
            for depend in depends:
                if Flag is False:
                    break
                Flag = self.check_it([self.pd_timestamps[i] - j * offset_frame for j in depend])

            if Flag is False:
                i += 1
                continue
            x_c = [self.get_matrix(self.pd_timestamps[i] - j * offset_frame) for j in depends[0]]
            # "Closeness": the len_closeness timeslots immediately before the current one.
            # For example, if the current time is Timestamp('2013-07-01 00:00:00'), the
            # sequence is the in-out flow at
            # [Timestamp('2013-06-30 23:30:00'), Timestamp('2013-06-30 23:00:00'), Timestamp('2013-06-30 22:30:00')].
            x_p = [self.get_matrix(self.pd_timestamps[i] - j * offset_frame) for j in depends[1]]
            # "Period": the in-out flow at the same timeslot 1*PeriodInterval, 2*PeriodInterval, ...,
            # len_period*PeriodInterval days earlier (by default: 1, 2 and 3 days before).
            x_t = [self.get_matrix(self.pd_timestamps[i] - j * offset_frame) for j in depends[2]]
            # "Trend": the in-out flow at the same timeslot 1*TrendInterval, 2*TrendInterval, ...,
            # len_trend*TrendInterval days earlier (by default: 7, 14 and 21 days before).
            y = self.get_matrix(self.pd_timestamps[i])
            if len_closeness > 0:
                XC.append(np.vstack(x_c))
                # e.g. a.shape = [2, 32, 32] and b.shape = [2, 32, 32]
                # --> np.vstack((a, b)).shape = [4, 32, 32]
            if len_period > 0:
                XP.append(np.vstack(x_p))
            if len_trend > 0:
                XT.append(np.vstack(x_t))
            Y.append(y)
            timestamps_Y.append(self.timestamps[i])
            i += 1
        XC = np.asarray(XC)  # "closeness" data, shape [?, 2*len_closeness, 32, 32]
        XP = np.asarray(XP)  # "period" data (day-scale), shape [?, 2*len_period, 32, 32]
        XT = np.asarray(XT)  # "trend" data (week-scale), shape [?, 2*len_trend, 32, 32]
        Y = np.asarray(Y)    # shape [?, 2, 32, 32]
        print("XC shape: ", XC.shape, "XP shape: ", XP.shape, "XT shape: ", XT.shape, "Y shape:", Y.shape)
        return XC, XP, XT, Y, timestamps_Y


if __name__ == '__main__':
    # Note: because of the relative import above, run this as a module, e.g.
    #   python -m data.TaxiBJ.preprocessing.STMatrix
    ts_str = ['2013070101']  # renamed to avoid shadowing the builtin `str`
    t = string2timestamp(ts_str)
    offset_frame = pd.DateOffset(minutes=24 * 60 // 48)  # one-slot offset: 30 minutes
    print(t)
    o = [t[0] - j * offset_frame for j in range(1, 4)]
    print(o)
diff --git a/data/TaxiBJ/preprocessing/timestamp.py b/data/TaxiBJ/preprocessing/timestamp.py
new file mode 100644
index 0000000..a1f4f88
--- /dev/null
+++ b/data/TaxiBJ/preprocessing/timestamp.py
@@ -0,0 +1,69 @@
# @Time    : 2018/12/5 9:28
# @Email   : wangchengo@126.com
# @File    : timestamp.py
# package version:
#   python 3.6
#   sklearn 0.20.0
#   numpy 1.15.2
#   tensorflow 1.5.0

import time
import pandas as pd
import numpy as np
from datetime import datetime


def string2timestamp(strings, T=48):
    """
    Convert string-typed times into pandas Timestamps.
    :param strings: e.g. [b'2013070101', b'2013070102']
    :param T: number of timeslots per day
    :return: e.g. [Timestamp('2013-07-01 00:00:00'), Timestamp('2013-07-01 00:30:00')]
    """
    timestamps = []

    time_per_slot = 24.0 / T
    num_per_T = T // 24
    for t in strings:
        year, month, day, slot = int(t[:4]), int(t[4:6]), int(t[6:8]), int(t[8:]) - 1
        timestamps.append(pd.Timestamp(datetime(year, month, day, hour=int(slot * time_per_slot),
                                                minute=(slot % num_per_T) * int(60.0 * time_per_slot))))

    return timestamps


def timestamp2vec(timestamps):
    """
    Convert timestamps into vectors encoding the day of week and whether it is a weekday.
    :param timestamps: e.g. [b'2018120505', b'2018120106']
    :return:
        [[0 0 1 0 0 0 0 1]   # a Wednesday, and a weekday
         [0 0 0 0 0 1 0 0]]  # a Saturday, and a weekend day
    """
    # tm_wday is in the range [0, 6]; Monday is 0
    vec = [time.strptime(str(t[:8], encoding='utf-8'), '%Y%m%d').tm_wday for t in timestamps]  # python3
    # vec = [time.strptime(t[:8], '%Y%m%d').tm_wday for t in timestamps]  # python2
    ret = []
    for i in vec:
        v = [0 for _ in range(7)]
        v[i] = 1
        if i >= 5:
            v.append(0)  # weekend
        else:
            v.append(1)  # weekday
        ret.append(v)
    return np.asarray(ret)


if __name__ == "__main__":
    t = [b'2018120505', b'2018120106']
    print(timestamp2vec(t))
    print([0 for _ in range(7)])
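    # An extra sanity check (expected output taken from the string2timestamp docstring):
    print(string2timestamp([b'2013070101', b'2013070102']))
    # -> [Timestamp('2013-07-01 00:00:00'), Timestamp('2013-07-01 00:30:00')]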