PaddlePaddle · frankwhzhang · May 11, 2022 · May 6, 2022 · May 6, 2022 · May 6, 2022
diff --git a/README_CN.md b/README_CN.md
diff --git a/README_EN.md b/README_EN.md
@@ -159,7 +159,8 @@ python -u tools/static_trainer.py -m models/rank/dnn/config.yaml #  Training wit
   |         Rank          |                     [FLEN](models/rank/flen/)                     |  -  |         ✓         |     ✓     |  >=2.1.0 | [2019][FLEN: Leveraging Field for Scalable CTR Prediction]( https://arxiv.org/pdf/1911.04690.pdf)                                                                                                           |
   |   Rank   |                     [DeepRec](models/rank/deeprec/)                     |  -  |       ✓     |     ✓     | >=2.1.0 | [2017][Training Deep AutoEncoders for Collaborative Filtering](https://arxiv.org/pdf/1708.01715v3.pdf)                                                                                                          |
   |   Rank   |                     [AutoFIS](models/rank/autofis/)                     |  -  |       ✓     |     ✓     | >=2.1.0 | [KDD 2020][AutoFIS: Automatic Feature Interaction Selection in Factorization Models for Click-Through Rate Prediction](https://arxiv.org/pdf/2003.11235v3.pdf)                                                                                                          |
-  |   Rank   |                     [DCN_V2](models/rank/dcn_v2/)                     |  -  |       ✓     |     ✓     | >=2.1.0 | [WWW 2021][DCN V2: Improved Deep & Cross Network and Practical Lessons for Web-scale Learning to Rank Systems](https://arxiv.org/pdf/2008.13535v2.pdf) 
+  |   Rank   |                     [DCN_V2](models/rank/dcn_v2/)                     |  -  |       ✓     |     ✓     | >=2.1.0 | [WWW 2021][DCN V2: Improved Deep & Cross Network and Practical Lessons for Web-scale Learning to Rank Systems](https://arxiv.org/pdf/2008.13535v2.pdf)|
+  |   Rank   |                                                                          [AITM](models/rank/aitm/)                                                                          |  -  |       ✓     |     ✓     | >=2.1.0 | [KDD 2021][Modeling the Sequential Dependence among Audience Multi-step Conversions with Multi-task Learning in Targeted Display Advertising](https://arxiv.org/pdf/2105.08489v2.pdf)  |
   |      Multi-Task       |                  [PLE](models/multitask/ple/)<br>([doc](https://paddlerec.readthedocs.io/en/latest/models/multitask/ple.html))                   |  [Python CPU/GPU](https://aistudio.baidu.com/aistudio/projectdetail/3238938)  |     ✓     |     ✓     |  >=2.1.0 | [RecSys 2020][Progressive Layered Extraction (PLE): A Novel Multi-Task Learning (MTL) Model for Personalized Recommendations](https://dl.acm.org/doi/abs/10.1145/3383313.3412236)                                                              |
   |      Multi-Task       |                  [ESMM](models/multitask/esmm/)<br>([doc](https://paddlerec.readthedocs.io/en/latest/models/multitask/esmm.html))                   |  [Python CPU/GPU](https://aistudio.baidu.com/aistudio/projectdetail/3238583)  |         ✓         |     ✓     |      >=2.1.0     | [SIGIR 2018][Entire Space Multi-Task Model: An Effective Approach for Estimating Post-Click Conversion Rate](https://arxiv.org/abs/1804.07931)                                                              |
   |      Multi-Task       |                  [MMOE](models/multitask/mmoe/)<br>([doc](https://paddlerec.readthedocs.io/en/latest/models/multitask/mmoe.html))                   |  [Python CPU/GPU](https://aistudio.baidu.com/aistudio/projectdetail/3238934)  |         ✓         |     ✓     |      >=2.1.0     | [KDD 2018][Modeling Task Relationships in Multi-task Learning with Multi-gate Mixture-of-Experts](https://dl.acm.org/doi/abs/10.1145/3219819.3220007)                                                       |

diff --git a/datasets/ali-cpp_aitm/process_public_data.py b/datasets/ali-cpp_aitm/process_public_data.py
@@ -0,0 +1,187 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+'''
+process the Ali-CCP (Alibaba Click and Conversion Prediction) dataset.
+https://tianchi.aliyun.com/datalab/dataSet.html?dataId=408
+
+@The author:
+Dongbo Xi (xidongbo@meituan.com)
+'''
+import numpy as np
+import joblib
+import re
+import random
+# Seed Python's and NumPy's global RNGs; `random` drives the train/dev split
+# in process.process_train().
+random.seed(2020)
+np.random.seed(2020)
+# Input templates: '{}' is filled with 'train' or 'test' via str.format().
+data_path = 'data/sample_skeleton_{}.csv'
+common_feat_path = 'data/common_features_{}.csv'
+# Where the filtered vocabulary is dumped/loaded with joblib.
+enum_path = 'data/ctrcvr_enum.pkl'
+# Prefix of the encoded outputs; '.train', '.dev' and '.test' are appended.
+write_path = 'data/ctr_cvr'
+# Feature field ids that are kept, in the column order of the output CSVs.
+use_columns = [
+    '101', '121', '122', '124', '125', '126', '127', '128', '129', '205',
+    '206', '207', '216', '508', '509', '702', '853', '301'
+]
+
+
+class process(object):
+    def __init__(self):
+        pass
+
+    def process_train(self):
+        c = 0
+        common_feat_dict = {}
+        with open(common_feat_path.format('train'), 'r') as fr:
+            for line in fr:
+                line_list = line.strip().split(',')
+                kv = np.array(re.split('\x01|\x02|\x03', line_list[2]))
+                key = kv[range(0, len(kv), 3)]
+                value = kv[range(1, len(kv), 3)]
+                feat_dict = dict(zip(key, value))
+                common_feat_dict[line_list[0]] = feat_dict
+                c += 1
+                if c % 100000 == 0:
+                    print(c)
+        print('join feats...')
+        c = 0
+        vocabulary = dict(
+            zip(use_columns, [{} for _ in range(len(use_columns))]))
+        with open(data_path.format('train') + '.tmp', 'w') as fw:
+            fw.write('click,purchase,' + ','.join(use_columns) + '\n')
+            with open(data_path.format('train'), 'r') as fr:
+                for line in fr:
+                    line_list = line.strip().split(',')
+                    if line_list[1] == '0' and line_list[2] == '1':
+                        continue
+                    kv = np.array(re.split('\x01|\x02|\x03', line_list[5]))
+                    key = kv[range(0, len(kv), 3)]
+                    value = kv[range(1, len(kv), 3)]
+                    feat_dict = dict(zip(key, value))
+                    feat_dict.update(common_feat_dict[line_list[3]])
+                    feats = line_list[1:3]
+                    for k in use_columns:
+                        feats.append(feat_dict.get(k, '0'))
+                    fw.write(','.join(feats) + '\n')
+                    for k, v in feat_dict.items():
+                        if k in use_columns:
+                            if v in vocabulary[k]:
+                                vocabulary[k][v] += 1
+                            else:
+                                vocabulary[k][v] = 0
+                    c += 1
+                    if c % 100000 == 0:
+                        print(c)
+        print('before filter low freq:')
+        for k, v in vocabulary.items():
+            print(k + ':' + str(len(v)))
+        new_vocabulary = dict(
+            zip(use_columns, [set() for _ in range(len(use_columns))]))
+        for k, v in vocabulary.items():
+            for k1, v1 in v.items():
+                if v1 > 10:
+                    new_vocabulary[k].add(k1)
+        vocabulary = new_vocabulary
+        print('after filter low freq:')
+        for k, v in vocabulary.items():
+            print(k + ':' + str(len(v)))
+        joblib.dump(vocabulary, enum_path, compress=3)
+
+        print('encode feats...')
+        vocabulary = joblib.load(enum_path)
+        feat_map = {}
+        for feat in use_columns:
+            feat_map[feat] = dict(
+                zip(vocabulary[feat], range(1, len(vocabulary[feat]) + 1)))
+        c = 0
+        with open(write_path + '.train', 'w') as fw1:
+            with open(write_path + '.dev', 'w') as fw2:
+                fw1.write('click,purchase,' + ','.join(use_columns) + '\n')
+                fw2.write('click,purchase,' + ','.join(use_columns) + '\n')
+                with open(data_path.format('train') + '.tmp', 'r') as fr:
+                    fr.readline()  # remove header
+                    for line in fr:
+                        line_list = line.strip().split(',')
+                        new_line = line_list[:2]
+                        for value, feat in zip(line_list[2:], use_columns):
+                            new_line.append(
+                                str(feat_map[feat].get(value, '0')))
+                        if random.random() >= 0.9:
+                            fw2.write(','.join(new_line) + '\n')
+                        else:
+                            fw1.write(','.join(new_line) + '\n')
+                        c += 1
+                        if c % 100000 == 0:
+                            print(c)
+
+    def process_test(self):
+        c = 0
+        common_feat_dict = {}
+        with open(common_feat_path.format('test'), 'r') as fr:
+            for line in fr:
+                line_list = line.strip().split(',')
+                kv = np.array(re.split('\x01|\x02|\x03', line_list[2]))
+                key = kv[range(0, len(kv), 3)]
+                value = kv[range(1, len(kv), 3)]
+                feat_dict = dict(zip(key, value))
+                common_feat_dict[line_list[0]] = feat_dict
+                c += 1
+                if c % 100000 == 0:
+                    print(c)
+        print('join feats...')
+        c = 0
+        with open(data_path.format('test') + '.tmp', 'w') as fw:
+            fw.write('click,purchase,' + ','.join(use_columns) + '\n')
+            with open(data_path.format('test'), 'r') as fr:
+                for line in fr:
+                    line_list = line.strip().split(',')
+                    if line_list[1] == '0' and line_list[2] == '1':
+                        continue
+                    kv = np.array(re.split('\x01|\x02|\x03', line_list[5]))
+                    key = kv[range(0, len(kv), 3)]
+                    value = kv[range(1, len(kv), 3)]
+                    feat_dict = dict(zip(key, value))
+                    feat_dict.update(common_feat_dict[line_list[3]])
+                    feats = line_list[1:3]
+                    for k in use_columns:
+                        feats.append(str(feat_dict.get(k, '0')))
+                    fw.write(','.join(feats) + '\n')
+                    c += 1
+                    if c % 100000 == 0:
+                        print(c)
+
+        print('encode feats...')
+        vocabulary = joblib.load(enum_path)
+        feat_map = {}
+        for feat in use_columns:
+            feat_map[feat] = dict(
+                zip(vocabulary[feat], range(1, len(vocabulary[feat]) + 1)))
+        c = 0
+        with open(write_path + '.test', 'w') as fw:
+            fw.write('click,purchase,' + ','.join(use_columns) + '\n')
+            with open(data_path.format('test') + '.tmp', 'r') as fr:
+                fr.readline()  # remove header
+                for line in fr:
+                    line_list = line.strip().split(',')
+                    new_line = line_list[:2]
+                    for value, feat in zip(line_list[2:], use_columns):
+                        new_line.append(str(feat_map[feat].get(value, '0')))
+                    fw.write(','.join(new_line) + '\n')
+                    c += 1
+                    if c % 100000 == 0:
+                        print(c)
+
+
+if __name__ == "__main__":
+    pros = process()
+    pros.process_train()
+    pros.process_test()
diff --git a/datasets/ali-cpp_aitm/run.sh b/datasets/ali-cpp_aitm/run.sh
@@ -0,0 +1,22 @@
+# Create the output layout. -p makes re-runs safe: a plain mkdir fails on an
+# existing directory, and in an && chain that skipped creating train/ and test/.
+mkdir -p data/whole_data/train data/whole_data/test
+train_source_path="./data/sample_train.tar.gz"
+test_source_path="./data/sample_test.tar.gz"
+cd data
+echo "downloading sample_train.tar.gz......"
+curl -# 'http://jupter-oss.oss-cn-hangzhou.aliyuncs.com/file/opensearch/documents/408/sample_train.tar.gz?Expires=1586435769&OSSAccessKeyId=LTAIGx40tjZWxj6q&Signature=ahUDqhvKT1cGjC4%2FIER2EWtq7o4%3D&response-content-disposition=attachment%3B%20' -H 'Proxy-Connection: keep-alive' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' -H 'Accept-Language: zh-CN,zh;q=0.9' --compressed --insecure -o sample_train.tar.gz
+cd ..
+echo "unzipping sample_train.tar.gz......"
+# The archive is deleted only if extraction succeeded.
+tar -xzvf "${train_source_path}" -C data && rm -rf "${train_source_path}"
+cd data
+echo "downloading sample_test.tar.gz......"
+curl -# 'http://jupter-oss.oss-cn-hangzhou.aliyuncs.com/file/opensearch/documents/408/sample_test.tar.gz?Expires=1586435821&OSSAccessKeyId=LTAIGx40tjZWxj6q&Signature=OwLMPjt1agByQtRVi8pazsAliNk%3D&response-content-disposition=attachment%3B%20' -H 'Proxy-Connection: keep-alive' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' -H 'Accept-Language: zh-CN,zh;q=0.9' --compressed --insecure -o sample_test.tar.gz
+cd ..
+echo "unzipping sample_test.tar.gz......"
+tar -xzvf "${test_source_path}" -C data && rm -rf "${test_source_path}"
+echo "preprocessing data......"
+python process_public_data.py
+# Only the train and test splits are moved; data/ctr_cvr.dev stays in data/.
+mv data/ctr_cvr.train data/whole_data/train
+mv data/ctr_cvr.test data/whole_data/test
diff --git a/doc/source/models/rank/aitm.md b/doc/source/models/rank/aitm.md
@@ -0,0 +1,66 @@
+# AITM模型的点击率预估模型
+
+代码请参考：[AITM](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/rank/aitm)  
+如果我们的代码对您有用，还请点个star啊~  
+
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [运行环境](#运行环境)
+- [快速开始](#快速开始)
+- [效果复现](#效果复现)
+- [进阶使用](#进阶使用)
+- [FAQ](#FAQ)
+
+## 模型简介
+在推荐场景里，用户的转化链路往往有多个中间步骤（曝光->点击->转化），而有些行业转化链路很长，如金融-信用卡业务，它包括曝光->点击->表单（application）->信用核准（approval）->信用卡激活（activation）。处于链路后端的节点（如approval/activation），因为转化时间久，获取难度较大，导致转化数据少，训练时类别不平衡的问题很严重。
+
+作者设计了一种多任务模型框架，充分利用了链路上各个节点的样本，提升模型对后端节点转化率的预估效果。
+## 数据准备
+
+数据为[Ali-CCP click](https://tianchi.aliyun.com/datalab/dataSet.html?dataId=408)
+在模型目录的data目录下为您准备了快速运行的示例数据，若需要使用全量数据可以参考下方[效果复现](#效果复现)部分。
+
+## 运行环境
+PaddlePaddle>=2.0
+
+python 2.7/3.5/3.6/3.7
+
+os : windows/linux/macos 
+
+## 快速开始
+本文提供了样例数据可以供您快速体验，在任意目录下均可执行。在aitm模型目录的快速执行命令如下： 
+```bash
+# 进入模型目录
+# cd models/rank/aitm # 在任意目录均可运行
+# 动态图训练
+python -u ../../../tools/trainer.py -m config.yaml
+
+# 动态图预测
+python -u ../../../tools/infer.py -m config.yaml
+``` 
+## 效果复现
+为了方便使用者能够快速的跑通每一个模型，我们在每个模型下都提供了样例数据。如果需要复现readme中的效果,请按如下步骤依次操作即可。
+在全量数据下模型的指标如下：  
+| 模型 | click auc | purchase auc |batch_size | epoch_num| Time of each epoch |
+| :------| :------ | :------ | :------ | :------| :------ | 
+| aitm | 0.6186 |0.6525 | 2000 | 6| 约3小时 |
+
+1. 确认您当前所在目录为PaddleRec/models/rank/aitm
+2. 进入PaddleRec/datasets/ali-cpp_aitm
+3. 执行命令运行全量数据
+
+``` bash
+cd ../../../datasets/ali-cpp_aitm
+sh run.sh
+```
+```bash
+cd - # 切回模型目录
+# 动态图训练
+python -u ../../../tools/trainer.py -m config_bigdata.yaml 
+python -u ../../../tools/infer.py -m config_bigdata.yaml
+```
+## 进阶使用
+
+## FAQ
diff --git a/doc/source/readme.md b/doc/source/readme.md
@@ -48,3 +48,4 @@
 [fat_deepffm](https://paddlerec.readthedocs.io/en/latest/models/rank/fat_deepffm.html)  
 [deeprec](https://paddlerec.readthedocs.io/en/latest/models/rank/deeprec.html)  
 [autofis](https://paddlerec.readthedocs.io/en/latest/models/rank/autofis.html)  
+[aitm](https://paddlerec.readthedocs.io/en/latest/models/rank/aitm.html)  
diff --git a/models/rank/aitm/__init__.py b/models/rank/aitm/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/models/rank/aitm/aitm_reader.py b/models/rank/aitm/aitm_reader.py
@@ -0,0 +1,50 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import numpy as np
+import paddle
+from paddle.io import Dataset
+
+
+class RecDataset(Dataset):
+    def __init__(self, file_list, config):
+        super(RecDataset, self).__init__()
+        self.feature_names = []
+        self.datafile = file_list[0]
+        self.data = []
+        self._load_data()
+
+    def _load_data(self):
+        print("start load data from: {}".format(self.datafile))
+        count = 0
+        with open(self.datafile) as f:
+            self.feature_names = f.readline().strip().split(',')[2:]
+            for line in f:
+                count += 1
+                line = line.strip().split(',')
+                line = [int(v) for v in line]
+                self.data.append(line)
+        print("load data from {} finished".format(self.datafile))
+
+    def __len__(self, ):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        line = self.data[idx]
+        click = line[0]
+        conversion = line[1]
+        # features = dict(zip(self.feature_names, line[2:]))
+        features = line[2:]
+        return click, conversion, features