Merge pull request #830 from kafaichan/master

add model autoint
PaddlePaddle · Sep 6, 2022 · a8ec5f9 · a8ec5f9
2 parents c7dafb0 + 515024a
commit a8ec5f9
Show file tree

Hide file tree

Showing 29 changed files with 1,447 additions and 1 deletion.
diff --git a/contributor.md b/contributor.md
@@ -27,5 +27,5 @@
   |                     [AITM](models/multitask/aitm/)                     |  [renmada](https://github.com/renmada)  |    https://github.com/PaddlePaddle/PaddleRec/pull/756   | 论文复现赛第六期 |
   |                     [IPRec](models/rank/iprec/)                     |  [renmada](https://github.com/renmada)  |    https://github.com/PaddlePaddle/PaddleRec/pull/774   | 论文复现赛第六期 |
   |                     [KIM](models/match/kim/)                     |  [renmada](https://github.com/renmada)  |    https://github.com/PaddlePaddle/PaddleRec/pull/790   | 论文复现赛第六期 |
-
+  |                     [AutoInt](models/rank/autoint/)                     |  [kafaichan](https://github.com/kafaichan)  |    https://github.com/PaddlePaddle/PaddleRec/pull/830  | 论文复现赛第七期 |
 </div> 
diff --git a/datasets/criteo_autoint/convert2txt.py b/datasets/criteo_autoint/convert2txt.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import numpy as np
+import argparse
+
+# Command-line interface: a single optional --root_dir option (default
+# ./Criteo) naming the directory that holds the part* fold directories.
+# NOTE(review): the string below is passed positionally, so argparse binds it
+# to `prog` (the program name shown in usage), not to `description` - confirm
+# whether that is intended.
+parser = argparse.ArgumentParser('convert npy to txt file')
+parser.add_argument(
+    '--root_dir',
+    type=str,
+    default='./Criteo',
+    required=False,
+    help='root directory of src data')
+# Arguments are parsed at import time; `args` is read by the __main__ section.
+args = parser.parse_args()
+
+
+def write_to_file(output_folder, file_path_list):
+    fmt_str = ['%d'] + ['%d'] * 39 + ['%.7f'] * 39
+    for folder in file_path_list:
+        if not os.path.isdir(folder): continue
+        print("begin {}".format(folder))
+        feature_index = np.load(
+            os.path.join(folder, 'train_i.npy'),
+            allow_pickle=True).astype('int64')
+        feature_value = np.load(
+            os.path.join(folder, 'train_x2.npy'),
+            allow_pickle=True).astype('float32')
+        label = np.load(
+            os.path.join(folder, 'train_y.npy'),
+            allow_pickle=True).astype('int64').reshape([-1, 1])
+        data = np.concatenate((label, feature_index, feature_value), axis=1)
+        np.savetxt(
+            os.path.join(output_folder, os.path.basename(folder)),
+            data,
+            fmt=' '.join(fmt_str))
+        print("complete {}".format(folder))
+
+
+if __name__ == '__main__':
+    train_folders = [
+        os.path.join(args.root_dir, 'part{}'.format(i)) for i in range(3, 11)
+    ]
+    test_folders = [os.path.join(args.root_dir, 'part1')]
+    write_to_file('./slot_test_data_full', test_folders)
+    write_to_file('./slot_train_data_full', train_folders)
diff --git a/datasets/criteo_autoint/download.sh b/datasets/criteo_autoint/download.sh
@@ -0,0 +1,11 @@
+# Download the Criteo archive and merge every part* file into criteo.data.
+# Abort on the first failing command so that a broken download or extract
+# never produces a truncated or empty criteo.data.
+set -e
+
+wget --no-check-certificate https://fleet.bj.bcebos.com/ctr_data.tar.gz
+
+tar -zxvf ctr_data.tar.gz
+
+# -p keeps a re-run from failing when ./tmp is left over from an earlier run.
+mkdir -p ./tmp
+mv raw_data tmp
+mv test_data tmp
+
+# Concatenate all extracted part* files into a single source file.
+find ./tmp -type f -name 'part*' -exec cat {} \; > criteo.data
+rm -rf ./tmp
+echo "Complete data download."
diff --git a/datasets/criteo_autoint/preprocess.py b/datasets/criteo_autoint/preprocess.py
@@ -0,0 +1,125 @@
+#Copyright (c) 2018 Chence Shi
+
+#Permission is hereby granted, free of charge, to any person obtaining a copy
+#of this software and associated documentation files (the "Software"), to deal
+#in the Software without restriction, including without limitation the rights
+#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#copies of the Software, and to permit persons to whom the Software is
+#furnished to do so, subject to the following conditions:
+
+#The above copyright notice and this permission notice shall be included in all
+#copies or substantial portions of the Software.
+
+#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+#SOFTWARE.
+
+import argparse
+import os
+
+parser = argparse.ArgumentParser(description='criteo dataset preprocesser')
+parser.add_argument(
+    '--source_data',
+    type=str,
+    required=True,
+    default='./criteo.txt',
+    help='source path')
+parser.add_argument(
+    '--output_path',
+    type=str,
+    required=True,
+    default='./Criteo',
+    help='output path')
+args = parser.parse_args()
+
+if not os.path.exists(args.output_path):
+    os.mkdir(args.output_path)
+
+train_path = args.source_data
+f1 = open(train_path, 'r')
+dic = {}
+# generate three fold.
+# train_x: value
+# train_i: index
+# train_y: label
+f_train_value = open(os.path.join(args.output_path, 'train_x.txt'), 'w')
+f_train_index = open(os.path.join(args.output_path, 'train_i.txt'), 'w')
+f_train_label = open(os.path.join(args.output_path, 'train_y.txt'), 'w')
+
+for i in range(39):
+    dic[i] = {}
+
+cnt_train = 0
+
+#for debug
+#limits = 10000
+index = [1] * 26
+for line in f1:
+    cnt_train += 1
+    if cnt_train % 100000 == 0:
+        print('now train cnt : %d\n' % cnt_train)
+    #if cnt_train > limits:
+    #	break
+    split = line.strip('\n').split('\t')
+    # 0-label, 1-13 numerical, 14-39 category 
+    for i in range(13, 39):
+        #dic_len = len(dic[i])
+        if split[i + 1] not in dic[i]:
+            # [1, 0] 1 is the index for those whose appear times <= 10   0 indicates the appear times
+            dic[i][split[i + 1]] = [1, 0]
+        dic[i][split[i + 1]][1] += 1
+        if dic[i][split[i + 1]][0] == 1 and dic[i][split[i + 1]][1] > 10:
+            index[i - 13] += 1
+            dic[i][split[i + 1]][0] = index[i - 13]
+f1.close()
+print('total entries :%d\n' % (cnt_train - 1))
+
+# calculate number of category features of every dimension
+kinds = [13]
+for i in range(13, 39):
+    kinds.append(index[i - 13])
+print('number of dimensions : %d' % (len(kinds) - 1))
+print(kinds)
+
+for i in range(1, len(kinds)):
+    kinds[i] += kinds[i - 1]
+print(kinds)
+
+# make new data
+
+f1 = open(train_path, 'r')
+cnt_train = 0
+print('remake training data...\n')
+for line in f1:
+    cnt_train += 1
+    if cnt_train % 100000 == 0:
+        print('now train cnt : %d\n' % cnt_train)
+    #if cnt_train > limits:
+    #	break
+    entry = ['0'] * 39
+    index = [None] * 39
+    split = line.strip('\n').split('\t')
+    label = str(split[0])
+    for i in range(13):
+        if split[i + 1] != '':
+            entry[i] = (split[i + 1])
+        index[i] = (i + 1)
+    for i in range(13, 39):
+        if split[i + 1] != '':
+            entry[i] = '1'
+        index[i] = (dic[i][split[i + 1]][0])
+    for j in range(26):
+        index[13 + j] += kinds[j]
+    index = [str(item) for item in index]
+    f_train_value.write(' '.join(entry) + '\n')
+    f_train_index.write(' '.join(index) + '\n')
+    f_train_label.write(label + '\n')
+f1.close()
+
+f_train_value.close()
+f_train_index.close()
+f_train_label.close()
diff --git a/datasets/criteo_autoint/run.sh b/datasets/criteo_autoint/run.sh
@@ -0,0 +1,8 @@
+# Full data preparation pipeline: download, preprocess, split, scale, convert.
+# Stop at the first failing stage instead of running later stages on
+# missing or partial data.
+set -e
+
+sh download.sh
+# -p keeps a re-run from failing when the output folders already exist.
+mkdir -p slot_train_data_full
+mkdir -p slot_test_data_full
+
+python preprocess.py --source_data ./criteo.data --output_path=./Criteo
+python stratifiedKfold.py
+python scale.py
+python convert2txt.py
diff --git a/datasets/criteo_autoint/scale.py b/datasets/criteo_autoint/scale.py
@@ -0,0 +1,58 @@
+#Copyright (c) 2018 Chence Shi
+
+#Permission is hereby granted, free of charge, to any person obtaining a copy
+#of this software and associated documentation files (the "Software"), to deal
+#in the Software without restriction, including without limitation the rights
+#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#copies of the Software, and to permit persons to whom the Software is
+#furnished to do so, subject to the following conditions:
+
+#The above copyright notice and this permission notice shall be included in all
+#copies or substantial portions of the Software.
+
+#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+#SOFTWARE.
+
+import math
+import numpy as np
+import argparse
+import os
+
+# Command-line interface: a single optional --src_path option (default
+# ./Criteo) naming the directory that holds the part1..part10 directories.
+parser = argparse.ArgumentParser(description='criteo dataset scale')
+parser.add_argument(
+    '--src_path',
+    type=str,
+    required=False,
+    default='./Criteo',
+    help='source path')
+# Arguments are parsed at import time; `args` is read by scale_each_fold().
+args = parser.parse_args()
+
+
+def scale(x):
+    if x > 2:
+        x = int(math.log(float(x))**2)
+    return x
+
+
+def scale_each_fold():
+    for i in range(1, 11):
+        print('now part %d' % i)
+        data = np.load(
+            os.path.join(args.src_path, 'part' + str(i), 'train_x.npy'),
+            allow_pickle=True)
+        part = data[:, 0:13]
+        for j in range(part.shape[0]):
+            if j % 100000 == 0:
+                print(j)
+            part[j] = list(map(scale, part[j]))
+        np.save(
+            os.path.join(args.src_path, 'part' + str(i), 'train_x2.npy'), data)
+
+
+# Script entry point: rescale every fold found under --src_path.
+if __name__ == '__main__':
+    scale_each_fold()