Skip to content

Commit

Permalink
Merge pull request #830 from kafaichan/master
Browse files Browse the repository at this point in the history
add model autoint
  • Loading branch information
frankwhzhang authored Sep 6, 2022
2 parents c7dafb0 + 515024a commit a8ec5f9
Show file tree
Hide file tree
Showing 29 changed files with 1,447 additions and 1 deletion.
2 changes: 1 addition & 1 deletion contributor.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,5 @@
| [AITM](models/multitask/aitm/) | [renmada](https://github.com/renmada) | https://github.com/PaddlePaddle/PaddleRec/pull/756 | 论文复现赛第六期 |
| [IPRec](models/rank/iprec/) | [renmada](https://github.com/renmada) | https://github.com/PaddlePaddle/PaddleRec/pull/774 | 论文复现赛第六期 |
| [KIM](models/match/kim/) | [renmada](https://github.com/renmada) | https://github.com/PaddlePaddle/PaddleRec/pull/790 | 论文复现赛第六期 |

| [AutoInt](models/rank/autoint/) | [kafaichan](https://github.com/kafaichan) | https://github.com/PaddlePaddle/PaddleRec/pull/830 | 论文复现赛第七期 |
</div>
57 changes: 57 additions & 0 deletions datasets/criteo_autoint/convert2txt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import numpy as np
import argparse

# Command-line interface. The only option is the directory that holds the
# ``part*`` sub-directories read by this script.
# The text is passed as ``description``: given as the first positional
# argument it is taken as ``prog`` and shown as the program name in usage.
parser = argparse.ArgumentParser(description='convert npy to txt file')
parser.add_argument(
    '--root_dir',
    type=str,
    default='./Criteo',
    required=False,
    help='root directory of src data')
args = parser.parse_args()


def write_to_file(output_folder, file_path_list):
    """Convert the .npy arrays of each fold directory into one text file.

    For every directory in ``file_path_list`` the arrays ``train_y.npy``
    (labels), ``train_i.npy`` (feature indices) and ``train_x2.npy``
    (feature values) are loaded, joined column-wise in that order and
    written to ``output_folder/<directory basename>``: one sample per
    line, fields separated by single spaces. Labels and indices are
    written as integers, values with seven decimal places.

    Args:
        output_folder: Directory receiving the text files. It is created
            if it does not exist yet.
        file_path_list: Iterable of fold directories. Entries that are not
            existing directories are reported and skipped.
    """
    os.makedirs(output_folder, exist_ok=True)
    for folder in file_path_list:
        if not os.path.isdir(folder):
            print("skip {} (not a directory)".format(folder))
            continue
        print("begin {}".format(folder))
        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
        # point this at .npy files produced by the local pipeline.
        feature_index = np.load(
            os.path.join(folder, 'train_i.npy'),
            allow_pickle=True).astype('int64')
        feature_value = np.load(
            os.path.join(folder, 'train_x2.npy'),
            allow_pickle=True).astype('float32')
        label = np.load(
            os.path.join(folder, 'train_y.npy'),
            allow_pickle=True).astype('int64').reshape([-1, 1])
        # One format per column, derived from the actual array widths so the
        # format always matches the concatenated data.
        fmt_str = (['%d'] * (1 + feature_index.shape[1]) +
                   ['%.7f'] * feature_value.shape[1])
        data = np.concatenate((label, feature_index, feature_value), axis=1)
        np.savetxt(
            os.path.join(output_folder, os.path.basename(folder)),
            data,
            fmt=' '.join(fmt_str))
        print("complete {}".format(folder))


if __name__ == '__main__':
    # part1 becomes the test set and part3..part10 the training set;
    # part2 is not converted by this script.
    root = args.root_dir
    test_folders = [os.path.join(root, 'part1')]
    train_folders = []
    for part_no in range(3, 11):
        train_folders.append(os.path.join(root, 'part{}'.format(part_no)))
    # The test data is converted first, then the training data.
    write_to_file('./slot_test_data_full', test_folders)
    write_to_file('./slot_train_data_full', train_folders)
11 changes: 11 additions & 0 deletions datasets/criteo_autoint/download.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Download the CTR data archive and concatenate every part file it contains
# into a single criteo.data file in the current directory.

# Stop at the first failing command so that a broken download or extraction
# is not followed by the "Complete data download." message.
set -e

# NOTE(review): --no-check-certificate disables TLS verification of the
# download host; drop it if the certificate validates in your environment.
wget --no-check-certificate https://fleet.bj.bcebos.com/ctr_data.tar.gz

tar -zxvf ctr_data.tar.gz

# Start from an empty scratch directory so leftovers from an interrupted
# earlier run cannot make mkdir or mv fail.
rm -rf ./tmp
mkdir ./tmp
mv raw_data tmp
mv test_data tmp

# Merge all part* files found under the scratch directory, then clean up.
find ./tmp -type f -name 'part*' -exec cat {} \; > criteo.data
rm -rf ./tmp
echo "Complete data download."
125 changes: 125 additions & 0 deletions datasets/criteo_autoint/preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
#Copyright (c) 2018 Chence Shi

#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:

#The above copyright notice and this permission notice shall be included in all
#copies or substantial portions of the Software.

#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
#SOFTWARE.

import argparse
import os

# Command-line interface of the preprocessing script.
# Both options declare a default, so they are optional: with required=True
# argparse rejects a missing option and the default could never be used.
parser = argparse.ArgumentParser(description='criteo dataset preprocesser')
parser.add_argument(
    '--source_data',
    type=str,
    required=False,
    default='./criteo.txt',
    help='source path')
parser.add_argument(
    '--output_path',
    type=str,
    required=False,
    default='./Criteo',
    help='output path')
args = parser.parse_args()

# Two-pass conversion of the raw tab-separated file into three aligned text
# files written to args.output_path:
#   train_x.txt: feature values
#   train_i.txt: feature indices
#   train_y.txt: labels
# Each input line holds the label in field 0, numerical features in fields
# 1-13 and categorical features in fields 14-39.

# Create the output directory (and any missing parents); an existing
# directory is left untouched.
os.makedirs(args.output_path, exist_ok=True)

train_path = args.source_data

# dic[i] maps every raw value seen in feature column i to a two-element list
# [assigned id, occurrence count]. Only the categorical columns 13..38 are
# ever filled.
dic = {i: {} for i in range(39)}

# index[k] is the largest id handed out so far for categorical column 13 + k.
# Id 1 is shared by all values that occur 10 times or fewer.
index = [1] * 26

# First pass: count occurrences and give every value its own id as soon as
# its count exceeds 10.
cnt_train = 0
with open(train_path, 'r') as f1:
    for line in f1:
        cnt_train += 1
        if cnt_train % 100000 == 0:
            print('now train cnt : %d\n' % cnt_train)
        split = line.strip('\n').split('\t')
        for i in range(13, 39):
            value = split[i + 1]
            stats = dic[i].get(value)
            if stats is None:
                stats = dic[i][value] = [1, 0]
            stats[1] += 1
            if stats[0] == 1 and stats[1] > 10:
                index[i - 13] += 1
                stats[0] = index[i - 13]
# cnt_train already equals the number of lines read.
print('total entries :%d\n' % cnt_train)

# Number of ids per feature group: 13 numerical slots followed by the id
# count of each of the 26 categorical columns.
kinds = [13] + list(index)
print('number of dimensions : %d' % (len(kinds) - 1))
print(kinds)

# Turn the counts into running totals; kinds[j] is then the offset added to
# the ids of categorical column 13 + j so that ids are globally unique.
for i in range(1, len(kinds)):
    kinds[i] += kinds[i - 1]
print(kinds)

# Second pass: write the value, index and label of every sample.
cnt_train = 0
print('remake training data...\n')
with open(train_path, 'r') as f1, \
        open(os.path.join(args.output_path, 'train_x.txt'), 'w') as f_train_value, \
        open(os.path.join(args.output_path, 'train_i.txt'), 'w') as f_train_index, \
        open(os.path.join(args.output_path, 'train_y.txt'), 'w') as f_train_label:
    for line in f1:
        cnt_train += 1
        if cnt_train % 100000 == 0:
            print('now train cnt : %d\n' % cnt_train)
        # Missing fields keep the value '0'.
        entry = ['0'] * 39
        row_index = [None] * 39
        split = line.strip('\n').split('\t')
        label = str(split[0])
        # Numerical features: raw text as value, fixed index 1..13.
        for i in range(13):
            if split[i + 1] != '':
                entry[i] = split[i + 1]
            row_index[i] = i + 1
        # Categorical features: value '1' when present, index is the id from
        # the first pass shifted by the offset of its column.
        for i in range(13, 39):
            if split[i + 1] != '':
                entry[i] = '1'
            row_index[i] = dic[i][split[i + 1]][0] + kinds[i - 13]
        f_train_value.write(' '.join(entry) + '\n')
        f_train_index.write(' '.join(str(item) for item in row_index) + '\n')
        f_train_label.write(label + '\n')
8 changes: 8 additions & 0 deletions datasets/criteo_autoint/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Full data preparation pipeline: download, preprocess, split into folds,
# scale the numerical features and convert the folds to text files.

# Stop at the first failing step so that later steps never run on missing
# or partial data.
set -e

sh download.sh
# -p keeps a re-run from failing when the directories already exist.
mkdir -p slot_train_data_full
mkdir -p slot_test_data_full

python preprocess.py --source_data ./criteo.data --output_path=./Criteo
python stratifiedKfold.py
python scale.py
python convert2txt.py
58 changes: 58 additions & 0 deletions datasets/criteo_autoint/scale.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#Copyright (c) 2018 Chence Shi

#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:

#The above copyright notice and this permission notice shall be included in all
#copies or substantial portions of the Software.

#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
#SOFTWARE.

import math
import numpy as np
import argparse
import os

# Command-line interface: --src_path names the directory that contains the
# part1..part10 fold folders processed by this script.
parser = argparse.ArgumentParser(description='criteo dataset scale')
src_path_options = dict(
    type=str,
    default='./Criteo',
    required=False,
    help='source path')
parser.add_argument('--src_path', **src_path_options)
args = parser.parse_args()


def scale(x):
    """Compress a large value to the truncated square of its natural log.

    Values greater than 2 become ``int(log(x) ** 2)``; every other value is
    returned unchanged.
    """
    return int(math.log(float(x))**2) if x > 2 else x


def scale_each_fold():
    """Scale the first 13 columns of every fold and save the result.

    For part1 .. part10 under ``args.src_path`` this loads ``train_x.npy``,
    applies ``scale`` to each element of columns 0-12 and stores the whole
    array as ``train_x2.npy`` in the same folder.
    """
    for fold in range(1, 11):
        print('now part %d' % fold)
        fold_dir = os.path.join(args.src_path, 'part' + str(fold))
        data = np.load(
            os.path.join(fold_dir, 'train_x.npy'), allow_pickle=True)
        # Rows are assigned through this slice, so the scaled values end up
        # in ``data``, which is what gets saved below.
        numeric = data[:, 0:13]
        for row_no in range(numeric.shape[0]):
            if row_no % 100000 == 0:
                print(row_no)
            numeric[row_no] = [scale(value) for value in numeric[row_no]]
        np.save(os.path.join(fold_dir, 'train_x2.npy'), data)


# Script entry point: scale all folds when executed directly.
if __name__ == '__main__':
    scale_each_fold()
Loading

0 comments on commit a8ec5f9

Please sign in to comment.