Skip to content

Commit

Permalink
Fix field name (#403)
Browse files Browse the repository at this point in the history
* dcn Label -> label

* Label -> label

* update dlrm readme
  • Loading branch information
ShawnXuan authored Nov 1, 2022
1 parent ee3e007 commit fc7cbf8
Show file tree
Hide file tree
Showing 15 changed files with 20 additions and 19 deletions.
2 changes: 1 addition & 1 deletion RecommenderSystems/dcn/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ python3 -m pip install -r requirements.txt
The Criteo dataset is from the [2014-kaggle-display-advertising-challenge-dataset](https://www.kaggle.com/competitions/criteo-display-ad-challenge/overview). Since the original download link is no longer valid, click [here](https://www.kaggle.com/datasets/mrkmakr/criteo-dataset) to download it instead.

Each sample contains:
- Label - Target variable that indicates if an ad was clicked (1) or not (0).
- label - Target variable that indicates if an ad was clicked (1) or not (0).
- I1-I13 - A total of 13 columns of integer features (mostly count features).
- C1-C26 - A total of 26 columns of categorical features. The values of these features have been hashed onto 32 bits for anonymization purposes.

Expand Down
2 changes: 1 addition & 1 deletion RecommenderSystems/dcn/dcn_eager_train_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def __init__(
self.shard_count = shard_count
self.cur_shard = cur_shard

fields = ["Label"]
fields = ["label"]
fields += [f"I{i+1}" for i in range(num_dense_fields)]
fields += [f"C{i+1}" for i in range(num_sparse_fields)]
self.fields = fields
Expand Down
2 changes: 1 addition & 1 deletion RecommenderSystems/dcn/dcn_train_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def __init__(
self.shard_count = shard_count
self.cur_shard = cur_shard

fields = ["Label"]
fields = ["label"]
fields += [f"I{i+1}" for i in range(num_dense_fields)]
fields += [f"C{i+1}" for i in range(num_sparse_fields)]
self.fields = fields
Expand Down
2 changes: 1 addition & 1 deletion RecommenderSystems/dcn/tools/dcn_parquet.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ def makeDCNDataset(srcDir: String, dstDir:String) = {
val val_csv = s"${srcDir}/valid.csv"

val make_label = udf((str:String) => str.toFloat)
val label_cols = Seq(make_label($"Label").as("Label"))
val label_cols = Seq(make_label($"label").as("label"))

val dense_cols = 1.to(13).map{i=>xxhash64(lit(i), col(s"I$i")).as(s"I${i}")}

Expand Down
4 changes: 2 additions & 2 deletions RecommenderSystems/dcn/tools/split_criteo.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@


RANDOM_SEED = 2018
cols = ["Label"]
cols = ["label"]
for i in range(1, 14):
cols.append("I" + str(i))
for i in range(1, 27):
Expand All @@ -24,7 +24,7 @@ def split_criteo(args):
dtype=object,
)
X = ddf.values
y = ddf["Label"].map(lambda x: float(x)).values
y = ddf["label"].map(lambda x: float(x)).values
print(str(len(X)) + " lines in total")

folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED).split(
Expand Down
2 changes: 1 addition & 1 deletion RecommenderSystems/deepfm/deepfm_train_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ def __init__(
self.shard_count = shard_count
self.cur_shard = cur_shard

fields = ["Label"]
fields = ["label"]
fields += [f"I{i+1}" for i in range(num_dense_fields)]
fields += [f"C{i+1}" for i in range(num_sparse_fields)]
self.fields = fields
Expand Down
2 changes: 1 addition & 1 deletion RecommenderSystems/deepfm/tools/deepfm_parquet.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ def makeDeepfmDataset(srcDir: String, dstDir:String) = {
val val_csv = s"${srcDir}/valid.csv"

val make_label = udf((str:String) => str.toFloat)
val label_cols = Seq(make_label($"Label").as("Label"))
val label_cols = Seq(make_label($"label").as("label"))

val dense_cols = 1.to(13).map{i=>xxhash64(lit(i), col(s"I$i")).as(s"I${i}")}

Expand Down
4 changes: 2 additions & 2 deletions RecommenderSystems/deepfm/tools/split_criteo_kaggle.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def split_train_val_test(input_dir, output_dir):
num_dense_fields = 13
num_sparse_fields = 26

fields = ["Label"]
fields = ["label"]
fields += [f"I{i+1}" for i in range(num_dense_fields)]
fields += [f"C{i+1}" for i in range(num_sparse_fields)]

Expand All @@ -23,7 +23,7 @@ def split_train_val_test(input_dir, output_dir):
dtype=object,
)
X = ddf.values
y = ddf["Label"].map(lambda x: float(x)).values
y = ddf["label"].map(lambda x: float(x)).values
print(f"{len(X)} samples in total")

folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)
Expand Down
1 change: 1 addition & 0 deletions RecommenderSystems/dlrm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ In `/path/to/dlrm_parquet`, move all `parquet` files in folder `shuffled_day_par
$ mkdir train
$ mv ./shuffled_day_parts/day_part_*/*.parquet train/.
```
Note: in `criteo1t_parquet_day_by_day.scala`, the data type of the categorical columns C1-C26 is `int32`.

## Start training by Oneflow
The following command launches 8 OneFlow DLRM training and evaluation processes on a node with 8 GPU devices, specifying `data_dir` for the data input and `persistent_path` for the OneEmbedding persistent store path.
Expand Down
2 changes: 1 addition & 1 deletion RecommenderSystems/pnn/pnn_train_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def __init__(
self.shard_count = shard_count
self.cur_shard = cur_shard

fields = ["Label"]
fields = ["label"]
fields += [f"I{i+1}" for i in range(num_dense_fields)]
fields += [f"C{i+1}" for i in range(num_sparse_fields)]
self.fields = fields
Expand Down
4 changes: 2 additions & 2 deletions RecommenderSystems/pnn/tools/criteo_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@ def make_pnn_parquet(
):
sparse_names = [f"C{i}" for i in range(1, 27)]
dense_names = [f"I{i}" for i in range(1, 14)]
column_names = ["Label"] + dense_names + sparse_names
column_names = ["label"] + dense_names + sparse_names

make_label = udf(lambda s: int(s), IntegerType())
label_col = make_label("Label").alias("Label")
label_col = make_label("label").alias("label")

if mod_idx <= 0:
dense_cols = [
Expand Down
4 changes: 2 additions & 2 deletions RecommenderSystems/pnn/tools/split_criteo_kaggle.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def split_train_val_test(input_dir, output_dir):
num_dense_fields = 13
num_sparse_fields = 26

fields = ["Label"]
fields = ["label"]
fields += [f"I{i+1}" for i in range(num_dense_fields)]
fields += [f"C{i+1}" for i in range(num_sparse_fields)]

Expand All @@ -23,7 +23,7 @@ def split_train_val_test(input_dir, output_dir):
dtype=object,
)
X = ddf.values
y = ddf["Label"].map(lambda x: float(x)).values
y = ddf["label"].map(lambda x: float(x)).values
print(f"{len(X)} samples in total")

folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)
Expand Down
4 changes: 2 additions & 2 deletions RecommenderSystems/xdeepfm/tools/split_criteo_kaggle.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def split_train_val_test(input_dir, output_dir):
num_dense_fields = 13
num_sparse_fields = 26

fields = ["Label"]
fields = ["label"]
fields += [f"I{i+1}" for i in range(num_dense_fields)]
fields += [f"C{i+1}" for i in range(num_sparse_fields)]

Expand All @@ -23,7 +23,7 @@ def split_train_val_test(input_dir, output_dir):
dtype=object,
)
X = ddf.values
y = ddf["Label"].map(lambda x: float(x)).values
y = ddf["label"].map(lambda x: float(x)).values
print(f"{len(X)} samples in total")

folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)
Expand Down
2 changes: 1 addition & 1 deletion RecommenderSystems/xdeepfm/tools/xdeepfm_parquet.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ def makexDeepfmDataset(srcDir: String, dstDir:String) = {
val val_csv = s"${srcDir}/valid.csv"

val make_label = udf((str:String) => str.toFloat)
val label_cols = Seq(make_label($"Label").as("Label"))
val label_cols = Seq(make_label($"label").as("label"))

val dense_cols = 1.to(13).map{i=>xxhash64(lit(i), col(s"I$i")).as(s"I${i}")}

Expand Down
2 changes: 1 addition & 1 deletion RecommenderSystems/xdeepfm/xdeepfm_train_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def __init__(
self.shard_count = shard_count
self.cur_shard = cur_shard

fields = ["Label"]
fields = ["label"]
fields += [f"I{i+1}" for i in range(num_dense_fields)]
fields += [f"C{i+1}" for i in range(num_sparse_fields)]
self.fields = fields
Expand Down

0 comments on commit fc7cbf8

Please sign in to comment.