Fix field name (#403)

* dcn Label -> label * Label -> label * update dlrm readme
Oneflow-Inc · Nov 1, 2022 · fc7cbf8 · fc7cbf8
1 parent ee3e007
commit fc7cbf8
Show file tree

Hide file tree

Showing 15 changed files with 20 additions and 19 deletions.
diff --git a/RecommenderSystems/dcn/README.md b/RecommenderSystems/dcn/README.md
@@ -90,7 +90,7 @@ python3 -m pip install -r requirements.txt
 The Criteo dataset is from [2014-kaggle-display-advertising-challenge-dataset](https://www.kaggle.com/competitions/criteo-display-ad-challenge/overview). Since the original download link is no longer valid, click [here](https://www.kaggle.com/datasets/mrkmakr/criteo-dataset) to download it.
 
 Each sample contains:
-- Label - Target variable that indicates if an ad was clicked (1) or not (0).
+- label - Target variable that indicates if an ad was clicked (1) or not (0).
 - I1-I13 - A total of 13 columns of integer features (mostly count features).
 - C1-C26 - A total of 26 columns of categorical features. The values of these features have been hashed onto 32 bits for anonymization purposes.
 

diff --git a/RecommenderSystems/dcn/dcn_eager_train_eval.py b/RecommenderSystems/dcn/dcn_eager_train_eval.py
@@ -125,7 +125,7 @@ def __init__(
         self.shard_count = shard_count
         self.cur_shard = cur_shard
 
-        fields = ["Label"]
+        fields = ["label"]
         fields += [f"I{i+1}" for i in range(num_dense_fields)]
         fields += [f"C{i+1}" for i in range(num_sparse_fields)]
         self.fields = fields

diff --git a/RecommenderSystems/dcn/dcn_train_eval.py b/RecommenderSystems/dcn/dcn_train_eval.py
@@ -126,7 +126,7 @@ def __init__(
         self.shard_count = shard_count
         self.cur_shard = cur_shard
 
-        fields = ["Label"]
+        fields = ["label"]
         fields += [f"I{i+1}" for i in range(num_dense_fields)]
         fields += [f"C{i+1}" for i in range(num_sparse_fields)]
         self.fields = fields

diff --git a/RecommenderSystems/dcn/tools/dcn_parquet.scala b/RecommenderSystems/dcn/tools/dcn_parquet.scala
@@ -6,7 +6,7 @@ def makeDCNDataset(srcDir: String, dstDir:String) = {
     val val_csv = s"${srcDir}/valid.csv"
 
     val make_label = udf((str:String) => str.toFloat)
-    val label_cols = Seq(make_label($"Label").as("Label"))
+    val label_cols = Seq(make_label($"label").as("label"))
 
     val dense_cols = 1.to(13).map{i=>xxhash64(lit(i), col(s"I$i")).as(s"I${i}")}
 

diff --git a/RecommenderSystems/dcn/tools/split_criteo.py b/RecommenderSystems/dcn/tools/split_criteo.py
@@ -6,7 +6,7 @@
 
 
 RANDOM_SEED = 2018
-cols = ["Label"]
+cols = ["label"]
 for i in range(1, 14):
     cols.append("I" + str(i))
 for i in range(1, 27):
@@ -24,7 +24,7 @@ def split_criteo(args):
         dtype=object,
     )
     X = ddf.values
-    y = ddf["Label"].map(lambda x: float(x)).values
+    y = ddf["label"].map(lambda x: float(x)).values
     print(str(len(X)) + " lines in total")
 
     folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED).split(

diff --git a/RecommenderSystems/deepfm/deepfm_train_eval.py b/RecommenderSystems/deepfm/deepfm_train_eval.py
@@ -197,7 +197,7 @@ def __init__(
         self.shard_count = shard_count
         self.cur_shard = cur_shard
 
-        fields = ["Label"]
+        fields = ["label"]
         fields += [f"I{i+1}" for i in range(num_dense_fields)]
         fields += [f"C{i+1}" for i in range(num_sparse_fields)]
         self.fields = fields

diff --git a/RecommenderSystems/deepfm/tools/deepfm_parquet.scala b/RecommenderSystems/deepfm/tools/deepfm_parquet.scala
@@ -6,7 +6,7 @@ def makeDeepfmDataset(srcDir: String, dstDir:String) = {
     val val_csv = s"${srcDir}/valid.csv"
 
     val make_label = udf((str:String) => str.toFloat)
-    val label_cols = Seq(make_label($"Label").as("Label"))
+    val label_cols = Seq(make_label($"label").as("label"))
 
     val dense_cols = 1.to(13).map{i=>xxhash64(lit(i), col(s"I$i")).as(s"I${i}")}
 

diff --git a/RecommenderSystems/deepfm/tools/split_criteo_kaggle.py b/RecommenderSystems/deepfm/tools/split_criteo_kaggle.py
@@ -10,7 +10,7 @@ def split_train_val_test(input_dir, output_dir):
     num_dense_fields = 13
     num_sparse_fields = 26
 
-    fields = ["Label"]
+    fields = ["label"]
     fields += [f"I{i+1}" for i in range(num_dense_fields)]
     fields += [f"C{i+1}" for i in range(num_sparse_fields)]
 
@@ -23,7 +23,7 @@ def split_train_val_test(input_dir, output_dir):
         dtype=object,
     )
     X = ddf.values
-    y = ddf["Label"].map(lambda x: float(x)).values
+    y = ddf["label"].map(lambda x: float(x)).values
     print(f"{len(X)} samples in total")
 
     folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)

diff --git a/RecommenderSystems/dlrm/README.md b/RecommenderSystems/dlrm/README.md
@@ -106,6 +106,7 @@ In `/path/to/dlrm_parquet`, move all `parquet` files in folder `shuffled_day_par
 $ mkdir train
 $ mv ./shuffled_day_parts/day_part_*/*.parquet train/.
 ```
+Note: in `criteo1t_parquet_day_by_day.scala`, the data type of categorical columns C1-C26 is `int32`.
 
 ## Start training by Oneflow
 The following command will launch 8 oneflow dlrm training and evaluation processes on a node with 8 GPU devices, by specifying `data_dir` for data input and `persistent_path` for the OneEmbedding persistent store path.

diff --git a/RecommenderSystems/pnn/pnn_train_eval.py b/RecommenderSystems/pnn/pnn_train_eval.py
@@ -177,7 +177,7 @@ def __init__(
         self.shard_count = shard_count
         self.cur_shard = cur_shard
 
-        fields = ["Label"]
+        fields = ["label"]
         fields += [f"I{i+1}" for i in range(num_dense_fields)]
         fields += [f"C{i+1}" for i in range(num_sparse_fields)]
         self.fields = fields

diff --git a/RecommenderSystems/pnn/tools/criteo_parquet.py b/RecommenderSystems/pnn/tools/criteo_parquet.py
@@ -27,10 +27,10 @@ def make_pnn_parquet(
 ):
     sparse_names = [f"C{i}" for i in range(1, 27)]
     dense_names = [f"I{i}" for i in range(1, 14)]
-    column_names = ["Label"] + dense_names + sparse_names
+    column_names = ["label"] + dense_names + sparse_names
 
     make_label = udf(lambda s: int(s), IntegerType())
-    label_col = make_label("Label").alias("Label")
+    label_col = make_label("label").alias("label")
 
     if mod_idx <= 0:
         dense_cols = [

diff --git a/RecommenderSystems/pnn/tools/split_criteo_kaggle.py b/RecommenderSystems/pnn/tools/split_criteo_kaggle.py
@@ -10,7 +10,7 @@ def split_train_val_test(input_dir, output_dir):
     num_dense_fields = 13
     num_sparse_fields = 26
 
-    fields = ["Label"]
+    fields = ["label"]
     fields += [f"I{i+1}" for i in range(num_dense_fields)]
     fields += [f"C{i+1}" for i in range(num_sparse_fields)]
 
@@ -23,7 +23,7 @@ def split_train_val_test(input_dir, output_dir):
         dtype=object,
     )
     X = ddf.values
-    y = ddf["Label"].map(lambda x: float(x)).values
+    y = ddf["label"].map(lambda x: float(x)).values
     print(f"{len(X)} samples in total")
 
     folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)

diff --git a/RecommenderSystems/xdeepfm/tools/split_criteo_kaggle.py b/RecommenderSystems/xdeepfm/tools/split_criteo_kaggle.py
@@ -10,7 +10,7 @@ def split_train_val_test(input_dir, output_dir):
     num_dense_fields = 13
     num_sparse_fields = 26
 
-    fields = ["Label"]
+    fields = ["label"]
     fields += [f"I{i+1}" for i in range(num_dense_fields)]
     fields += [f"C{i+1}" for i in range(num_sparse_fields)]
 
@@ -23,7 +23,7 @@ def split_train_val_test(input_dir, output_dir):
         dtype=object,
     )
     X = ddf.values
-    y = ddf["Label"].map(lambda x: float(x)).values
+    y = ddf["label"].map(lambda x: float(x)).values
     print(f"{len(X)} samples in total")
 
     folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)

diff --git a/RecommenderSystems/xdeepfm/tools/xdeepfm_parquet.scala b/RecommenderSystems/xdeepfm/tools/xdeepfm_parquet.scala
@@ -6,7 +6,7 @@ def makexDeepfmDataset(srcDir: String, dstDir:String) = {
     val val_csv = s"${srcDir}/valid.csv"
 
     val make_label = udf((str:String) => str.toFloat)
-    val label_cols = Seq(make_label($"Label").as("Label"))
+    val label_cols = Seq(make_label($"label").as("label"))
 
     val dense_cols = 1.to(13).map{i=>xxhash64(lit(i), col(s"I$i")).as(s"I${i}")}
 

diff --git a/RecommenderSystems/xdeepfm/xdeepfm_train_eval.py b/RecommenderSystems/xdeepfm/xdeepfm_train_eval.py
@@ -194,7 +194,7 @@ def __init__(
         self.shard_count = shard_count
         self.cur_shard = cur_shard
 
-        fields = ["Label"]
+        fields = ["label"]
         fields += [f"I{i+1}" for i in range(num_dense_fields)]
         fields += [f"C{i+1}" for i in range(num_sparse_fields)]
         self.fields = fields