This is a training question about dlrm_recommendation_v2. The data I use is fake Criteo data generated by the inference part of DLRM.
The environment I'm using is CUDA with Python 3.8; dependencies were installed with pip install -r requirements.txt.
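For reference, here is a minimal way to print the build information from this environment that matters for fbgemm_gpu operator registration (a sketch; it only uses standard torch attributes):

```python
import torch

# Build information relevant to whether the fbgemm_gpu CUDA ops can load
print(torch.__version__)          # installed PyTorch version
print(torch.version.cuda)         # CUDA version PyTorch was built against
print(torch.cuda.is_available())  # whether a GPU is actually visible
```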
I use the following commands:
export TOTAL_TRAINING_SAMPLES=4195197692 ;
export GLOBAL_BATCH_SIZE=65536 ;
export WORLD_SIZE=2 ;
torchx run -s local_cwd dist.ddp -j 1x2 --script dlrm_main.py -- --embedding_dim 128 --dense_arch_layer_sizes 512,256,128 --over_arch_layer_sizes 1024,1024,512,256,1 --in_memory_binary_criteo_path ../fake_criteo/numpy_contiguous_shuffled_output_dataset_dir/ --num_embeddings_per_feature 40000000,39060,17295,7424,20265,3,7122,1543,63,40000000,3067956,405282,10,2209,11938,155,4,976,14,40000000,40000000,40000000,590152,12973,108,36 --validation_freq_within_epoch $((TOTAL_TRAINING_SAMPLES / (GLOBAL_BATCH_SIZE * 40))) --epochs 1 --pin_memory --mmap_mode --batch_size $((GLOBAL_BATCH_SIZE / WORLD_SIZE)) --interaction_type=dcn --dcn_num_layers=3 --dcn_low_rank_dim=512 --adagrad --learning_rate 0.005 --multi_hot_distribution_type uniform --multi_hot_sizes=3,2,1,2,6,1,1,1,1,7,3,8,1,6,9,5,1,1,1,12,100,27,10,3,1,1
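For clarity, the shell arithmetic in that command works out as follows (a small sketch using the exported values; bash $(( )) does integer division, just like // here):

```python
# Values taken from the exports above
TOTAL_TRAINING_SAMPLES = 4195197692
GLOBAL_BATCH_SIZE = 65536
WORLD_SIZE = 2

validation_freq_within_epoch = TOTAL_TRAINING_SAMPLES // (GLOBAL_BATCH_SIZE * 40)
per_rank_batch_size = GLOBAL_BATCH_SIZE // WORLD_SIZE
print(validation_freq_within_epoch)  # 1600
print(per_rank_batch_size)           # 32768
```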
The error is as follows:
dlrm_main/0 [1]:Traceback (most recent call last):
dlrm_main/0 [1]: File "/opt/miniconda3/envs/dlrm_training_py38/lib/python3.8/site-packages/torch/_ops.py", line 501, in __getattr__
dlrm_main/0 [1]: op, overload_names = torch._C._jit_get_operation(qualified_op_name)
dlrm_main/0 [1]:RuntimeError: No such operator fbgemm::new_managed_tensor
dlrm_main/0 [1]:
dlrm_main/0 [1]:The above exception was the direct cause of the following exception:
dlrm_main/0 [1]:
dlrm_main/0 [1]:Traceback (most recent call last):
dlrm_main/0 [1]: File "dlrm_main.py", line 998, in <module>
dlrm_main/0 [1]: main(sys.argv[1:])
dlrm_main/0 [1]: File "dlrm_main.py", line 868, in main
dlrm_main/0 [1]: model = DistributedModelParallel(
dlrm_main/0 [1]: File "/opt/miniconda3/envs/dlrm_training_py38/lib/python3.8/site-packages/torchrec/distributed/model_parallel.py", line 225, in __init__
dlrm_main/0 [1]: self._dmp_wrapped_module: nn.Module = self._init_dmp(module)
dlrm_main/0 [1]: File "/opt/miniconda3/envs/dlrm_training_py38/lib/python3.8/site-packages/torchrec/distributed/model_parallel.py", line 283, in _init_dmp
dlrm_main/0 [1]: return self._shard_modules_impl(module)
dlrm_main/0 [1]: File "/opt/miniconda3/envs/dlrm_training_py38/lib/python3.8/site-packages/torchrec/distributed/model_parallel.py", line 330, in _shard_modules_impl
dlrm_main/0 [1]: child = self._shard_modules_impl(
dlrm_main/0 [1]: File "/opt/miniconda3/envs/dlrm_training_py38/lib/python3.8/site-packages/torchrec/distributed/model_parallel.py", line 330, in _shard_modules_impl
dlrm_main/0 [1]: child = self._shard_modules_impl(
dlrm_main/0 [1]: File "/opt/miniconda3/envs/dlrm_training_py38/lib/python3.8/site-packages/torchrec/distributed/model_parallel.py", line 330, in _shard_modules_impl
dlrm_main/0 [1]: child = self._shard_modules_impl(
dlrm_main/0 [1]: File "/opt/miniconda3/envs/dlrm_training_py38/lib/python3.8/site-packages/torchrec/distributed/model_parallel.py", line 321, in _shard_modules_impl
dlrm_main/0 [1]: module = self._sharder_map[sharder_key].shard(
dlrm_main/0 [1]: File "/opt/miniconda3/envs/dlrm_training_py38/lib/python3.8/site-packages/torchrec/distributed/embeddingbag.py", line 570, in shard
dlrm_main/0 [1]: return ShardedEmbeddingBagCollection(
dlrm_main/0 [1]: File "/opt/miniconda3/envs/dlrm_training_py38/lib/python3.8/site-packages/torchrec/distributed/embeddingbag.py", line 332, in __init__
dlrm_main/0 [1]: self._create_lookups()
dlrm_main/0 [1]: File "/opt/miniconda3/envs/dlrm_training_py38/lib/python3.8/site-packages/torchrec/distributed/embeddingbag.py", line 394, in _create_lookups
dlrm_main/0 [1]: self._lookups.append(sharding.create_lookup())
dlrm_main/0 [1]: File "/opt/miniconda3/envs/dlrm_training_py38/lib/python3.8/site-packages/torchrec/distributed/sharding/tw_sharding.py", line 397, in create_lookup
dlrm_main/0 [1]: return GroupedPooledEmbeddingsLookup(
dlrm_main/0 [1]: File "/opt/miniconda3/envs/dlrm_training_py38/lib/python3.8/site-packages/torchrec/distributed/embedding_lookup.py", line 232, in __init__
dlrm_main/0 [1]: self._emb_modules.append(_create_lookup(config, device))
dlrm_main/0 [1]: File "/opt/miniconda3/envs/dlrm_training_py38/lib/python3.8/site-packages/torchrec/distributed/embedding_lookup.py", line 219, in _create_lookup
dlrm_main/0 [1]: return BatchedFusedEmbeddingBag(
dlrm_main/0 [1]: File "/opt/miniconda3/envs/dlrm_training_py38/lib/python3.8/site-packages/torchrec/distributed/batched_embedding_kernel.py", line 612, in __init__
dlrm_main/0 [1]: SplitTableBatchedEmbeddingBagsCodegen(
dlrm_main/0 [1]: File "/opt/miniconda3/envs/dlrm_training_py38/lib/python3.8/site-packages/fbgemm_gpu/split_table_batched_embeddings_ops.py", line 356, in __init__
dlrm_main/0 [1]: self._apply_split(
dlrm_main/0 [1]: File "/opt/miniconda3/envs/dlrm_training_py38/lib/python3.8/site-packages/fbgemm_gpu/split_table_batched_embeddings_ops.py", line 1167, in _apply_split
dlrm_main/0 [1]: out=torch.ops.fbgemm.new_managed_tensor(
dlrm_main/0 [1]: File "/opt/miniconda3/envs/dlrm_training_py38/lib/python3.8/site-packages/torch/_ops.py", line 505, in __getattr__
dlrm_main/0 [1]: raise AttributeError(
dlrm_main/0 [1]:AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'new_managed_tensor'
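The failure happens when torch looks up the fbgemm::new_managed_tensor operator. As a sanity check (a sketch, not a fix), importing fbgemm_gpu should register the fbgemm operators; if the check below prints False, the installed fbgemm_gpu build does not expose this op (for example, a CPU-only wheel or a build that does not match this torch/CUDA combination):

```python
import torch
import fbgemm_gpu  # importing the package loads the extension and registers the fbgemm::* ops

# new_managed_tensor is the operator the traceback above fails to resolve
print(hasattr(torch.ops.fbgemm, "new_managed_tensor"))
```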