Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NewComm] Set use_new_comm true in default. #58073

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
37f31d7
[NewComm] Set Flags_dynamic_static_unified_comm `True` in default.
GhostScreaming Sep 26, 2023
515d9fe
Polish code.
GhostScreaming Sep 26, 2023
23698a0
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
GhostScreaming Sep 27, 2023
3375579
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
GhostScreaming Sep 27, 2023
6c6d349
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
GhostScreaming Sep 28, 2023
c7e8ca3
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
GhostScreaming Oct 9, 2023
871df0e
Fix problems of distributed testcases using new comm lib.
GhostScreaming Oct 9, 2023
8176b02
Fix problems of testcases using new comm lib in default.
GhostScreaming Oct 10, 2023
9c8edd3
Fix failed testcase.
GhostScreaming Oct 10, 2023
f7730d7
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
GhostScreaming Oct 11, 2023
a80666b
Fix falied testcases.
GhostScreaming Oct 11, 2023
316c01d
[NewComm] Set use_new_comm in default.
GhostScreaming Oct 13, 2023
c1ec85d
[NewComm] Set use_new_comm in default.
GhostScreaming Oct 13, 2023
6800682
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
GhostScreaming Oct 13, 2023
f440900
Fix some problems.
GhostScreaming Oct 16, 2023
9c9f9d6
Fix problems of testcases.
GhostScreaming Oct 17, 2023
49ea8d0
Disable test_fleet_private_function testcase.
GhostScreaming Oct 18, 2023
4ae3b18
Remove test_fleet_private_function testcase.
GhostScreaming Oct 18, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
# limitations under the License

import hashlib
import os
from collections import OrderedDict

import paddle
Expand Down Expand Up @@ -158,10 +157,10 @@ def instantiate(self):
strategy.nrings = 1
if core.is_compiled_with_cuda():
place = core.CUDAPlace(genv.device_id)
use_new_comm = os.getenv(
"FLAGS_dynamic_static_unified_comm", "0"
)
if use_new_comm in ["1", "True", "true"]:
use_new_comm = paddle.get_flags(
"FLAGS_dynamic_static_unified_comm"
)["FLAGS_dynamic_static_unified_comm"]
if use_new_comm:
store = core.create_or_get_global_tcp_store()
endpoints_str = ""
for endpoint in strategy.trainer_endpoints:
Expand Down
11 changes: 11 additions & 0 deletions python/paddle/distributed/fleet/base/private_helper_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
import time
from contextlib import closing

import paddle

__all__ = []


Expand All @@ -33,6 +35,15 @@ def wait_server_ready(endpoints):

>>> wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"])
"""
try:
use_new_comm = paddle.get_flags("FLAGS_dynamic_static_unified_comm")[
"FLAGS_dynamic_static_unified_comm"
]
except:
use_new_comm = False

if use_new_comm:
return
assert not isinstance(endpoints, str)
while True:
all_ok = True
Expand Down
7 changes: 4 additions & 3 deletions python/paddle/distributed/fleet/meta_optimizers/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import paddle
from paddle.framework import core
Expand Down Expand Up @@ -99,8 +98,10 @@ def _init_communicator(
other_endpoints.remove(current_endpoint)

if rank == 0 and wait_port:
use_new_comm = os.getenv("FLAGS_dynamic_static_unified_comm", "0")
if use_new_comm not in [1, "1", "True", "true"]:
use_new_comm = paddle.get_flags(
"FLAGS_dynamic_static_unified_comm"
)["FLAGS_dynamic_static_unified_comm"]
if not use_new_comm:
wait_server_ready(other_endpoints)

def _add_sync_by_allreduce(block):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import os

import paddle
from paddle.base import core
from paddle.incubate.optimizer import PipelineOptimizer
from paddle.static import (
Expand Down Expand Up @@ -714,8 +715,10 @@ def minimize_impl(
self._recreate_not_persist_param_as_var()

self._dump_program_for_debug()
use_new_comm = os.getenv("FLAGS_dynamic_static_unified_comm", "0")
if use_new_comm not in ["1", "True", "true"]:
use_new_comm = paddle.get_flags("FLAGS_dynamic_static_unified_comm")[
"FLAGS_dynamic_static_unified_comm"
]
if not use_new_comm:
self._wait()
return optimize_ops, params_grads

Expand Down
5 changes: 0 additions & 5 deletions test/collective/fleet/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -680,11 +680,6 @@ if((WITH_GPU OR WITH_XPU) AND (LINUX OR WIN32))
test_fleet_recompute_meta_optimizer ENVS
"http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
endif()
if(LOCAL_ALL_ARCH AND (LINUX OR WIN32))
py_test_modules(
test_fleet_private_function MODULES test_fleet_private_function ENVS
"http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
endif()
if((WITH_GPU OR WITH_XPU) AND LOCAL_ALL_PLAT)
bash_test_modules(
test_new_group
Expand Down
47 changes: 0 additions & 47 deletions test/collective/fleet/test_fleet_private_function.py

This file was deleted.

1 change: 0 additions & 1 deletion test/collective/fleet/testslist.csv
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ test_parallel_dygraph_sparse_embedding_over_height,,ROCM,350,DIST,../../legacy_t
test_distributed_strategy,LINUX;APPLE,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,
test_auto_parallel_parallelizer,,,120,DIST,../../legacy_test/dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,
test_fleet_recompute_meta_optimizer,LINUX;WIN32,GPU;XPU,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,
test_fleet_private_function,LINUX;WIN32,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,
test_new_group,,GPU;XPU,,DIST,test_new_group.sh,2,,http_proxy=;https_proxy=,
test_c_comm_init_op,LINUX,GPU;XPU,120,DIST,test_c_comm_init_op.sh,2,,http_proxy=;https_proxy=,
test_fused_attention_pass_with_mp,LINUX,GPU,120,DIST,test_fused_attention_pass_with_mp.sh,2,,http_proxy=;https_proxy=,
Expand Down
2 changes: 0 additions & 2 deletions tools/parallel_UT_rule.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,6 @@
'test_depthwise_conv_mkldnn_pass',
'test_fleet_metric',
'test_fc_fuse_pass_cc',
'test_fleet_private_function',
'test_fleet',
'test_executor_check_feed',
'test_py_reader_lod_level_share',
Expand Down Expand Up @@ -2121,7 +2120,6 @@
'test_dgc_optimizer',
'heter_server_test',
'test_custom_conj',
'test_fleet_private_function',
'test_fake_init_op',
'brpc_service_sparse_sgd_test',
'test_tf32_cudnn',
Expand Down