Switch to V2 collectives
There are still some performance issues, though they don't seem to be blocking:
1) we don't have a scoped allocator for v2 collective ops. The non-scoped-allocator concat/split adds about 4ms of step time to BERT pretraining.
2) instance params are effectively leaked, but the effect is hidden by a far more serious leak from ScopedAllocatorContainer.
3) The overhead of capture_call_time_value is a function of the number of collectives, and can cause serious performance issues if there are >100 collectives, which is uncommon.
PiperOrigin-RevId: 341885335
Change-Id: If8f773847b18fd5ae37a00e3a4c2563ec913b907
diff --git a/tensorflow/python/distribute/cross_device_ops_test.py b/tensorflow/python/distribute/cross_device_ops_test.py
index a5818c3..983e1db 100644
--- a/tensorflow/python/distribute/cross_device_ops_test.py
+++ b/tensorflow/python/distribute/cross_device_ops_test.py
@@ -108,8 +108,8 @@
context.context().enable_collective_ops(server_def)
# Recover default flag values.
cross_device_ops_lib.CollectiveAllReduce._limited_nccl = True
- cross_device_utils.CollectiveReplicaLauncher._use_scoped_allocator = True
- cross_device_utils.CollectiveReplicaLauncher._use_collective_v2 = False
+ cross_device_utils.CollectiveReplicaLauncher._use_scoped_allocator = False
+ cross_device_utils.CollectiveReplicaLauncher._use_collective_v2 = True
cross_device_utils.CollectiveReplicaLauncher._use_ordering_token = False
diff --git a/tensorflow/python/distribute/cross_device_utils.py b/tensorflow/python/distribute/cross_device_utils.py
index d90c3b7..f8090c5 100644
--- a/tensorflow/python/distribute/cross_device_utils.py
+++ b/tensorflow/python/distribute/cross_device_utils.py
@@ -257,8 +257,8 @@
class CollectiveReplicaLauncher(object):
"""Launch collectives on one replica."""
- _use_scoped_allocator = True
- _use_collective_v2 = False
+ _use_scoped_allocator = False
+ _use_collective_v2 = True
_use_ordering_token = False
def __init__(self,