Switch to V2 collectives

There are still some known performance issues, though they don't appear to be blocking:
1) We don't have scoped allocator support for v2 collective ops. The non-scoped-allocator concat/split adds about 4ms of step time to BERT pretraining.
2) Instance params are effectively leaked, but the effect is hidden by a far more serious leak in ScopedAllocatorContainer.
3) The overhead of capture_call_time_value is a function of the number of collectives, and can cause serious performance issues when there are >100 collectives, which is uncommon.
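
If the v2 path regresses a workload, the old behavior can be restored by flipping the same class attributes this change updates. A minimal sketch (note these are private attributes on CollectiveReplicaLauncher, not a public API, and should be set before any collectives are launched):

  from tensorflow.python.distribute import cross_device_utils

  # Revert to the pre-change defaults: v1 collectives with scoped allocator.
  cross_device_utils.CollectiveReplicaLauncher._use_collective_v2 = False
  cross_device_utils.CollectiveReplicaLauncher._use_scoped_allocator = True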

PiperOrigin-RevId: 341885335
Change-Id: If8f773847b18fd5ae37a00e3a4c2563ec913b907
diff --git a/tensorflow/python/distribute/cross_device_ops_test.py b/tensorflow/python/distribute/cross_device_ops_test.py
index a5818c3..983e1db 100644
--- a/tensorflow/python/distribute/cross_device_ops_test.py
+++ b/tensorflow/python/distribute/cross_device_ops_test.py
@@ -108,8 +108,8 @@
   context.context().enable_collective_ops(server_def)
   # Recover default flag values.
   cross_device_ops_lib.CollectiveAllReduce._limited_nccl = True
-  cross_device_utils.CollectiveReplicaLauncher._use_scoped_allocator = True
-  cross_device_utils.CollectiveReplicaLauncher._use_collective_v2 = False
+  cross_device_utils.CollectiveReplicaLauncher._use_scoped_allocator = False
+  cross_device_utils.CollectiveReplicaLauncher._use_collective_v2 = True
   cross_device_utils.CollectiveReplicaLauncher._use_ordering_token = False
 
 
diff --git a/tensorflow/python/distribute/cross_device_utils.py b/tensorflow/python/distribute/cross_device_utils.py
index d90c3b7..f8090c5 100644
--- a/tensorflow/python/distribute/cross_device_utils.py
+++ b/tensorflow/python/distribute/cross_device_utils.py
@@ -257,8 +257,8 @@
 class CollectiveReplicaLauncher(object):
   """Launch collectives on one replica."""
 
-  _use_scoped_allocator = True
-  _use_collective_v2 = False
+  _use_scoped_allocator = False
+  _use_collective_v2 = True
   _use_ordering_token = False
 
   def __init__(self,