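Address PR feedback: split DeviceTracerTest.TraceToXSpace into separate ROCm and CUDA variants, and add tf_profiler_copts() to the rocm_tracer copts.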
diff --git a/tensorflow/core/profiler/internal/gpu/BUILD b/tensorflow/core/profiler/internal/gpu/BUILD
index 98d028b..074db67 100644
--- a/tensorflow/core/profiler/internal/gpu/BUILD
+++ b/tensorflow/core/profiler/internal/gpu/BUILD
@@ -144,7 +144,7 @@
name = "rocm_tracer",
srcs = if_rocm(["rocm_tracer.cc"]),
hdrs = if_rocm(["rocm_tracer.h"]),
- copts = tf_copts(),
+ copts = tf_profiler_copts() + tf_copts(),
visibility = ["//visibility:public"],
deps = [
"//tensorflow/core:lib",
diff --git a/tensorflow/core/profiler/internal/gpu/device_tracer_test.cc b/tensorflow/core/profiler/internal/gpu/device_tracer_test.cc
index d70b902..3ba5a92 100644
--- a/tensorflow/core/profiler/internal/gpu/device_tracer_test.cc
+++ b/tensorflow/core/profiler/internal/gpu/device_tracer_test.cc
@@ -243,6 +243,7 @@
EXPECT_GE(run_metadata.step_stats().dev_stats_size(), 1);
}
+#if TENSORFLOW_USE_ROCM
TEST_F(DeviceTracerTest, TraceToXSpace) {
auto tracer = CreateGpuTracer();
if (!tracer) return;
@@ -266,13 +267,8 @@
XSpace space;
TF_ASSERT_OK(tracer->CollectData(&space));
// At least one gpu plane and one host plane for launching events.
-#if GOOGLE_CUDA
- const XPlane* host_plane = FindPlaneWithName(space, kCuptiDriverApiPlaneName);
- ASSERT_NE(host_plane, nullptr);
-#elif TENSORFLOW_USE_ROCM
const XPlane* host_plane = FindPlaneWithName(space, kRoctracerApiPlaneName);
ASSERT_NE(host_plane, nullptr);
-#endif
const XPlane* device_plane =
FindPlaneWithName(space, strings::StrCat(kGpuPlanePrefix, 0));
@@ -283,14 +279,6 @@
EXPECT_GE(device_plane->event_metadata_size(), 5);
// Check if device capacity is serialized.
XPlaneVisitor plane = CreateTfXPlaneVisitor(device_plane);
-#if GOOGLE_CUDA
- EXPECT_TRUE(plane.GetStat(kDevCapClockRateKHz).has_value());
- EXPECT_TRUE(plane.GetStat(kDevCapCoreCount).has_value());
- EXPECT_TRUE(plane.GetStat(kDevCapMemoryBandwidth).has_value());
- EXPECT_TRUE(plane.GetStat(kDevCapMemorySize).has_value());
- EXPECT_TRUE(plane.GetStat(kDevCapComputeCapMajor).has_value());
- EXPECT_TRUE(plane.GetStat(kDevCapComputeCapMinor).has_value());
-#endif
// Check if the device events timestamps are set.
int total_events = 0;
@@ -303,6 +291,60 @@
});
EXPECT_GE(total_events, 5);
}
+#else // TENSORFLOW_USE_ROCM
+TEST_F(DeviceTracerTest, TraceToXSpace) {  // Non-ROCm (CUPTI) variant.
+ auto tracer = CreateGpuTracer();
+ if (!tracer) return;  // Nothing to verify when no tracer was created.
+
+ Initialize({3, 2, -1, 0});
+ auto session = CreateSession();
+ ASSERT_TRUE(session != nullptr);
+ TF_ASSERT_OK(session->Create(def_));
+ std::vector<std::pair<string, Tensor>> inputs;
+
+ // Request two targets: one fetch output and one non-fetched output.
+ std::vector<string> output_names = {y_ + ":0"};
+ std::vector<string> target_nodes = {y_neg_};
+ std::vector<Tensor> outputs;
+
+ TF_ASSERT_OK(tracer->Start());  // Tracing brackets only the Run() call.
+ Status s = session->Run(inputs, output_names, target_nodes, &outputs);
+ TF_ASSERT_OK(s);
+
+ TF_ASSERT_OK(tracer->Stop());
+ XSpace space;
+ TF_ASSERT_OK(tracer->CollectData(&space));
+ // Expect the CUPTI driver API host plane and the GPU 0 device plane.
+ const XPlane* host_plane = FindPlaneWithName(space, kCuptiDriverApiPlaneName);
+ ASSERT_NE(host_plane, nullptr);
+
+ const XPlane* device_plane =
+ FindPlaneWithName(space, strings::StrCat(kGpuPlanePrefix, 0));
+ ASSERT_NE(device_plane, nullptr); // Check if device plane is serialized.
+ // Exactly five event metadata entries are expected: MemcpyH2D, MemcpyD2H,
+ // two for Matmul (one from Eigen, one from cudnn), and one for memset.
+ EXPECT_EQ(device_plane->event_metadata_size(), 5);
+ // Check that every kDevCap* stat below is present on the device plane.
+ XPlaneVisitor plane = CreateTfXPlaneVisitor(device_plane);
+ EXPECT_TRUE(plane.GetStat(kDevCapClockRateKHz).has_value());
+ EXPECT_TRUE(plane.GetStat(kDevCapCoreCount).has_value());
+ EXPECT_TRUE(plane.GetStat(kDevCapMemoryBandwidth).has_value());
+ EXPECT_TRUE(plane.GetStat(kDevCapMemorySize).has_value());
+ EXPECT_TRUE(plane.GetStat(kDevCapComputeCapMajor).has_value());
+ EXPECT_TRUE(plane.GetStat(kDevCapComputeCapMinor).has_value());
+
+ // Every device event must have a positive timestamp and duration.
+ int total_events = 0;
+ plane.ForEachLine([&](const tensorflow::profiler::XLineVisitor& line) {
+ line.ForEachEvent([&](const tensorflow::profiler::XEventVisitor& event) {
+ EXPECT_GT(event.TimestampNs(), 0);
+ EXPECT_GT(event.DurationNs(), 0);
+ ++total_events;
+ });
+ });
+ EXPECT_GE(total_events, 5);
+}
+#endif // TENSORFLOW_USE_ROCM
#if GOOGLE_CUDA
TEST_F(DeviceTracerTest, CudaRuntimeResource) {