/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "ruy/pmu.h"

#include "ruy/check_macros.h"

#ifdef __linux__
#include <asm/unistd.h>
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <syscall.h>
#include <unistd.h>

#include <cstdio>
#endif

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <cstring>

namespace ruy {

// Linux-specific. Not ARM-specific.
#ifdef __linux__
// Wrapper around a single Linux perf event counter opened through the
// perf_event_open syscall.
//
// Usage is Start() -> Stop() -> Count(); the RUY_CHECKs below enforce that
// ordering. If the counter could not be opened, the failure is reported on
// stderr once and the object degrades to a no-op whose Count() is 0, so that
// one unavailable event does not take the whole process down.
class PerfEvent {
 public:
  // Opens a counter for the event identified by (`type`, `config`), e.g.
  // PERF_TYPE_RAW with a raw PMU event number. The counter is created
  // disabled, excludes kernel and hypervisor activity, and is inherited by
  // child tasks.
  PerfEvent(std::uint32_t type, std::uint64_t config) {
    perf_event_attr pe;
    memset(&pe, 0, sizeof(pe));
    pe.size = sizeof(pe);
    pe.type = type;
    pe.config = config;
    pe.disabled = 1;
    pe.exclude_kernel = 1;
    pe.exclude_hv = 1;
    pe.inherit = 1;
    // Trailing arguments are pid=0, cpu=-1, group_fd=-1, flags=0 (see
    // perf_event_open(2)). syscall() returns long; a file descriptor or -1
    // always fits in an int.
    fd_ = static_cast<int>(syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0));
    if (fd_ == -1) {
      // Deliberately not fatal: the event is simply treated as unavailable.
      fprintf(stderr, "perf_event_open failed for config 0x%lx\n",
              static_cast<unsigned long>(config));
    }
  }

  // Closes the counter. It is an error to destroy a counter that is still
  // started.
  ~PerfEvent() {
    RUY_CHECK(!started_);
    if (fd_ != -1) {
      close(fd_);
    }
  }

  // The object owns `fd_` and closes it in the destructor, so a copy would
  // close the same descriptor twice.
  PerfEvent(const PerfEvent&) = delete;
  PerfEvent& operator=(const PerfEvent&) = delete;

  // Resets and enables the counter, then records its starting value.
  // Must not already be started.
  void Start() {
    RUY_CHECK(!started_);
    started_ = true;
    if (fd_ == -1) {
      // Counter unavailable: keep the start/stop values equal so Count() is 0.
      return;
    }
    ioctl(fd_, PERF_EVENT_IOC_RESET, 0);
    ioctl(fd_, PERF_EVENT_IOC_ENABLE, 0);
    count_at_start_ = Read();
  }

  // Disables the counter and records its final value. Must be started.
  void Stop() {
    RUY_CHECK(started_);
    started_ = false;
    if (fd_ == -1) {
      return;
    }
    ioctl(fd_, PERF_EVENT_IOC_DISABLE, 0);
    count_at_stop_ = Read();
  }

  // Returns the number of events counted between the last Start() and Stop().
  // Returns 0 if the counter was never run or could not be opened. Must not
  // be called while started.
  std::int64_t Count() const {
    RUY_CHECK(!started_);
    return count_at_stop_ - count_at_start_;
  }

 private:
  // Reads the current raw counter value from the perf event descriptor.
  std::int64_t Read() const {
    // Zero-initialized so that a short read cannot yield an indeterminate
    // value.
    std::int64_t count = 0;
    RUY_CHECK_NE(read(fd_, &count, sizeof(count)), -1);
    return count;
  }
  // Both default to the same value so that Count() is 0 before any run.
  std::int64_t count_at_start_ = -1;
  std::int64_t count_at_stop_ = -1;
  bool started_ = false;
  // perf event file descriptor, or -1 if perf_event_open failed.
  int fd_ = -1;
};
#else
// Stand-in used when not building for Linux, so that the rest of this file
// still compiles. Every operation is a no-op and the reported count is
// always zero. PERF_TYPE_RAW is defined here only because the code below
// passes it to the PerfEvent constructor.
#define PERF_TYPE_RAW 0
class PerfEvent {
 public:
  // Both arguments are accepted for interface compatibility and ignored.
  PerfEvent(std::uint32_t /*type*/, std::uint64_t /*config*/) {}
  ~PerfEvent() = default;

  // Nothing to start or stop without perf event support.
  void Start() {}
  void Stop() {}

  // No events are ever counted by the placeholder.
  std::int64_t Count() const {
    return 0;
  }
};
#endif

// ARM-specific. Query ARM PMU counters as Linux perf events using
// PERF_TYPE_RAW: each constant below is a raw event number intended to be
// passed as the `config` argument of PerfEvent.
namespace arm_pmuv3 {

// Only a handful of these constants are referenced in this file, so silence
// the unused-const-variable warning for the whole table and restore the
// previous diagnostic state after it.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-const-variable"

// These event numbers are listed in the ARMv8 architecture reference manual.
// Note that the list is not contiguous (e.g. 0x31 and 0x35 have no entry
// here).
constexpr std::uint16_t L1I_CACHE_REFILL = 0x01;
constexpr std::uint16_t L1I_TLB_REFILL = 0x02;
constexpr std::uint16_t L1D_CACHE_REFILL = 0x03;
constexpr std::uint16_t L1D_CACHE = 0x04;
constexpr std::uint16_t L1D_TLB_REFILL = 0x05;
constexpr std::uint16_t LD_RETIRED = 0x06;
constexpr std::uint16_t ST_RETIRED = 0x07;
constexpr std::uint16_t INST_RETIRED = 0x08;
constexpr std::uint16_t EXC_TAKEN = 0x09;
constexpr std::uint16_t EXC_RETURN = 0x0A;
constexpr std::uint16_t CID_WRITE_RETIRED = 0x0B;
constexpr std::uint16_t PC_WRITE_RETIRED = 0x0C;
constexpr std::uint16_t BR_IMMED_RETIRED = 0x0D;
constexpr std::uint16_t BR_RETURN_RETIRED = 0x0E;
constexpr std::uint16_t UNALIGNED_LDST_RETIRED = 0x0F;
constexpr std::uint16_t BR_MIS_PRED = 0x10;
constexpr std::uint16_t CPU_CYCLES = 0x11;
constexpr std::uint16_t BR_PRED = 0x12;
constexpr std::uint16_t MEM_ACCESS = 0x13;
constexpr std::uint16_t L1I_CACHE = 0x14;
constexpr std::uint16_t L1D_CACHE_WB = 0x15;
constexpr std::uint16_t L2D_CACHE = 0x16;
constexpr std::uint16_t L2D_CACHE_REFILL = 0x17;
constexpr std::uint16_t L2D_CACHE_WB = 0x18;
constexpr std::uint16_t BUS_ACCESS = 0x19;
constexpr std::uint16_t MEMORY_ERROR = 0x1A;
constexpr std::uint16_t INST_SPEC = 0x1B;
constexpr std::uint16_t TTBR_WRITE_RETIRED = 0x1C;
constexpr std::uint16_t BUS_CYCLES = 0x1D;
constexpr std::uint16_t CHAIN = 0x1E;
constexpr std::uint16_t L1D_CACHE_ALLOCATE = 0x1F;
constexpr std::uint16_t L2D_CACHE_ALLOCATE = 0x20;
constexpr std::uint16_t BR_RETIRED = 0x21;
constexpr std::uint16_t BR_MIS_PRED_RETIRED = 0x22;
constexpr std::uint16_t STALL_FRONTEND = 0x23;
constexpr std::uint16_t STALL_BACKEND = 0x24;
constexpr std::uint16_t L1D_TLB = 0x25;
constexpr std::uint16_t L1I_TLB = 0x26;
constexpr std::uint16_t L2I_CACHE = 0x27;
constexpr std::uint16_t L2I_CACHE_REFILL = 0x28;
constexpr std::uint16_t L3D_CACHE_ALLOCATE = 0x29;
constexpr std::uint16_t L3D_CACHE_REFILL = 0x2A;
constexpr std::uint16_t L3D_CACHE = 0x2B;
constexpr std::uint16_t L3D_CACHE_WB = 0x2C;
constexpr std::uint16_t L2D_TLB_REFILL = 0x2D;
constexpr std::uint16_t L2I_TLB_REFILL = 0x2E;
constexpr std::uint16_t L2D_TLB = 0x2F;
constexpr std::uint16_t L2I_TLB = 0x30;
constexpr std::uint16_t LL_CACHE = 0x32;
constexpr std::uint16_t LL_CACHE_MISS = 0x33;
constexpr std::uint16_t DTLB_WALK = 0x34;
constexpr std::uint16_t LL_CACHE_RD = 0x36;
constexpr std::uint16_t LL_CACHE_MISS_RD = 0x37;

// Additional implementation-defined events. These were collected from
// assorted online sources rather than from the architecture manual list
// above, so treat them as unverified: availability and exact meaning may
// differ between CPU implementations.
constexpr std::uint16_t L1D_CACHE_RD = 0x40;
constexpr std::uint16_t L1D_CACHE_REFILL_RD = 0x42;
constexpr std::uint16_t L1D_TLB_REFILL_RD = 0x4C;
constexpr std::uint16_t L1D_TLB_RD = 0x4E;
constexpr std::uint16_t L2D_CACHE_RD = 0x50;
constexpr std::uint16_t L2D_CACHE_REFILL_RD = 0x52;
constexpr std::uint16_t BUS_ACCESS_RD = 0x60;
constexpr std::uint16_t MEM_ACCESS_RD = 0x66;
constexpr std::uint16_t L3D_CACHE_RD = 0xA0;
constexpr std::uint16_t L3D_CACHE_REFILL_RD = 0xA2;

#pragma GCC diagnostic pop

}  // namespace arm_pmuv3

// Holds one PerfEvent per PMU event that PmuEvents reports. Every counter is
// opened as a PERF_TYPE_RAW event using the matching arm_pmuv3 event number.
// Members are initialized in declaration order, so the counters are opened
// in the order listed below. All members are private; PmuEvents reaches them
// through the friend declaration.
class PmuEventsPrivate {
 public:
  PmuEventsPrivate() = default;

 private:
  friend class PmuEvents;

  // Cache refill counters.
  PerfEvent l1d_cache_refill{PERF_TYPE_RAW, arm_pmuv3::L1D_CACHE_REFILL};
  PerfEvent l2d_cache_refill{PERF_TYPE_RAW, arm_pmuv3::L2D_CACHE_REFILL};
  PerfEvent l3d_cache_refill{PERF_TYPE_RAW, arm_pmuv3::L3D_CACHE_REFILL};
  PerfEvent ll_cache_miss{PERF_TYPE_RAW, arm_pmuv3::LL_CACHE_MISS};

  // TLB refill counters.
  PerfEvent l1d_tlb_refill{PERF_TYPE_RAW, arm_pmuv3::L1D_TLB_REFILL};
  PerfEvent l2d_tlb_refill{PERF_TYPE_RAW, arm_pmuv3::L2D_TLB_REFILL};

  // Pipeline stall and branch misprediction counters.
  PerfEvent stall_frontend{PERF_TYPE_RAW, arm_pmuv3::STALL_FRONTEND};
  PerfEvent stall_backend{PERF_TYPE_RAW, arm_pmuv3::STALL_BACKEND};
  PerfEvent br_mis_pred{PERF_TYPE_RAW, arm_pmuv3::BR_MIS_PRED};

  // Cache write-back counters.
  PerfEvent l1d_cache_writeback{PERF_TYPE_RAW, arm_pmuv3::L1D_CACHE_WB};
  PerfEvent l2d_cache_writeback{PERF_TYPE_RAW, arm_pmuv3::L2D_CACHE_WB};
};

// Allocates the private counter bundle. Constructing PmuEventsPrivate
// constructs one PerfEvent for each tracked event.
PmuEvents::PmuEvents() : priv(new PmuEventsPrivate) {}
// Frees the counter bundle allocated by the constructor.
// NOTE(review): `priv` is deleted manually here; the PmuEvents declaration is
// not visible in this file, so confirm that copying PmuEvents is disabled
// there, otherwise a copy would delete the same bundle twice.
PmuEvents::~PmuEvents() { delete priv; }

void PmuEvents::StartRecording() {
  priv->l1d_cache_refill.Start();
  priv->l2d_cache_refill.Start();
  priv->l3d_cache_refill.Start();
  priv->ll_cache_miss.Start();
  priv->l1d_tlb_refill.Start();
  priv->l2d_tlb_refill.Start();
  priv->stall_frontend.Start();
  priv->stall_backend.Start();
  priv->br_mis_pred.Start();
  priv->l1d_cache_writeback.Start();
  priv->l2d_cache_writeback.Start();
}

void PmuEvents::StopRecording() {
  priv->l1d_cache_refill.Stop();
  priv->l2d_cache_refill.Stop();
  priv->l3d_cache_refill.Stop();
  priv->ll_cache_miss.Stop();
  priv->l1d_tlb_refill.Stop();
  priv->l2d_tlb_refill.Stop();
  priv->stall_frontend.Stop();
  priv->stall_backend.Stop();
  priv->br_mis_pred.Stop();
  priv->l1d_cache_writeback.Stop();
  priv->l2d_cache_writeback.Stop();
}

// Returns the BR_MIS_PRED event count from the last recording, converted to
// float.
float PmuEvents::BranchMispredictionCount() const {
  const std::int64_t mispredictions = priv->br_mis_pred.Count();
  return static_cast<float>(mispredictions);
}

// Returns the STALL_FRONTEND event count from the last recording, converted
// to float.
float PmuEvents::FrontendStallCount() const {
  const std::int64_t stalls = priv->stall_frontend.Count();
  return static_cast<float>(stalls);
}

// Returns the STALL_BACKEND event count from the last recording, converted
// to float.
float PmuEvents::BackendStallCount() const {
  const std::int64_t stalls = priv->stall_backend.Count();
  return static_cast<float>(stalls);
}

// Returns the L1D_CACHE_REFILL event count from the last recording,
// converted to float.
float PmuEvents::L1RefillCount() const {
  const std::int64_t refills = priv->l1d_cache_refill.Count();
  return static_cast<float>(refills);
}

// Returns the L2D_CACHE_REFILL event count from the last recording,
// converted to float.
float PmuEvents::L2RefillCount() const {
  const std::int64_t refills = priv->l2d_cache_refill.Count();
  return static_cast<float>(refills);
}

// Returns the larger of the L3D_CACHE_REFILL and LL_CACHE_MISS event counts
// from the last recording, converted to float.
//
// Rationale carried over from the original author's notes: some CPUs
// implement LL_CACHE_MISS[_RD] and others implement L3D_CACHE_REFILL[_RD].
// In the author's experiments, either one of the two counters was zero or
// the two roughly agreed, so taking the maximum gives a value that is more
// portable across CPUs. Those experiments also covered the _RD variants, so
// the workaround may not be strictly necessary for the non-_RD counters used
// here.
float PmuEvents::L3RefillCount() const {
  const std::int64_t l3_refills = priv->l3d_cache_refill.Count();
  const std::int64_t last_level_misses = priv->ll_cache_miss.Count();
  const std::int64_t best_estimate = std::max(l3_refills, last_level_misses);
  return static_cast<float>(best_estimate);
}

// Returns the L1D_TLB_REFILL event count from the last recording, converted
// to float.
float PmuEvents::L1TLBRefillCount() const {
  const std::int64_t refills = priv->l1d_tlb_refill.Count();
  return static_cast<float>(refills);
}

// Returns the L2D_TLB_REFILL event count from the last recording, converted
// to float.
float PmuEvents::L2TLBRefillCount() const {
  const std::int64_t refills = priv->l2d_tlb_refill.Count();
  return static_cast<float>(refills);
}

// Returns the L1D_CACHE_WB event count from the last recording, converted to
// float.
float PmuEvents::L1WritebackCount() const {
  const std::int64_t writebacks = priv->l1d_cache_writeback.Count();
  return static_cast<float>(writebacks);
}

// Returns the L2D_CACHE_WB event count from the last recording, converted to
// float.
float PmuEvents::L2WritebackCount() const {
  const std::int64_t writebacks = priv->l2d_cache_writeback.Count();
  return static_cast<float>(writebacks);
}

}  // namespace ruy
