Vector Downward GetSize optimization (#6925)

* Added Google benchmarks (and gtests)

* Separate benchmark CMakeLists.txt into its own file

* Move output directory setting to target just flatbenchmark

* Reduced encoding time from 210ns -> 188ns

* Store size_ as uoffset_t

* Fixed Windows C4267 warning
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 3d3032c..8d49c35 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -37,6 +37,7 @@
     ${CPP_BENCH_DIR}/benchmark_main.cpp
     ${CPP_FB_BENCH_DIR}/fb_bench.cpp
     ${CPP_RAW_BENCH_DIR}/raw_bench.cpp
+    ${CPP_BENCH_FB_GEN}
 )
 
 # Generate the flatbuffers benchmark code from the flatbuffers schema using
diff --git a/benchmarks/cpp/benchmark_main.cpp b/benchmarks/cpp/benchmark_main.cpp
index 720f94e..63807a5 100644
--- a/benchmarks/cpp/benchmark_main.cpp
+++ b/benchmarks/cpp/benchmark_main.cpp
@@ -93,4 +93,4 @@
   std::unique_ptr<Bench> bench = NewRawBench();
   Use(state, bench, buffer, 218812692406581874);
 }
-BENCHMARK(BM_Raw_Use);
\ No newline at end of file
+BENCHMARK(BM_Raw_Use);
diff --git a/include/flatbuffers/flatbuffer_builder.h b/include/flatbuffers/flatbuffer_builder.h
index 7db48e3..ce40f7b 100644
--- a/include/flatbuffers/flatbuffer_builder.h
+++ b/include/flatbuffers/flatbuffer_builder.h
@@ -285,20 +285,20 @@
     FieldLoc fl = { off, field };
     buf_.scratch_push_small(fl);
     num_field_loc++;
-    max_voffset_ = (std::max)(max_voffset_, field);
+    if (field > max_voffset_) {
+      max_voffset_ = field;
+    }
   }
 
   // Like PushElement, but additionally tracks the field this represents.
   template<typename T> void AddElement(voffset_t field, T e, T def) {
     // We don't serialize values equal to the default.
     if (IsTheSameAs(e, def) && !force_defaults_) return;
-    auto off = PushElement(e);
-    TrackField(field, off);
+    TrackField(field, PushElement(e));
   }
 
   template<typename T> void AddElement(voffset_t field, T e) {
-    auto off = PushElement(e);
-    TrackField(field, off);
+    TrackField(field, PushElement(e));
   }
 
   template<typename T> void AddOffset(voffset_t field, Offset<T> off) {
@@ -324,8 +324,9 @@
     // Align to ensure GetSize() below is correct.
     Align(sizeof(uoffset_t));
     // Offset must refer to something already in buffer.
-    FLATBUFFERS_ASSERT(off && off <= GetSize());
-    return GetSize() - off + static_cast<uoffset_t>(sizeof(uoffset_t));
+    const uoffset_t size = GetSize();
+    FLATBUFFERS_ASSERT(off && off <= size);
+    return size - off + static_cast<uoffset_t>(sizeof(uoffset_t));
   }
 
   void NotNested() {
diff --git a/include/flatbuffers/vector_downward.h b/include/flatbuffers/vector_downward.h
index 8088b10..3391391 100644
--- a/include/flatbuffers/vector_downward.h
+++ b/include/flatbuffers/vector_downward.h
@@ -38,6 +38,7 @@
         initial_size_(initial_size),
         buffer_minalign_(buffer_minalign),
         reserved_(0),
+        size_(0),
         buf_(nullptr),
         cur_(nullptr),
         scratch_(nullptr) {}
@@ -49,6 +50,7 @@
         initial_size_(other.initial_size_),
         buffer_minalign_(other.buffer_minalign_),
         reserved_(other.reserved_),
+        size_(other.size_),
         buf_(other.buf_),
         cur_(other.cur_),
         scratch_(other.scratch_) {
@@ -86,6 +88,7 @@
       reserved_ = 0;
       cur_ = nullptr;
     }
+    size_ = 0;
     clear_scratch();
   }
 
@@ -139,17 +142,18 @@
   }
 
   inline uint8_t *make_space(size_t len) {
-    size_t space = ensure_space(len);
-    cur_ -= space;
+    if (len) {
+      ensure_space(len);
+      cur_ -= len;
+      size_ += static_cast<uoffset_t>(len);
+    }
     return cur_;
   }
 
   // Returns nullptr if using the DefaultAllocator.
   Allocator *get_custom_allocator() { return allocator_; }
 
-  uoffset_t size() const {
-    return static_cast<uoffset_t>(reserved_ - static_cast<size_t>(cur_ - buf_));
-  }
+  inline uoffset_t size() const { return size_; }
 
   uoffset_t scratch_size() const {
     return static_cast<uoffset_t>(scratch_ - buf_);
@@ -203,7 +207,11 @@
     memset(make_space(zero_pad_bytes), 0, zero_pad_bytes);
   }
 
-  void pop(size_t bytes_to_remove) { cur_ += bytes_to_remove; }
+  void pop(size_t bytes_to_remove) {
+    cur_ += bytes_to_remove;
+    size_ -= static_cast<uoffset_t>(bytes_to_remove);
+  }
+
   void scratch_pop(size_t bytes_to_remove) { scratch_ -= bytes_to_remove; }
 
   void swap(vector_downward &other) {
@@ -213,6 +221,7 @@
     swap(initial_size_, other.initial_size_);
     swap(buffer_minalign_, other.buffer_minalign_);
     swap(reserved_, other.reserved_);
+    swap(size_, other.size_);
     swap(buf_, other.buf_);
     swap(cur_, other.cur_);
     swap(scratch_, other.scratch_);
@@ -234,6 +243,7 @@
   size_t initial_size_;
   size_t buffer_minalign_;
   size_t reserved_;
+  uoffset_t size_;
   uint8_t *buf_;
   uint8_t *cur_;  // Points at location between empty (below) and used (above).
   uint8_t *scratch_;  // Points to the end of the scratchpad in use.