Adding iostream.h thread-safety documentation. (#2995)

* Adding iostream.h thread-safety documentation.

* Restoring `TestThread` code with added `std::lock_guard<std::mutex>`.

* Updating new comments to reflect new information.

* Fixing up `git rebase -X theirs` accidents.
diff --git a/docs/advanced/pycpp/utilities.rst b/docs/advanced/pycpp/utilities.rst
index c15051f..8b8e519 100644
--- a/docs/advanced/pycpp/utilities.rst
+++ b/docs/advanced/pycpp/utilities.rst
@@ -47,6 +47,17 @@
         call_noisy_func();
     });
 
+.. warning::
+
+    The implementation in ``pybind11/iostream.h`` is NOT thread safe. Multiple
+    threads writing to a redirected ostream concurrently cause data races
+    and potentially buffer overflows. Therefore it is currrently a requirement
+    that all (possibly) concurrent redirected ostream writes are protected by
+    a mutex. #HelpAppreciated: Work on iostream.h thread safety. For more
+    background see the discussions under
+    `PR #2982 <https://github.com/pybind/pybind11/pull/2982>`_ and
+    `PR #2995 <https://github.com/pybind/pybind11/pull/2995>`_.
+
 This method respects flushes on the output streams and will flush if needed
 when the scoped guard is destroyed. This allows the output to be redirected in
 real time, such as to a Jupyter notebook. The two arguments, the C++ stream and
diff --git a/include/pybind11/iostream.h b/include/pybind11/iostream.h
index 89d1813..c98067a 100644
--- a/include/pybind11/iostream.h
+++ b/include/pybind11/iostream.h
@@ -5,6 +5,16 @@
 
     All rights reserved. Use of this source code is governed by a
     BSD-style license that can be found in the LICENSE file.
+
+    WARNING: The implementation in this file is NOT thread safe. Multiple
+    threads writing to a redirected ostream concurrently cause data races
+    and potentially buffer overflows. Therefore it is currrently a requirement
+    that all (possibly) concurrent redirected ostream writes are protected by
+    a mutex.
+    #HelpAppreciated: Work on iostream.h thread safety.
+    For more background see the discussions under
+    https://github.com/pybind/pybind11/pull/2982 and
+    https://github.com/pybind/pybind11/pull/2995.
 */
 
 #pragma once
@@ -85,30 +95,25 @@
         return remainder;
     }
 
-    // This function must be non-virtual to be called in a destructor. If the
-    // rare MSVC test failure shows up with this version, then this should be
-    // simplified to a fully qualified call.
+    // This function must be non-virtual to be called in a destructor.
     int _sync() {
         if (pbase() != pptr()) { // If buffer is not empty
             gil_scoped_acquire tmp;
-            // Placed inside gil_scoped_acquire as a mutex to avoid a race.
-            if (pbase() != pptr()) { // Check again under the lock
-                // This subtraction cannot be negative, so dropping the sign.
-                auto size        = static_cast<size_t>(pptr() - pbase());
-                size_t remainder = utf8_remainder();
+            // This subtraction cannot be negative, so dropping the sign.
+            auto size        = static_cast<size_t>(pptr() - pbase());
+            size_t remainder = utf8_remainder();
 
-                if (size > remainder) {
-                    str line(pbase(), size - remainder);
-                    pywrite(line);
-                    pyflush();
-                }
-
-                // Copy the remainder at the end of the buffer to the beginning:
-                if (remainder > 0)
-                    std::memmove(pbase(), pptr() - remainder, remainder);
-                setp(pbase(), epptr());
-                pbump(static_cast<int>(remainder));
+            if (size > remainder) {
+                str line(pbase(), size - remainder);
+                pywrite(line);
+                pyflush();
             }
+
+            // Copy the remainder at the end of the buffer to the beginning:
+            if (remainder > 0)
+                std::memmove(pbase(), pptr() - remainder, remainder);
+            setp(pbase(), epptr());
+            pbump(static_cast<int>(remainder));
         }
         return 0;
     }
diff --git a/tests/test_iostream.cpp b/tests/test_iostream.cpp
index 9087880..c620b59 100644
--- a/tests/test_iostream.cpp
+++ b/tests/test_iostream.cpp
@@ -15,6 +15,8 @@
 #include "pybind11_tests.h"
 #include <atomic>
 #include <iostream>
+#include <mutex>
+#include <string>
 #include <thread>
 
 void noisy_function(const std::string &msg, bool flush) {
@@ -35,8 +37,18 @@
 struct TestThread {
     TestThread() : stop_{false} {
         auto thread_f = [this] {
+            static std::mutex cout_mutex;
             while (!stop_) {
-                std::cout << "x" << std::flush;
+                {
+                    // #HelpAppreciated: Work on iostream.h thread safety.
+                    // Without this lock, the clang ThreadSanitizer (tsan) reliably reports a
+                    // data race, and this test is predictably flakey on Windows.
+                    // For more background see the discussion under
+                    // https://github.com/pybind/pybind11/pull/2982 and
+                    // https://github.com/pybind/pybind11/pull/2995.
+                    const std::lock_guard<std::mutex> lock(cout_mutex);
+                    std::cout << "x" << std::flush;
+                }
                 std::this_thread::sleep_for(std::chrono::microseconds(50));
             } };
         t_ = new std::thread(std::move(thread_f));