// src/hotspot/cpu/sparc/memset_with_concurrent_readers_sparc.cpp - platform/libcore - Git at Google

 /*
  * Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 only, as
  * published by the Free Software Foundation.
  *
  * This code is distributed in the hope that it will be useful, but WITHOUT
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  * version 2 for more details (a copy is included in the LICENSE file that
  * accompanied this code).
  *
  * You should have received a copy of the GNU General Public License version
  * 2 along with this work; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  *
  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  * or visit www.oracle.com if you need additional information or have any
  * questions.
  *
  */

 #include "precompiled.hpp"

 #include "asm/macroAssembler.inline.hpp"
 #include "gc/shared/memset_with_concurrent_readers.hpp"
 #include "runtime/prefetch.inline.hpp"
 #include "utilities/align.hpp"
 #include "utilities/debug.hpp"
 #include "utilities/globalDefinitions.hpp"
 #include "utilities/macros.hpp"

 // An implementation of memset, for use when there may be concurrent
 // readers of the region being stored into.
 //
 // We can't use the standard library memset if it is implemented using
 // block initializing stores.  Doing so can result in concurrent readers
 // seeing spurious zeros.
 //
 // We can't use the obvious C/C++ for-loop, because the compiler may
 // recognize the idiomatic loop and optimize it into a call to the
 // standard library memset; we've seen exactly this happen with, for
 // example, Solaris Studio 12.3.  Hence the use of inline assembly
 // code, hiding loops from the compiler's optimizer.
 //
 // We don't attempt to use the standard library memset when it is safe
 // to do so.  We could conservatively do so by detecting the presence
 // of block initializing stores (VM_Version::has_blk_init()), but the
 // implementation provided here should be sufficient.

 // Stores 'value' (as a single byte) into every byte of [start, end), where
 // the range is shorter than one word.
 //
 // Parameters:
 //   start - address of the first byte to fill.
 //   end   - address one past the last byte to fill; asserted to be fewer
 //           than BytesPerWord (8) bytes beyond start, so 0..7 bytes are
 //           written.
 //   value - fill value, written with byte stores (stb).
 //
 // No loop is used.  A jump offset is computed from (end - start) and used
 // to enter a run of seven byte stores part-way through, so that exactly
 // the final (end - start) stores execute.  The stores address memory
 // relative to 'end', which is why the run is entered late rather than
 // left early.
 inline void fill_subword(void* start, void* end, int value) {
   STATIC_ASSERT(BytesPerWord == 8);
   assert(pointer_delta(end, start, 1) < (size_t)BytesPerWord, "precondition");
   // Dispatch on (end - start).
   // 'pc' is a scratch variable that receives the address read by "rd %pc".
   void* pc;
   __asm__ volatile(
     // offset := (7 - (end - start)) + 3, counted in instructions
     //   3 instructions from rdpc to DISPATCH
     // Worked examples (offsets in bytes, relative to the rd instruction):
     //   end - start == 7: offset = -28 + 40 = 12, i.e. the first stb;
     //   end - start == 0: offset =   0 + 40 = 40, i.e. just past the last stb.
     " sub %[offset], %[end], %[offset]\n\t" // offset := start - end
     " sllx %[offset], 2, %[offset]\n\t" // scale offset for instruction size of 4
     " add %[offset], 40, %[offset]\n\t" // offset += 10 * instruction size
     " rd %%pc, %[pc]\n\t"               // dispatch on scaled offset
     " jmpl %[pc]+%[offset], %%g0\n\t"
     "  nop\n\t"
     // DISPATCH: no direct reference, but without it the store block may be elided.
     // Exactly seven stores: one for each possible byte of a sub-word range.
     "1:\n\t"
     " stb %[value], [%[end]-7]\n\t" // end[-7] = value
     " stb %[value], [%[end]-6]\n\t"
     " stb %[value], [%[end]-5]\n\t"
     " stb %[value], [%[end]-4]\n\t"
     " stb %[value], [%[end]-3]\n\t"
     " stb %[value], [%[end]-2]\n\t"
     " stb %[value], [%[end]-1]\n\t" // end[-1] = value
     : /* only temporaries/overwritten outputs */
       [pc] "=&r" (pc),               // temp
       [offset] "+&r" (start)         // in: start; reused as the jump offset
     : [end] "r" (end),
       [value] "r" (value)
     : "memory");                     // the stores are not expressed as output operands
 }

 // Fills the 'size' bytes starting at 'to' with the byte 'value', using byte
 // (stb) and 64-bit word (stx) stores issued from inline assembly.
 //
 // Parameters:
 //   to    - start of the region to fill.
 //   value - fill value; its low 8 bits are replicated to form the fill word.
 //   size  - number of bytes to fill.
 //
 // The region is handled in up to three pieces:
 //   1. a partial-word prefix, from 'to' up to the next word boundary;
 //   2. the whole words between the aligned start and the aligned end;
 //   3. a partial-word suffix, from the aligned end up to 'to + size'.
 // Pieces 1 and 2 are skipped when size < BytesPerWord; the whole region is
 // then written by the final fill_subword call.
 void memset_with_concurrent_readers(void* to, int value, size_t size) {
   // Request a write prefetch at offset 0 from 'to'.
   Prefetch::write(to, 0);
   void* end = static_cast<char*>(to) + size;
   if (size >= (size_t)BytesPerWord) {
     // Fill any partial word prefix.
     uintx* aligned_to = static_cast<uintx*>(align_up(to, BytesPerWord));
     fill_subword(to, aligned_to, value);

     // Compute fill word.
     // The low byte of value is doubled three times (1 -> 2 -> 4 -> 8 bytes)
     // so that every byte of the 64-bit word holds it.
     STATIC_ASSERT(BitsPerByte == 8);
     STATIC_ASSERT(BitsPerWord == 64);
     uintx xvalue = value & 0xff;
     xvalue |= (xvalue << 8);
     xvalue |= (xvalue << 16);
     xvalue |= (xvalue << 32);

     uintx* aligned_end = static_cast<uintx*>(align_down(end, BytesPerWord));
     assert(aligned_to <= aligned_end, "invariant");

     // The assembly below is the hand-written equivalent of:
     // for ( ; aligned_to < aligned_end; ++aligned_to) {
     //   *aligned_to = xvalue;
     // }
     // It runs an 8-words-per-iteration loop while more than 7 words remain,
     // then jumps into a run of seven word stores to write the last 0..7 words.
     uintptr_t temp;
     __asm__ volatile(
       // Unroll loop x8.
       // temp first holds the byte distance still to fill; the delay slot of
       // the "ba" then reuses it as the loop limit.
       " sub %[aend], %[ato], %[temp]\n\t"
       " cmp %[temp], 56\n\t"           // cc := (aligned_end - aligned_to) > 7 words
       " ba %%xcc, 2f\n\t"              // goto TEST always
       "  sub %[aend], 56, %[temp]\n\t" // limit := aligned_end - 7 words
       // LOOP:
       // Entered only through the branch at TEST, whose delay slot has already
       // advanced aligned_to by 64; hence the negative store offsets.
       "1:\n\t"                         // unrolled x8 store loop top
       " cmp %[temp], %[ato]\n\t"       // cc := limit > (next) aligned_to
       " stx %[xvalue], [%[ato]-64]\n\t" // store 8 words, aligned_to pre-incremented
       " stx %[xvalue], [%[ato]-56]\n\t"
       " stx %[xvalue], [%[ato]-48]\n\t"
       " stx %[xvalue], [%[ato]-40]\n\t"
       " stx %[xvalue], [%[ato]-32]\n\t"
       " stx %[xvalue], [%[ato]-24]\n\t"
       " stx %[xvalue], [%[ato]-16]\n\t"
       " stx %[xvalue], [%[ato]-8]\n\t"
       // TEST:
       // The ",a" (annul) bit means the delay-slot add runs only when the
       // branch is taken, so on fall-through aligned_to still addresses the
       // first word that has not been stored.
       "2:\n\t"
       " bgu,a %%xcc, 1b\n\t"           // goto LOOP if more than 7 words remaining
       "  add %[ato], 64, %[ato]\n\t"   // aligned_to += 8, for next iteration
       // Fill remaining < 8 full words.
       // Dispatch on (aligned_end - aligned_to).
       // offset := (7 - (aligned_end - aligned_to)) + 3
       //   3 instructions from rdpc to DISPATCH
       // aligned_to - aligned_end is a byte count (zero or a negative multiple
       // of 8).  Each stx below fills 8 bytes but occupies 4 bytes of code, so
       // an arithmetic shift right by 1 turns bytes-to-fill into
       // bytes-of-instructions.
       " sub %[ato], %[aend], %[ato]\n\t" // offset := aligned_to - aligned_end
       " srax %[ato], 1, %[ato]\n\t"      // scale offset for instruction size of 4
       " add %[ato], 40, %[ato]\n\t"      // offset += 10 * instruction size
       " rd %%pc, %[temp]\n\t"            // dispatch on scaled offset
       " jmpl %[temp]+%[ato], %%g0\n\t"
       "  nop\n\t"
       // DISPATCH: no direct reference, but without it the store block may be elided.
       "3:\n\t"
       " stx %[xvalue], [%[aend]-56]\n\t" // aligned_end[-7] = xvalue
       " stx %[xvalue], [%[aend]-48]\n\t"
       " stx %[xvalue], [%[aend]-40]\n\t"
       " stx %[xvalue], [%[aend]-32]\n\t"
       " stx %[xvalue], [%[aend]-24]\n\t"
       " stx %[xvalue], [%[aend]-16]\n\t"
       " stx %[xvalue], [%[aend]-8]\n\t"  // aligned_end[-1] = xvalue
       : /* only temporaries/overwritten outputs */
         [temp] "=&r" (temp),           // byte distance, then loop limit, then pc
         [ato] "+&r" (aligned_to)       // in: first word to fill; reused as jump offset
       : [aend] "r" (aligned_end),
         [xvalue] "r" (xvalue)
       : "cc", "memory");               // cmp sets condition codes; stx writes memory
     to = aligned_end;           // setup for suffix
   }
   // Fill any partial word suffix.  Also the prefix if size < BytesPerWord.
   fill_subword(to, end, value);
 }
	/*
	* Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
	* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
	*
	* This code is free software; you can redistribute it and/or modify it
	* under the terms of the GNU General Public License version 2 only, as
	* published by the Free Software Foundation.
	*
	* This code is distributed in the hope that it will be useful, but WITHOUT
	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	* version 2 for more details (a copy is included in the LICENSE file that
	* accompanied this code).
	*
	* You should have received a copy of the GNU General Public License version
	* 2 along with this work; if not, write to the Free Software Foundation,
	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
	*
	* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
	* or visit www.oracle.com if you need additional information or have any
	* questions.
	*
	*/

	#include "precompiled.hpp"

	#include "asm/macroAssembler.inline.hpp"
	#include "gc/shared/memset_with_concurrent_readers.hpp"
	#include "runtime/prefetch.inline.hpp"
	#include "utilities/align.hpp"
	#include "utilities/debug.hpp"
	#include "utilities/globalDefinitions.hpp"
	#include "utilities/macros.hpp"

	// An implementation of memset, for use when there may be concurrent
	// readers of the region being stored into.
	//
	// We can't use the standard library memset if it is implemented using
	// block initializing stores. Doing so can result in concurrent readers
	// seeing spurious zeros.
	//
	// We can't use the obvious C/C++ for-loop, because the compiler may
	// recognize the idiomatic loop and optimize it into a call to the
	// standard library memset; we've seen exactly this happen with, for
	// example, Solaris Studio 12.3. Hence the use of inline assembly
	// code, hiding loops from the compiler's optimizer.
	//
	// We don't attempt to use the standard library memset when it is safe
	// to do so. We could conservatively do so by detecting the presence
	// of block initializing stores (VM_Version::has_blk_init()), but the
	// implementation provided here should be sufficient.

	// Stores 'value' (as a single byte) into every byte of [start, end), where
	// the range is shorter than one word.
	//
	// Parameters:
	//   start - address of the first byte to fill.
	//   end   - address one past the last byte to fill; asserted to be fewer
	//           than BytesPerWord (8) bytes beyond start, so 0..7 bytes are
	//           written.
	//   value - fill value, written with byte stores (stb).
	//
	// No loop is used. A jump offset is computed from (end - start) and used
	// to enter a run of seven byte stores part-way through, so that exactly
	// the final (end - start) stores execute. The stores address memory
	// relative to 'end', which is why the run is entered late rather than
	// left early.
	inline void fill_subword(void* start, void* end, int value) {
	STATIC_ASSERT(BytesPerWord == 8);
	assert(pointer_delta(end, start, 1) < (size_t)BytesPerWord, "precondition");
	// Dispatch on (end - start).
	// 'pc' is a scratch variable that receives the address read by "rd %pc".
	void* pc;
	__asm__ volatile(
	// offset := (7 - (end - start)) + 3, counted in instructions
	// 3 instructions from rdpc to DISPATCH
	// Worked examples (offsets in bytes, relative to the rd instruction):
	//   end - start == 7: offset = -28 + 40 = 12, i.e. the first stb;
	//   end - start == 0: offset =   0 + 40 = 40, i.e. just past the last stb.
	" sub %[offset], %[end], %[offset]\n\t" // offset := start - end
	" sllx %[offset], 2, %[offset]\n\t" // scale offset for instruction size of 4
	" add %[offset], 40, %[offset]\n\t" // offset += 10 * instruction size
	" rd %%pc, %[pc]\n\t" // dispatch on scaled offset
	" jmpl %[pc]+%[offset], %%g0\n\t"
	" nop\n\t"
	// DISPATCH: no direct reference, but without it the store block may be elided.
	// Exactly seven stores: one for each possible byte of a sub-word range.
	"1:\n\t"
	" stb %[value], [%[end]-7]\n\t" // end[-7] = value
	" stb %[value], [%[end]-6]\n\t"
	" stb %[value], [%[end]-5]\n\t"
	" stb %[value], [%[end]-4]\n\t"
	" stb %[value], [%[end]-3]\n\t"
	" stb %[value], [%[end]-2]\n\t"
	" stb %[value], [%[end]-1]\n\t" // end[-1] = value
	: /* only temporaries/overwritten outputs */
	[pc] "=&r" (pc), // temp
	[offset] "+&r" (start) // in: start; reused as the jump offset
	: [end] "r" (end),
	[value] "r" (value)
	: "memory"); // the stores are not expressed as output operands
	}

	// Fills the 'size' bytes starting at 'to' with the byte 'value', using byte
	// (stb) and 64-bit word (stx) stores issued from inline assembly.
	//
	// Parameters:
	//   to    - start of the region to fill.
	//   value - fill value; its low 8 bits are replicated to form the fill word.
	//   size  - number of bytes to fill.
	//
	// The region is handled in up to three pieces:
	//   1. a partial-word prefix, from 'to' up to the next word boundary;
	//   2. the whole words between the aligned start and the aligned end;
	//   3. a partial-word suffix, from the aligned end up to 'to + size'.
	// Pieces 1 and 2 are skipped when size < BytesPerWord; the whole region is
	// then written by the final fill_subword call.
	void memset_with_concurrent_readers(void* to, int value, size_t size) {
	  // Request a write prefetch at offset 0 from 'to'.
	  Prefetch::write(to, 0);
	  void* end = static_cast<char*>(to) + size;
	  if (size >= (size_t)BytesPerWord) {
	    // Fill any partial word prefix.
	    uintx* aligned_to = static_cast<uintx*>(align_up(to, BytesPerWord));
	    fill_subword(to, aligned_to, value);

	    // Compute fill word.
	    // The low byte of value is doubled three times (1 -> 2 -> 4 -> 8 bytes)
	    // so that every byte of the 64-bit word holds it.
	    STATIC_ASSERT(BitsPerByte == 8);
	    STATIC_ASSERT(BitsPerWord == 64);
	    uintx xvalue = value & 0xff;
	    xvalue |= (xvalue << 8);
	    xvalue |= (xvalue << 16);
	    xvalue |= (xvalue << 32);

	    uintx* aligned_end = static_cast<uintx*>(align_down(end, BytesPerWord));
	    assert(aligned_to <= aligned_end, "invariant");

	    // The assembly below is the hand-written equivalent of:
	    // for ( ; aligned_to < aligned_end; ++aligned_to) {
	    //   *aligned_to = xvalue;
	    // }
	    // It runs an 8-words-per-iteration loop while more than 7 words remain,
	    // then jumps into a run of seven word stores to write the last 0..7 words.
	    uintptr_t temp;
	    __asm__ volatile(
	      // Unroll loop x8.
	      // temp first holds the byte distance still to fill; the delay slot of
	      // the "ba" then reuses it as the loop limit.
	      " sub %[aend], %[ato], %[temp]\n\t"
	      " cmp %[temp], 56\n\t" // cc := (aligned_end - aligned_to) > 7 words
	      " ba %%xcc, 2f\n\t" // goto TEST always
	      " sub %[aend], 56, %[temp]\n\t" // (delay slot) limit := aligned_end - 7 words
	      // LOOP:
	      // Entered only through the branch at TEST, whose delay slot has already
	      // advanced aligned_to by 64; hence the negative store offsets.
	      "1:\n\t" // unrolled x8 store loop top
	      " cmp %[temp], %[ato]\n\t" // cc := limit > (next) aligned_to
	      " stx %[xvalue], [%[ato]-64]\n\t" // store 8 words, aligned_to pre-incremented
	      " stx %[xvalue], [%[ato]-56]\n\t"
	      " stx %[xvalue], [%[ato]-48]\n\t"
	      " stx %[xvalue], [%[ato]-40]\n\t"
	      " stx %[xvalue], [%[ato]-32]\n\t"
	      " stx %[xvalue], [%[ato]-24]\n\t"
	      " stx %[xvalue], [%[ato]-16]\n\t"
	      " stx %[xvalue], [%[ato]-8]\n\t"
	      // TEST:
	      // The ",a" (annul) bit means the delay-slot add runs only when the
	      // branch is taken, so on fall-through aligned_to still addresses the
	      // first word that has not been stored.
	      "2:\n\t"
	      " bgu,a %%xcc, 1b\n\t" // goto LOOP if more than 7 words remaining
	      " add %[ato], 64, %[ato]\n\t" // (delay slot) aligned_to += 8, for next iteration
	      // Fill remaining < 8 full words.
	      // Dispatch on (aligned_end - aligned_to).
	      // offset := (7 - (aligned_end - aligned_to)) + 3
	      //   3 instructions from rdpc to DISPATCH
	      // aligned_to - aligned_end is a byte count (zero or a negative multiple
	      // of 8). Each stx below fills 8 bytes but occupies 4 bytes of code, so
	      // an arithmetic shift right by 1 turns bytes-to-fill into
	      // bytes-of-instructions.
	      " sub %[ato], %[aend], %[ato]\n\t" // offset := aligned_to - aligned_end
	      " srax %[ato], 1, %[ato]\n\t" // scale offset for instruction size of 4
	      " add %[ato], 40, %[ato]\n\t" // offset += 10 * instruction size
	      " rd %%pc, %[temp]\n\t" // dispatch on scaled offset
	      " jmpl %[temp]+%[ato], %%g0\n\t"
	      " nop\n\t"
	      // DISPATCH: no direct reference, but without it the store block may be elided.
	      "3:\n\t"
	      " stx %[xvalue], [%[aend]-56]\n\t" // aligned_end[-7] = xvalue
	      " stx %[xvalue], [%[aend]-48]\n\t"
	      " stx %[xvalue], [%[aend]-40]\n\t"
	      " stx %[xvalue], [%[aend]-32]\n\t"
	      " stx %[xvalue], [%[aend]-24]\n\t"
	      " stx %[xvalue], [%[aend]-16]\n\t"
	      " stx %[xvalue], [%[aend]-8]\n\t" // aligned_end[-1] = xvalue
	      : /* only temporaries/overwritten outputs */
	        [temp] "=&r" (temp), // byte distance, then loop limit, then pc
	        [ato] "+&r" (aligned_to) // in: first word to fill; reused as jump offset
	      : [aend] "r" (aligned_end),
	        [xvalue] "r" (xvalue)
	      : "cc", "memory"); // cmp sets condition codes; stx writes memory
	    to = aligned_end; // setup for suffix
	  }
	  // Fill any partial word suffix. Also the prefix if size < BytesPerWord.
	  fill_subword(to, end, value);
	}