starlark/profile.go - platform/external/starlark-go - Git at Google

 // Copyright 2019 The Bazel Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package starlark

 // This file defines a simple execution-time profiler for Starlark.
 // It measures the wall time spent executing Starlark code, and emits a
 // gzipped protocol message in pprof format (github.com/google/pprof).
 //
 // When profiling is enabled, the interpreter calls the profiler to
 // indicate the start and end of each "span" or time interval. A leaf
 // function (whether Go or Starlark) has a single span. A function that
 // calls another function has spans for each interval in which it is the
 // top of the stack. (A LOAD instruction also ends a span.)
 //
 // At the start of a span, the interpreter records the current time in
 // the thread's topmost frame. At the end of the span, it obtains the
 // time again and subtracts the span start time. The difference is added
 // to an accumulator variable in the thread. If the accumulator exceeds
 // some fixed quantum (10ms, say), the profiler records the current call
 // stack and sends it to the profiler goroutine, along with the number
 // of quanta, which are subtracted. For example, if the accumulator
 // holds 3ms and then a completed span adds 25ms to it, its value is 28ms,
 // which exceeeds 10ms. The profiler records a stack with the value 20ms
 // (2 quanta), and the accumulator is left with 8ms.
 //
 // The profiler goroutine converts the stacks into the pprof format and
 // emits a gzip-compressed protocol message to the designated output
 // file. We use a hand-written streaming proto encoder to avoid
 // dependencies on pprof and proto, and to avoid the need to
 // materialize the profile data structure in memory.
 //
 // A limitation of this profiler is that it measures wall time, which
 // does not necessarily correspond to CPU time. A CPU profiler requires
 // that only running (not runnable) threads are sampled; this is
 // commonly achieved by having the kernel deliver a (PROF) signal to an
 // arbitrary running thread, through setitimer(2). The CPU profiler in the
 // Go runtime uses this mechanism, but it is not possible for a Go
 // application to register a SIGPROF handler, nor is it possible for a
 // Go handler for some other signal to read the stack pointer of
 // the interrupted thread.
 //
 // Two caveats:
 // (1) it is tempting to send the leaf Frame directly to the profiler
 // goroutine instead of making a copy of the stack, since a Frame is a
 // spaghetti stack--a linked list. However, as soon as execution
 // resumes, the stack's Frame.pc values may be mutated, so Frames are
 // not safe to share with the asynchronous profiler goroutine.
 // (2) it is tempting to use Callables as keys in a map when tabulating
 // the pprof protocols's Function entities. However, we cannot assume
 // that Callables are valid map keys, and furthermore we must not
 // pin function values in memory indefinitely as this may cause lambda
 // values to keep their free variables live much longer than necessary.

 // TODO(adonovan):
 // - make Start/Stop fully thread-safe.
 // - fix the pc hack.
 // - experiment with other values of quantum.

 import (
 	"bufio"
 	"bytes"
 	"compress/gzip"
 	"encoding/binary"
 	"fmt"
 	"io"
 	"log"
 	"reflect"
 	"sync/atomic"
 	"time"
 	"unsafe"

 	"go.starlark.net/syntax"
 )

 // StartProfile enables time profiling of all Starlark threads,
 // and writes a profile in pprof format to w.
 // It must be followed by a call to StopProfiler to stop
 // the profiler and finalize the profile.
 //
 // StartProfile returns an error if profiling was already enabled.
 //
 // StartProfile must not be called concurrently with Starlark execution.
 func StartProfile(w io.Writer) error {
 	if !atomic.CompareAndSwapUint32(&profiler.on, 0, 1) {
 		return fmt.Errorf("profiler already running")
 	}

 	// TODO(adonovan): make the API fully concurrency-safe.
 	// The main challenge is racy reads/writes of profiler.events,
 	// and of send/close races on the channel it refers to.
 	// It's easy to solve them with a mutex but harder to do
 	// it efficiently.

 	profiler.events = make(chan *profEvent, 1)
 	profiler.done = make(chan error)

 	go profile(w)

 	return nil
 }

 // StopProfiler stops the profiler started by a prior call to
 // StartProfile and finalizes the profile. It returns an error if the
 // profile could not be completed.
 //
 // StopProfiler must not be called concurrently with Starlark execution.
 func StopProfile() error {
 	// Terminate the profiler goroutine and get its result.
 	close(profiler.events)
 	err := <-profiler.done

 	profiler.done = nil
 	profiler.events = nil
 	atomic.StoreUint32(&profiler.on, 0)

 	return err
 }

 // globals
 var profiler struct {
 	on     uint32          // nonzero => profiler running
 	events chan *profEvent // profile events from interpreter threads
 	done   chan error      // indicates profiler goroutine is ready
 }

 func (thread *Thread) beginProfSpan() {
 	if profiler.events == nil {
 		return // profiling not enabled
 	}

 	thread.frameAt(0).spanStart = nanotime()
 }

 // TODO(adonovan): experiment with smaller values,
 // which trade space and time for greater precision.
 const quantum = 10 * time.Millisecond

 func (thread *Thread) endProfSpan() {
 	if profiler.events == nil {
 		return // profiling not enabled
 	}

 	// Add the span to the thread's accumulator.
 	thread.proftime += time.Duration(nanotime() - thread.frameAt(0).spanStart)
 	if thread.proftime < quantum {
 		return
 	}

 	// Only record complete quanta.
 	n := thread.proftime / quantum
 	thread.proftime -= n * quantum

 	// Copy the stack.
 	// (We can't save thread.frame because its pc will change.)
 	ev := &profEvent{
 		thread: thread,
 		time:   n * quantum,
 	}
 	ev.stack = ev.stackSpace[:0]
 	for i := range thread.stack {
 		fr := thread.frameAt(i)
 		ev.stack = append(ev.stack, profFrame{
 			pos: fr.Position(),
 			fn:  fr.Callable(),
 			pc:  fr.pc,
 		})
 	}

 	profiler.events <- ev
 }

 type profEvent struct {
 	thread     *Thread // currently unused
 	time       time.Duration
 	stack      []profFrame
 	stackSpace [8]profFrame // initial space for stack
 }

 type profFrame struct {
 	fn  Callable        // don't hold this live for too long (prevents GC of lambdas)
 	pc  uint32          // program counter (Starlark frames only)
 	pos syntax.Position // position of pc within this frame
 }

 // profile is the profiler goroutine.
 // It runs until StopProfiler is called.
 func profile(w io.Writer) {
 	// Field numbers from pprof protocol.
 	// See https://github.com/google/pprof/blob/master/proto/profile.proto
 	const (
 		Profile_sample_type    = 1  // repeated ValueType
 		Profile_sample         = 2  // repeated Sample
 		Profile_mapping        = 3  // repeated Mapping
 		Profile_location       = 4  // repeated Location
 		Profile_function       = 5  // repeated Function
 		Profile_string_table   = 6  // repeated string
 		Profile_time_nanos     = 9  // int64
 		Profile_duration_nanos = 10 // int64
 		Profile_period_type    = 11 // ValueType
 		Profile_period         = 12 // int64

 		ValueType_type = 1 // int64
 		ValueType_unit = 2 // int64

 		Sample_location_id = 1 // repeated uint64
 		Sample_value       = 2 // repeated int64
 		Sample_label       = 3 // repeated Label

 		Label_key      = 1 // int64
 		Label_str      = 2 // int64
 		Label_num      = 3 // int64
 		Label_num_unit = 4 // int64

 		Location_id         = 1 // uint64
 		Location_mapping_id = 2 // uint64
 		Location_address    = 3 // uint64
 		Location_line       = 4 // repeated Line

 		Line_function_id = 1 // uint64
 		Line_line        = 2 // int64

 		Function_id          = 1 // uint64
 		Function_name        = 2 // int64
 		Function_system_name = 3 // int64
 		Function_filename    = 4 // int64
 		Function_start_line  = 5 // int64
 	)

 	bufw := bufio.NewWriter(w) // write file in 4KB (not 240B flate-sized) chunks
 	gz := gzip.NewWriter(bufw)
 	enc := protoEncoder{w: gz}

 	// strings
 	stringIndex := make(map[string]int64)
 	str := func(s string) int64 {
 		i, ok := stringIndex[s]
 		if !ok {
 			i = int64(len(stringIndex))
 			enc.string(Profile_string_table, s)
 			stringIndex[s] = i
 		}
 		return i
 	}
 	str("") // entry 0

 	// functions
 	//
 	// function returns the ID of a Callable for use in Line.FunctionId.
 	// The ID is the same as the function's logical address,
 	// which is supplied by the caller to avoid the need to recompute it.
 	functionId := make(map[uintptr]uint64)
 	function := func(fn Callable, addr uintptr) uint64 {
 		id, ok := functionId[addr]
 		if !ok {
 			id = uint64(addr)

 			var pos syntax.Position
 			if fn, ok := fn.(callableWithPosition); ok {
 				pos = fn.Position()
 			}

 			name := fn.Name()
 			if name == "<toplevel>" {
 				name = pos.Filename()
 			}

 			nameIndex := str(name)

 			fun := new(bytes.Buffer)
 			funenc := protoEncoder{w: fun}
 			funenc.uint(Function_id, id)
 			funenc.int(Function_name, nameIndex)
 			funenc.int(Function_system_name, nameIndex)
 			funenc.int(Function_filename, str(pos.Filename()))
 			funenc.int(Function_start_line, int64(pos.Line))
 			enc.bytes(Profile_function, fun.Bytes())

 			functionId[addr] = id
 		}
 		return id
 	}

 	// locations
 	//
 	// location returns the ID of the location denoted by fr.
 	// For Starlark frames, this is the Frame pc.
 	locationId := make(map[uintptr]uint64)
 	location := func(fr profFrame) uint64 {
 		fnAddr := profFuncAddr(fr.fn)

 		// For Starlark functions, the frame position
 		// represents the current PC value.
 		// Mix it into the low bits of the address.
 		// This is super hacky and may result in collisions
 		// in large functions or if functions are numerous.
 		// TODO(adonovan): fix: try making this cleaner by treating
 		// each bytecode segment as a Profile.Mapping.
 		pcAddr := fnAddr
 		if _, ok := fr.fn.(*Function); ok {
 			pcAddr = (pcAddr << 16) ^ uintptr(fr.pc)
 		}

 		id, ok := locationId[pcAddr]
 		if !ok {
 			id = uint64(pcAddr)

 			line := new(bytes.Buffer)
 			lineenc := protoEncoder{w: line}
 			lineenc.uint(Line_function_id, function(fr.fn, fnAddr))
 			lineenc.int(Line_line, int64(fr.pos.Line))
 			loc := new(bytes.Buffer)
 			locenc := protoEncoder{w: loc}
 			locenc.uint(Location_id, id)
 			locenc.uint(Location_address, uint64(pcAddr))
 			locenc.bytes(Location_line, line.Bytes())
 			enc.bytes(Profile_location, loc.Bytes())

 			locationId[pcAddr] = id
 		}
 		return id
 	}

 	wallNanos := new(bytes.Buffer)
 	wnenc := protoEncoder{w: wallNanos}
 	wnenc.int(ValueType_type, str("wall"))
 	wnenc.int(ValueType_unit, str("nanoseconds"))

 	// informational fields of Profile
 	enc.bytes(Profile_sample_type, wallNanos.Bytes())
 	enc.int(Profile_period, quantum.Nanoseconds())     // magnitude of sampling period
 	enc.bytes(Profile_period_type, wallNanos.Bytes())  // dimension and unit of period
 	enc.int(Profile_time_nanos, time.Now().UnixNano()) // start (real) time of profile

 	startNano := nanotime()

 	// Read profile events from the channel
 	// until it is closed by StopProfiler.
 	for e := range profiler.events {
 		sample := new(bytes.Buffer)
 		sampleenc := protoEncoder{w: sample}
 		sampleenc.int(Sample_value, e.time.Nanoseconds()) // wall nanoseconds
 		for _, fr := range e.stack {
 			sampleenc.uint(Sample_location_id, location(fr))
 		}
 		enc.bytes(Profile_sample, sample.Bytes())
 	}

 	endNano := nanotime()
 	enc.int(Profile_duration_nanos, endNano-startNano)

 	err := gz.Close() // Close reports any prior write error
 	if flushErr := bufw.Flush(); err == nil {
 		err = flushErr
 	}
 	profiler.done <- err
 }

 // nanotime returns the time in nanoseconds since epoch.
 // It is implemented by runtime.nanotime using the linkname hack;
 // runtime.nanotime is defined for all OSs/ARCHS and uses the
 // monotonic system clock, which there is no portable way to access.
 // Should that function ever go away, these alternatives exist:
 //
 // 	// POSIX only. REALTIME not MONOTONIC. 17ns.
 // 	var tv syscall.Timeval
 // 	syscall.Gettimeofday(&tv) // can't fail
 // 	return tv.Nano()
 //
 // 	// Portable. REALTIME not MONOTONIC. 46ns.
 // 	return time.Now().Nanoseconds()
 //
 //      // POSIX only. Adds a dependency.
 //	import "golang.org/x/sys/unix"
 //	var ts unix.Timespec
 // 	unix.ClockGettime(CLOCK_MONOTONIC, &ts) // can't fail
 //	return unix.TimespecToNsec(ts)
 //
 //go:linkname nanotime runtime.nanotime
 func nanotime() int64

 // profFuncAddr returns the canonical "address"
 // of a Callable for use by the profiler.
 func profFuncAddr(fn Callable) uintptr {
 	switch fn := fn.(type) {
 	case *Builtin:
 		return reflect.ValueOf(fn.fn).Pointer()
 	case *Function:
 		return uintptr(unsafe.Pointer(fn.funcode))
 	}

 	// User-defined callable types are typically of
 	// of kind pointer-to-struct. Handle them specially.
 	if v := reflect.ValueOf(fn); v.Type().Kind() == reflect.Ptr {
 		return v.Pointer()
 	}

 	// Address zero is reserved by the protocol.
 	// Use 1 for callables we don't recognize.
 	log.Printf("Starlark profiler: no address for Callable %T", fn)
 	return 1
 }

 // We encode the protocol message by hand to avoid making
 // the interpreter depend on both github.com/google/pprof
 // and github.com/golang/protobuf.
 //
 // This also avoids the need to materialize a protocol message object
 // tree of unbounded size and serialize it all at the end.
 // The pprof format appears to have been designed to
 // permit streaming implementations such as this one.
 //
 // See https://developers.google.com/protocol-buffers/docs/encoding.
 type protoEncoder struct {
 	w   io.Writer // *bytes.Buffer or *gzip.Writer
 	tmp [binary.MaxVarintLen64]byte
 }

 func (e *protoEncoder) uvarint(x uint64) {
 	n := binary.PutUvarint(e.tmp[:], x)
 	e.w.Write(e.tmp[:n])
 }

 func (e *protoEncoder) tag(field, wire uint) {
 	e.uvarint(uint64(field<<3 | wire))
 }

 func (e *protoEncoder) string(field uint, s string) {
 	e.tag(field, 2) // length-delimited
 	e.uvarint(uint64(len(s)))
 	io.WriteString(e.w, s)
 }

 func (e *protoEncoder) bytes(field uint, b []byte) {
 	e.tag(field, 2) // length-delimited
 	e.uvarint(uint64(len(b)))
 	e.w.Write(b)
 }

 func (e *protoEncoder) uint(field uint, x uint64) {
 	e.tag(field, 0) // varint
 	e.uvarint(x)
 }

 func (e *protoEncoder) int(field uint, x int64) {
 	e.tag(field, 0) // varint
 	e.uvarint(uint64(x))
 }
	// Copyright 2019 The Bazel Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package starlark

	// This file defines a simple execution-time profiler for Starlark.
	// It measures the wall time spent executing Starlark code, and emits a
	// gzipped protocol message in pprof format (github.com/google/pprof).
	//
	// When profiling is enabled, the interpreter calls the profiler to
	// indicate the start and end of each "span" or time interval. A leaf
	// function (whether Go or Starlark) has a single span. A function that
	// calls another function has spans for each interval in which it is the
	// top of the stack. (A LOAD instruction also ends a span.)
	//
	// At the start of a span, the interpreter records the current time in
	// the thread's topmost frame. At the end of the span, it obtains the
	// time again and subtracts the span start time. The difference is added
	// to an accumulator variable in the thread. If the accumulator exceeds
	// some fixed quantum (10ms, say), the profiler records the current call
	// stack and sends it to the profiler goroutine, along with the number
	// of quanta, which are subtracted. For example, if the accumulator
	// holds 3ms and then a completed span adds 25ms to it, its value is 28ms,
	// which exceeeds 10ms. The profiler records a stack with the value 20ms
	// (2 quanta), and the accumulator is left with 8ms.
	//
	// The profiler goroutine converts the stacks into the pprof format and
	// emits a gzip-compressed protocol message to the designated output
	// file. We use a hand-written streaming proto encoder to avoid
	// dependencies on pprof and proto, and to avoid the need to
	// materialize the profile data structure in memory.
	//
	// A limitation of this profiler is that it measures wall time, which
	// does not necessarily correspond to CPU time. A CPU profiler requires
	// that only running (not runnable) threads are sampled; this is
	// commonly achieved by having the kernel deliver a (PROF) signal to an
	// arbitrary running thread, through setitimer(2). The CPU profiler in the
	// Go runtime uses this mechanism, but it is not possible for a Go
	// application to register a SIGPROF handler, nor is it possible for a
	// Go handler for some other signal to read the stack pointer of
	// the interrupted thread.
	//
	// Two caveats:
	// (1) it is tempting to send the leaf Frame directly to the profiler
	// goroutine instead of making a copy of the stack, since a Frame is a
	// spaghetti stack--a linked list. However, as soon as execution
	// resumes, the stack's Frame.pc values may be mutated, so Frames are
	// not safe to share with the asynchronous profiler goroutine.
	// (2) it is tempting to use Callables as keys in a map when tabulating
	// the pprof protocols's Function entities. However, we cannot assume
	// that Callables are valid map keys, and furthermore we must not
	// pin function values in memory indefinitely as this may cause lambda
	// values to keep their free variables live much longer than necessary.

	// TODO(adonovan):
	// - make Start/Stop fully thread-safe.
	// - fix the pc hack.
	// - experiment with other values of quantum.

	import (
	"bufio"
	"bytes"
	"compress/gzip"
	"encoding/binary"
	"fmt"
	"io"
	"log"
	"reflect"
	"sync/atomic"
	"time"
	"unsafe"

	"go.starlark.net/syntax"
	)

	// StartProfile enables time profiling of all Starlark threads,
	// and writes a profile in pprof format to w.
	// It must be followed by a call to StopProfiler to stop
	// the profiler and finalize the profile.
	//
	// StartProfile returns an error if profiling was already enabled.
	//
	// StartProfile must not be called concurrently with Starlark execution.
	func StartProfile(w io.Writer) error {
	if !atomic.CompareAndSwapUint32(&profiler.on, 0, 1) {
	return fmt.Errorf("profiler already running")
	}

	// TODO(adonovan): make the API fully concurrency-safe.
	// The main challenge is racy reads/writes of profiler.events,
	// and of send/close races on the channel it refers to.
	// It's easy to solve them with a mutex but harder to do
	// it efficiently.

	profiler.events = make(chan *profEvent, 1)
	profiler.done = make(chan error)

	go profile(w)

	return nil
	}

	// StopProfiler stops the profiler started by a prior call to
	// StartProfile and finalizes the profile. It returns an error if the
	// profile could not be completed.
	//
	// StopProfiler must not be called concurrently with Starlark execution.
	func StopProfile() error {
	// Terminate the profiler goroutine and get its result.
	close(profiler.events)
	err := <-profiler.done

	profiler.done = nil
	profiler.events = nil
	atomic.StoreUint32(&profiler.on, 0)

	return err
	}

	// globals
	var profiler struct {
	on uint32 // nonzero => profiler running
	events chan *profEvent // profile events from interpreter threads
	done chan error // indicates profiler goroutine is ready
	}

	func (thread *Thread) beginProfSpan() {
	if profiler.events == nil {
	return // profiling not enabled
	}

	thread.frameAt(0).spanStart = nanotime()
	}

	// TODO(adonovan): experiment with smaller values,
	// which trade space and time for greater precision.
	const quantum = 10 * time.Millisecond

	func (thread *Thread) endProfSpan() {
	if profiler.events == nil {
	return // profiling not enabled
	}

	// Add the span to the thread's accumulator.
	thread.proftime += time.Duration(nanotime() - thread.frameAt(0).spanStart)
	if thread.proftime < quantum {
	return
	}

	// Only record complete quanta.
	n := thread.proftime / quantum
	thread.proftime -= n * quantum

	// Copy the stack.
	// (We can't save thread.frame because its pc will change.)
	ev := &profEvent{
	thread: thread,
	time: n * quantum,
	}
	ev.stack = ev.stackSpace[:0]
	for i := range thread.stack {
	fr := thread.frameAt(i)
	ev.stack = append(ev.stack, profFrame{
	pos: fr.Position(),
	fn: fr.Callable(),
	pc: fr.pc,
	})
	}

	profiler.events <- ev
	}

	type profEvent struct {
	thread *Thread // currently unused
	time time.Duration
	stack []profFrame
	stackSpace [8]profFrame // initial space for stack
	}

	type profFrame struct {
	fn Callable // don't hold this live for too long (prevents GC of lambdas)
	pc uint32 // program counter (Starlark frames only)
	pos syntax.Position // position of pc within this frame
	}

	// profile is the profiler goroutine.
	// It runs until StopProfiler is called.
	func profile(w io.Writer) {
	// Field numbers from pprof protocol.
	// See https://github.com/google/pprof/blob/master/proto/profile.proto
	const (
	Profile_sample_type = 1 // repeated ValueType
	Profile_sample = 2 // repeated Sample
	Profile_mapping = 3 // repeated Mapping
	Profile_location = 4 // repeated Location
	Profile_function = 5 // repeated Function
	Profile_string_table = 6 // repeated string
	Profile_time_nanos = 9 // int64
	Profile_duration_nanos = 10 // int64
	Profile_period_type = 11 // ValueType
	Profile_period = 12 // int64

	ValueType_type = 1 // int64
	ValueType_unit = 2 // int64

	Sample_location_id = 1 // repeated uint64
	Sample_value = 2 // repeated int64
	Sample_label = 3 // repeated Label

	Label_key = 1 // int64
	Label_str = 2 // int64
	Label_num = 3 // int64
	Label_num_unit = 4 // int64

	Location_id = 1 // uint64
	Location_mapping_id = 2 // uint64
	Location_address = 3 // uint64
	Location_line = 4 // repeated Line

	Line_function_id = 1 // uint64
	Line_line = 2 // int64

	Function_id = 1 // uint64
	Function_name = 2 // int64
	Function_system_name = 3 // int64
	Function_filename = 4 // int64
	Function_start_line = 5 // int64
	)

	bufw := bufio.NewWriter(w) // write file in 4KB (not 240B flate-sized) chunks
	gz := gzip.NewWriter(bufw)
	enc := protoEncoder{w: gz}

	// strings
	stringIndex := make(map[string]int64)
	str := func(s string) int64 {
	i, ok := stringIndex[s]
	if !ok {
	i = int64(len(stringIndex))
	enc.string(Profile_string_table, s)
	stringIndex[s] = i
	}
	return i
	}
	str("") // entry 0

	// functions
	//
	// function returns the ID of a Callable for use in Line.FunctionId.
	// The ID is the same as the function's logical address,
	// which is supplied by the caller to avoid the need to recompute it.
	functionId := make(map[uintptr]uint64)
	function := func(fn Callable, addr uintptr) uint64 {
	id, ok := functionId[addr]
	if !ok {
	id = uint64(addr)

	var pos syntax.Position
	if fn, ok := fn.(callableWithPosition); ok {
	pos = fn.Position()
	}

	name := fn.Name()
	if name == "<toplevel>" {
	name = pos.Filename()
	}

	nameIndex := str(name)

	fun := new(bytes.Buffer)
	funenc := protoEncoder{w: fun}
	funenc.uint(Function_id, id)
	funenc.int(Function_name, nameIndex)
	funenc.int(Function_system_name, nameIndex)
	funenc.int(Function_filename, str(pos.Filename()))
	funenc.int(Function_start_line, int64(pos.Line))
	enc.bytes(Profile_function, fun.Bytes())

	functionId[addr] = id
	}
	return id
	}

	// locations
	//
	// location returns the ID of the location denoted by fr.
	// For Starlark frames, this is the Frame pc.
	locationId := make(map[uintptr]uint64)
	location := func(fr profFrame) uint64 {
	fnAddr := profFuncAddr(fr.fn)

	// For Starlark functions, the frame position
	// represents the current PC value.
	// Mix it into the low bits of the address.
	// This is super hacky and may result in collisions
	// in large functions or if functions are numerous.
	// TODO(adonovan): fix: try making this cleaner by treating
	// each bytecode segment as a Profile.Mapping.
	pcAddr := fnAddr
	if _, ok := fr.fn.(*Function); ok {
	pcAddr = (pcAddr << 16) ^ uintptr(fr.pc)
	}

	id, ok := locationId[pcAddr]
	if !ok {
	id = uint64(pcAddr)

	line := new(bytes.Buffer)
	lineenc := protoEncoder{w: line}
	lineenc.uint(Line_function_id, function(fr.fn, fnAddr))
	lineenc.int(Line_line, int64(fr.pos.Line))
	loc := new(bytes.Buffer)
	locenc := protoEncoder{w: loc}
	locenc.uint(Location_id, id)
	locenc.uint(Location_address, uint64(pcAddr))
	locenc.bytes(Location_line, line.Bytes())
	enc.bytes(Profile_location, loc.Bytes())

	locationId[pcAddr] = id
	}
	return id
	}

	wallNanos := new(bytes.Buffer)
	wnenc := protoEncoder{w: wallNanos}
	wnenc.int(ValueType_type, str("wall"))
	wnenc.int(ValueType_unit, str("nanoseconds"))

	// informational fields of Profile
	enc.bytes(Profile_sample_type, wallNanos.Bytes())
	enc.int(Profile_period, quantum.Nanoseconds()) // magnitude of sampling period
	enc.bytes(Profile_period_type, wallNanos.Bytes()) // dimension and unit of period
	enc.int(Profile_time_nanos, time.Now().UnixNano()) // start (real) time of profile

	startNano := nanotime()

	// Read profile events from the channel
	// until it is closed by StopProfiler.
	for e := range profiler.events {
	sample := new(bytes.Buffer)
	sampleenc := protoEncoder{w: sample}
	sampleenc.int(Sample_value, e.time.Nanoseconds()) // wall nanoseconds
	for _, fr := range e.stack {
	sampleenc.uint(Sample_location_id, location(fr))
	}
	enc.bytes(Profile_sample, sample.Bytes())
	}

	endNano := nanotime()
	enc.int(Profile_duration_nanos, endNano-startNano)

	err := gz.Close() // Close reports any prior write error
	if flushErr := bufw.Flush(); err == nil {
	err = flushErr
	}
	profiler.done <- err
	}

	// nanotime returns the time in nanoseconds since epoch.
	// It is implemented by runtime.nanotime using the linkname hack;
	// runtime.nanotime is defined for all OSs/ARCHS and uses the
	// monotonic system clock, which there is no portable way to access.
	// Should that function ever go away, these alternatives exist:
	//
	// // POSIX only. REALTIME not MONOTONIC. 17ns.
	// var tv syscall.Timeval
	// syscall.Gettimeofday(&tv) // can't fail
	// return tv.Nano()
	//
	// // Portable. REALTIME not MONOTONIC. 46ns.
	// return time.Now().Nanoseconds()
	//
	// // POSIX only. Adds a dependency.
	// import "golang.org/x/sys/unix"
	// var ts unix.Timespec
	// unix.ClockGettime(CLOCK_MONOTONIC, &ts) // can't fail
	// return unix.TimespecToNsec(ts)
	//
	//go:linkname nanotime runtime.nanotime
	func nanotime() int64

	// profFuncAddr returns the canonical "address"
	// of a Callable for use by the profiler.
	func profFuncAddr(fn Callable) uintptr {
	switch fn := fn.(type) {
	case *Builtin:
	return reflect.ValueOf(fn.fn).Pointer()
	case *Function:
	return uintptr(unsafe.Pointer(fn.funcode))
	}

	// User-defined callable types are typically of
	// of kind pointer-to-struct. Handle them specially.
	if v := reflect.ValueOf(fn); v.Type().Kind() == reflect.Ptr {
	return v.Pointer()
	}

	// Address zero is reserved by the protocol.
	// Use 1 for callables we don't recognize.
	log.Printf("Starlark profiler: no address for Callable %T", fn)
	return 1
	}

	// We encode the protocol message by hand to avoid making
	// the interpreter depend on both github.com/google/pprof
	// and github.com/golang/protobuf.
	//
	// This also avoids the need to materialize a protocol message object
	// tree of unbounded size and serialize it all at the end.
	// The pprof format appears to have been designed to
	// permit streaming implementations such as this one.
	//
	// See https://developers.google.com/protocol-buffers/docs/encoding.
	type protoEncoder struct {
	w io.Writer // bytes.Buffer or gzip.Writer
	tmp [binary.MaxVarintLen64]byte
	}

	func (e *protoEncoder) uvarint(x uint64) {
	n := binary.PutUvarint(e.tmp[:], x)
	e.w.Write(e.tmp[:n])
	}

	func (e *protoEncoder) tag(field, wire uint) {
	e.uvarint(uint64(field<<3 \| wire))
	}

	func (e *protoEncoder) string(field uint, s string) {
	e.tag(field, 2) // length-delimited
	e.uvarint(uint64(len(s)))
	io.WriteString(e.w, s)
	}

	func (e *protoEncoder) bytes(field uint, b []byte) {
	e.tag(field, 2) // length-delimited
	e.uvarint(uint64(len(b)))
	e.w.Write(b)
	}

	func (e *protoEncoder) uint(field uint, x uint64) {
	e.tag(field, 0) // varint
	e.uvarint(x)
	}

	func (e *protoEncoder) int(field uint, x int64) {
	e.tag(field, 0) // varint
	e.uvarint(uint64(x))
	}