tensorflow/compiler/xla/service/memory_space_assignment.h - platform/external/tensorflow - Git at Google

 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_H_

 #include "tensorflow/compiler/xla/service/heap_simulator.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"

 namespace xla {

 // This class contains pre-set assignments determined by memory space
 // assignment. It contains two data structures: (1) a chunks vector that maps a
 // defining HloPosition to a Chunk (offset and size), and (2) a sizes vector
 // that maps the memory space to its size. If there is only one alternate memory
 // space like there is currently, there will be one entry in sizes.
 class PresetAssignments {
  public:
   PresetAssignments() = default;

   void add_chunk(const HloPosition& position,
                  const HeapSimulator::Chunk& chunk) {
     chunks_.emplace_back(position, chunk);
   }

   void add_size(int64 memory_space, int64 size) {
     sizes_.emplace_back(memory_space, size);
   }

   absl::Span<const std::pair<const HloPosition, const HeapSimulator::Chunk>>
   chunks() const {
     return chunks_;
   }

   absl::Span<const std::pair<int64, int64>> sizes() const { return sizes_; }

  private:
   std::vector<std::pair<const HloPosition, const HeapSimulator::Chunk>> chunks_;
   std::vector<std::pair<int64, int64>> sizes_;
 };

 // MemorySpaceAssignment assigns memory spaces (default or alternate) to each
 // instruction in the module. It will greedily try placing as as many values in
 // the alternate memory space as possible. It uses the heap simulator to
 // determine the actual allocation offsets of values in the alternate memory
 // space to account for fragmentation. The default memory space is assumed to be
 // large enough to hold the values that could not be placed in the alternate
 // memory space.
 class MemorySpaceAssignment {
  public:
   using Chunk = HeapSimulator::Chunk;

   // MemorySpaceAssignment uses a notion of a slow and large default memory
   // space and a fast and small alternate memory space.
   enum class MemorySpace { kDefault, kAlternate };

   // This class represents an allocation that might either be in the default or
   // alternate memory. An HloValue might live in multiple different allocations
   // over its lifetime. The lifetimes of the allocations are defined using
   // start_time and end_time, which corresponds to the instruction indexes in
   // the flattened schedule. Each of these allocations might partially overlap
   // with each other. CopyAllocation defined below represents asynchronous
   // copies between Allocations.
   //
   // Consider an instruction Foo, and its users Bar and Baz, and the times given
   // in terms of the flattened schedule of the entire module:
   //
   //      Foo:10
   //       /   \
   //    Bar:14  \
   //           Baz:25
   //
   // A valid memory space assignment could be like the following:
   //
   //  Time:         10 ... 14        ...      25
   //                Foo    Bar                Baz
   //  Alternate     +-------+           +-----+
   //  Default           +---------------------+
   //                    ^   ^           ^     ^
   //                    |   |           |     |
   //                evict   evict  prefetch  prefetch
   //                start    end    start      end
   //
   // This would be represented with:
   //   - Allocation(memory_space=kAlternate, start_time=10, end_time=14)
   //   - CopyAllocation(memory_space=kDefault, start_time=12, end_time=25)
   //   - CopyAllocation(memory_space=kAlternate, start_time=22, end_time=25)
   class Allocation {
    public:
     Allocation(HloInstruction* instruction, HloPosition defining_position,
                MemorySpace memory_space, Chunk chunk, int64 start_time,
                int64 end_time)
         : instruction_(instruction),
           defining_position_(defining_position),
           memory_space_(memory_space),
           chunk_(chunk),
           start_time_(start_time),
           end_time_(end_time) {}
     virtual ~Allocation() = default;

     virtual bool is_copy_allocation() const { return false; }

     // Adds a use to this allocation.
     void AddUse(HloUse use);

     // Extends the end time of this allocation.
     void Extend(int64 end_time) { end_time_ = end_time; }

     // After all of the time ranges for the allocations have been assigned,
     // Process morphs the instructions affected to assign the memory spaces and
     // insert asynchronous copy instructions if necessary.
     virtual Status Process(MemorySpaceAssignment* memory_space_assignment);

     // Returns the instruction that produces this allocation. It might be
     // different than the instruction in defining_position (e.g., a
     // GetTupleElement instruction does not define the buffer).
     virtual HloInstruction* instruction() const { return instruction_; }

     // Returns the defining position for this allocation.
     HloPosition defining_position() const { return defining_position_; }

     const std::vector<HloUse>& uses() const { return uses_; }
     MemorySpace memory_space() const { return memory_space_; }
     Chunk chunk() const { return chunk_; }
     void set_start_time(int64 start_time) { start_time_ = start_time; }
     int64 start_time() const { return start_time_; }
     int64 end_time() const { return end_time_; }

    protected:
     HloInstruction* instruction_;
     HloPosition defining_position_;
     std::vector<HloUse> uses_;
     std::vector<HloInstruction*> bitcasts_;
     MemorySpace memory_space_;
     Chunk chunk_;
     int64 start_time_;
     int64 end_time_;
   };

   // This class represents an allocation as a result of an asynchronous copy.
   class CopyAllocation : public Allocation {
    public:
     CopyAllocation(const Allocation& prev_allocation, MemorySpace memory_space,
                    Chunk chunk, int64 start_time, int64 end_time)
         : Allocation(/*instruction=*/nullptr,
                      prev_allocation.defining_position(), memory_space, chunk,
                      start_time, end_time),
           prev_allocation_(prev_allocation),
           copy_start_schedule_after_(start_time),
           copy_done_schedule_before_(end_time) {}

     bool is_copy_allocation() const override { return true; }

     Status Process(MemorySpaceAssignment* memory_space_assignment) override;

     HloInstruction* instruction() const override {
       // Unless explicitly set, the instruction of a copy allocation in
       // retrieved from the previous allocation.
       if (instruction_ != nullptr) {
         return instruction_;
       } else {
         return prev_allocation_.instruction();
       }
     }

     HloInstruction* copy_start() const { return copy_start_; }
     HloInstruction* copy_done() const { return copy_done_; }

     int64 copy_start_schedule_after() const {
       return copy_start_schedule_after_;
     }
     int64 copy_done_schedule_before() const {
       return copy_done_schedule_before_;
     }

     void set_copy_start_schedule_after(int64 copy_start_schedule_after) {
       copy_start_schedule_after_ = copy_start_schedule_after;
     }

    private:
     const Allocation& prev_allocation_;
     // These variables define the scheduling boundaries where CopyStart and
     // CopyDone can be scheduled. The earliest CopyStart can be scheduled is
     // after copy_start_schedule_after_ and the latest CopyDone can be scheduled
     // is before copy_done_schedule_before_.
     int64 copy_start_schedule_after_;
     int64 copy_done_schedule_before_;
     HloInstruction* copy_start_;
     HloInstruction* copy_done_;
   };

   using AllocationSequence = std::list<std::unique_ptr<Allocation>>;
   using AllocationMap =
       absl::flat_hash_map<const HloBuffer*, AllocationSequence>;

   // Runs the MemorySpaceAssignment pass. alternate_memory_space is the
   // architecture-specific integer value that describes the alternate memory.
   // max_size_in_bytes is the maximum size of the alternate memory.
   // min/max_prefetch_interval define min/max number of independent instructions
   // that can be overlapped while prefetching to decide how early can prefetch
   // begin. alternate_memory_space_alignment_in_bytes is the alignment required
   // in the alternate memory space, size_fn is the size function for buffer
   // values, and is_allowed_in_alternate_mem can be used to prevent certain
   // HloValues (e.g., based on the opcode) to be placed on the alternate memory.
   // max_outstanding_async_copies specifies the upper bound for number of
   // outstanding asynchronous copies, -1 for unlimited.
   // TODO(berkin): Use the cost model instead of using number of instructions to
   // decide how early to prefetch.
   static StatusOr<std::unique_ptr<PresetAssignments>> Run(
       HloModule* module, int64 alternate_memory_space, int64 max_size_in_bytes,
       int64 min_prefetch_interval, int64 max_prefetch_interval,
       int64 alternate_memory_space_alignment_in_bytes,
       BufferValue::SizeFunction size_fn,
       std::function<bool(const HloValue&)> is_allowed_in_alternate_mem,
       int64 max_outstanding_async_copies = -1);

   // Returns the maximum number of outstanding asynchronous copies in the
   // module.
   static int64 CountMaximumOutstandingAsyncCopies(const HloModule& module);

  private:
   MemorySpaceAssignment(HloModule* module, int64 alternate_memory_space)
       : module_(module),
         alternate_memory_space_(alternate_memory_space),
         preset_assignments_(absl::make_unique<PresetAssignments>()) {}

   // Process calls Process methods of the allocations after the allocations have
   // been finalized.
   Status Process();

   // FixSchedule inserts asynchronous copies in the schedule.
   Status FixSchedule();

   // Insert an instruction to the schedule, and make sure its dependencies
   // (operands) are already in the schedule. If not, insert these operands
   // before the instruction.
   void EnsureInstructionAndOperandsInserted(
       HloInstruction* new_instruction, HloInstructionSequence* new_sequence,
       absl::flat_hash_set<HloInstruction*>* inserted_instructions) const;

   // Schedules asynchronous copies and ensures that the CopyStarts and their
   // corresponding CopyDones follow the same order.
   void ScheduleAsynchronousCopies();

   // Add the position to the pending positions that will be colored as alternate
   // memory.
   void AddPositionInAlternateMemorySpace(HloPosition position);

   HloModule* module_;
   int64 alternate_memory_space_;
   std::unique_ptr<HloLiveRange> hlo_live_range_;
   AllocationMap allocation_map_;
   std::unique_ptr<PresetAssignments> preset_assignments_;

   // These maps hold vectors of new instructions that need to be scheduled after
   // (or before) the instruction index in the key. FixSchedule uses these maps
   // to modify and fix the schedule.
   absl::flat_hash_map<int64, std::vector<HloInstruction*>> schedule_after_;
   absl::flat_hash_map<int64, std::vector<HloInstruction*>> schedule_before_;
   std::vector<HloPosition> pending_positions_in_alternate_mem_;
 };

 // This class inherits from GlobalDecreasingSizeBestFitHeap with a notion of
 // maximum size.
 class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap {
  public:
   using IsAllowedInAlternateMemoryFunction =
       std::function<bool(const HloValue&)>;
   using MemorySpace = MemorySpaceAssignment::MemorySpace;

   AlternateMemoryBestFitHeap(
       MemorySpaceAssignment::AllocationMap* allocation_map,
       int64 max_size_in_bytes, int64 min_prefetch_interval,
       int64 max_prefetch_interval, const HloAliasAnalysis& alias_analysis,
       const HloLiveRange& hlo_live_range, int64 alignment,
       GlobalDecreasingSizeBestFitHeap::Type type,
       IsAllowedInAlternateMemoryFunction is_allowed_in_alternate_mem,
       int64 max_outstanding_async_copies)
       : GlobalDecreasingSizeBestFitHeap(alignment, type),
         allocation_map_(allocation_map),
         max_size_in_bytes_(max_size_in_bytes),
         min_prefetch_interval_(min_prefetch_interval),
         max_prefetch_interval_(max_prefetch_interval),
         alias_analysis_(alias_analysis),
         hlo_live_range_(hlo_live_range),
         is_allowed_in_alternate_mem_(is_allowed_in_alternate_mem),
         max_outstanding_async_copies_(max_outstanding_async_copies) {}

   HeapSimulator::Result Finish() override;

  private:
   // Finds an allocation for the given interval. Internally, it will attempt to
   // find a suitable chunk candidate within the heap size and prefetch interval
   // limits, and append the new allocation(s) to allocations. The new
   // allocations can be in default or alternate memory spaces, or can be
   // prefetches or evictions. Returns true if successful.
   bool FindAllocation(int64 start_time, int64 end_time, int64 last_use_time,
                       HloPosition defining_position, HloUse use,
                       const HloValue* buffer, int64 size,
                       MemorySpaceAssignment::AllocationSequence* allocations);

   // Try allocating in alternate memory without any copies. Returns true if
   // successful.
   bool TryAllocatingInAlternateMemoryNoCopy(
       int64 start_time, int64 end_time, int64 last_use_time,
       HloPosition defining_position, HloUse use,
       BufferInterval alternate_mem_interval,
       HloInstruction* non_bitcast_operand,
       MemorySpaceAssignment::AllocationSequence* allocations);

   // Given a buffer interval, returns the colocated intervals. Unlike the
   // similar GlobalDecreasingSizeBestFitHeap::GetTransitiveColocations, it
   // returns the colocated intervals sorted by scheduled time.
   std::vector<const BufferInterval*> GetSortedColocatedIntervals(
       const BufferInterval& interval) const;

   // Since the allocations are recorded to the AllocationMap, we don't maintain
   // result_ in GlobalDecreasingSizeBestFitHeap. Override AddToChunkMap to avoid
   // unnecessarily adding the chunk to the chunk map.
   void AddToChunkMap(const HloValue* buffer, Chunk chunk) override {}

   // Returns true if the addition of an asynchronous copy in the given time
   // interval would violate the maximum number of asynchronous copies.
   bool ViolatesMaximumOutstandingAsyncCopies(int64 start_time,
                                              int64 end_time) const;

   // Adds an asynchronous copy to the allocations.
   void AddAsyncCopy(const MemorySpaceAssignment::Allocation& prev_allocation,
                     MemorySpace memory_space, Chunk chunk, int64 start_time,
                     int64 end_time,
                     MemorySpaceAssignment::AllocationSequence* allocations);

   // These methods are used for delaying committing the chunk candidate until
   // the entire live range of the buffer has been considered.
   void AddToPendingChunks(const BufferInterval& buffer_interval,
                           const ChunkCandidate& chunk_candidate);
   void CommitPendingChunks();

   MemorySpaceAssignment::AllocationMap* allocation_map_;
   int64 max_size_in_bytes_;
   // The min and max prefetch intervals decribe the number of independent HLOs
   // overlapped while a value is being prefetched into the alternate memory
   // (between CopyStart and CopyDone HLO instructions). max_prefetch_interval
   // attempts to prevent bringing tensors into the alternate memory too eagerly
   // and hence occupying the space for other tensors which might use it.
   // min_prefetch_interval attempts to prevent cases where tensors are
   // prefetched into the alternate memory without sufficient time for the copy
   // to take place. In those cases, it's just better to keep the tensor in the
   // default memory instead of hurting the critical path with this copy that
   // likely won't finish in time.
   // TODO(berkin): Explore heuristics that take into account the cost of copying
   // tensors between alternate and default memories.
   int64 min_prefetch_interval_;
   int64 max_prefetch_interval_;
   const HloAliasAnalysis& alias_analysis_;
   const HloLiveRange& hlo_live_range_;
   IsAllowedInAlternateMemoryFunction is_allowed_in_alternate_mem_;
   // We use a interval tree to keep track of the number of outstanding
   // asynchronous copies.
   BufferIntervalTree async_copy_interval_tree_;
   int64 max_outstanding_async_copies_;
   std::vector<std::pair<BufferInterval, ChunkCandidate>> pending_chunks_;
   std::vector<std::pair<int64, int64>> pending_async_copies_;
 };

 }  // namespace xla

 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_H_
	/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

	Licensed under the Apache License, Version 2.0 (the "License");
	you may not use this file except in compliance with the License.
	You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.
	==============================================================================*/

	#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_H_
	#define TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_H_

	#include "tensorflow/compiler/xla/service/heap_simulator.h"
	#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"

	namespace xla {

	// This class contains pre-set assignments determined by memory space
	// assignment. It contains two data structures: (1) a chunks vector that maps a
	// defining HloPosition to a Chunk (offset and size), and (2) a sizes vector
	// that maps the memory space to its size. If there is only one alternate memory
	// space like there is currently, there will be one entry in sizes.
	class PresetAssignments {
	public:
	PresetAssignments() = default;

	void add_chunk(const HloPosition& position,
	const HeapSimulator::Chunk& chunk) {
	chunks_.emplace_back(position, chunk);
	}

	void add_size(int64 memory_space, int64 size) {
	sizes_.emplace_back(memory_space, size);
	}

	absl::Span<const std::pair<const HloPosition, const HeapSimulator::Chunk>>
	chunks() const {
	return chunks_;
	}

	absl::Span<const std::pair<int64, int64>> sizes() const { return sizes_; }

	private:
	std::vector<std::pair<const HloPosition, const HeapSimulator::Chunk>> chunks_;
	std::vector<std::pair<int64, int64>> sizes_;
	};

	// MemorySpaceAssignment assigns memory spaces (default or alternate) to each
	// instruction in the module. It will greedily try placing as as many values in
	// the alternate memory space as possible. It uses the heap simulator to
	// determine the actual allocation offsets of values in the alternate memory
	// space to account for fragmentation. The default memory space is assumed to be
	// large enough to hold the values that could not be placed in the alternate
	// memory space.
	class MemorySpaceAssignment {
	public:
	using Chunk = HeapSimulator::Chunk;

	// MemorySpaceAssignment uses a notion of a slow and large default memory
	// space and a fast and small alternate memory space.
	enum class MemorySpace { kDefault, kAlternate };

	// This class represents an allocation that might either be in the default or
	// alternate memory. An HloValue might live in multiple different allocations
	// over its lifetime. The lifetimes of the allocations are defined using
	// start_time and end_time, which corresponds to the instruction indexes in
	// the flattened schedule. Each of these allocations might partially overlap
	// with each other. CopyAllocation defined below represents asynchronous
	// copies between Allocations.
	//
	// Consider an instruction Foo, and its users Bar and Baz, and the times given
	// in terms of the flattened schedule of the entire module:
	//
	// Foo:10
	// / \
	// Bar:14 \
	// Baz:25
	//
	// A valid memory space assignment could be like the following:
	//
	// Time: 10 ... 14 ... 25
	// Foo Bar Baz
	// Alternate +-------+ +-----+
	// Default +---------------------+
	// ^ ^ ^ ^
	// \| \| \| \|
	// evict evict prefetch prefetch
	// start end start end
	//
	// This would be represented with:
	// - Allocation(memory_space=kAlternate, start_time=10, end_time=14)
	// - CopyAllocation(memory_space=kDefault, start_time=12, end_time=25)
	// - CopyAllocation(memory_space=kAlternate, start_time=22, end_time=25)
	class Allocation {
	public:
	Allocation(HloInstruction* instruction, HloPosition defining_position,
	MemorySpace memory_space, Chunk chunk, int64 start_time,
	int64 end_time)
	: instruction_(instruction),
	defining_position_(defining_position),
	memory_space_(memory_space),
	chunk_(chunk),
	start_time_(start_time),
	end_time_(end_time) {}
	virtual ~Allocation() = default;

	virtual bool is_copy_allocation() const { return false; }

	// Adds a use to this allocation.
	void AddUse(HloUse use);

	// Extends the end time of this allocation.
	void Extend(int64 end_time) { end_time_ = end_time; }

	// After all of the time ranges for the allocations have been assigned,
	// Process morphs the instructions affected to assign the memory spaces and
	// insert asynchronous copy instructions if necessary.
	virtual Status Process(MemorySpaceAssignment* memory_space_assignment);

	// Returns the instruction that produces this allocation. It might be
	// different than the instruction in defining_position (e.g., a
	// GetTupleElement instruction does not define the buffer).
	virtual HloInstruction* instruction() const { return instruction_; }

	// Returns the defining position for this allocation.
	HloPosition defining_position() const { return defining_position_; }

	const std::vector<HloUse>& uses() const { return uses_; }
	MemorySpace memory_space() const { return memory_space_; }
	Chunk chunk() const { return chunk_; }
	void set_start_time(int64 start_time) { start_time_ = start_time; }
	int64 start_time() const { return start_time_; }
	int64 end_time() const { return end_time_; }

	protected:
	HloInstruction* instruction_;
	HloPosition defining_position_;
	std::vector<HloUse> uses_;
	std::vector<HloInstruction*> bitcasts_;
	MemorySpace memory_space_;
	Chunk chunk_;
	int64 start_time_;
	int64 end_time_;
	};

	// This class represents an allocation as a result of an asynchronous copy.
	class CopyAllocation : public Allocation {
	public:
	CopyAllocation(const Allocation& prev_allocation, MemorySpace memory_space,
	Chunk chunk, int64 start_time, int64 end_time)
	: Allocation(/instruction=/nullptr,
	prev_allocation.defining_position(), memory_space, chunk,
	start_time, end_time),
	prev_allocation_(prev_allocation),
	copy_start_schedule_after_(start_time),
	copy_done_schedule_before_(end_time) {}

	bool is_copy_allocation() const override { return true; }

	Status Process(MemorySpaceAssignment* memory_space_assignment) override;

	HloInstruction* instruction() const override {
	// Unless explicitly set, the instruction of a copy allocation in
	// retrieved from the previous allocation.
	if (instruction_ != nullptr) {
	return instruction_;
	} else {
	return prev_allocation_.instruction();
	}
	}

	HloInstruction* copy_start() const { return copy_start_; }
	HloInstruction* copy_done() const { return copy_done_; }

	int64 copy_start_schedule_after() const {
	return copy_start_schedule_after_;
	}
	int64 copy_done_schedule_before() const {
	return copy_done_schedule_before_;
	}

	void set_copy_start_schedule_after(int64 copy_start_schedule_after) {
	copy_start_schedule_after_ = copy_start_schedule_after;
	}

	private:
	const Allocation& prev_allocation_;
	// These variables define the scheduling boundaries where CopyStart and
	// CopyDone can be scheduled. The earliest CopyStart can be scheduled is
	// after copy_start_schedule_after_ and the latest CopyDone can be scheduled
	// is before copy_done_schedule_before_.
	int64 copy_start_schedule_after_;
	int64 copy_done_schedule_before_;
	HloInstruction* copy_start_;
	HloInstruction* copy_done_;
	};

	using AllocationSequence = std::list<std::unique_ptr<Allocation>>;
	using AllocationMap =
	absl::flat_hash_map<const HloBuffer*, AllocationSequence>;

	// Runs the MemorySpaceAssignment pass. alternate_memory_space is the
	// architecture-specific integer value that describes the alternate memory.
	// max_size_in_bytes is the maximum size of the alternate memory.
	// min/max_prefetch_interval define min/max number of independent instructions
	// that can be overlapped while prefetching to decide how early can prefetch
	// begin. alternate_memory_space_alignment_in_bytes is the alignment required
	// in the alternate memory space, size_fn is the size function for buffer
	// values, and is_allowed_in_alternate_mem can be used to prevent certain
	// HloValues (e.g., based on the opcode) to be placed on the alternate memory.
	// max_outstanding_async_copies specifies the upper bound for number of
	// outstanding asynchronous copies, -1 for unlimited.
	// TODO(berkin): Use the cost model instead of using number of instructions to
	// decide how early to prefetch.
	static StatusOr<std::unique_ptr<PresetAssignments>> Run(
	HloModule* module, int64 alternate_memory_space, int64 max_size_in_bytes,
	int64 min_prefetch_interval, int64 max_prefetch_interval,
	int64 alternate_memory_space_alignment_in_bytes,
	BufferValue::SizeFunction size_fn,
	std::function<bool(const HloValue&)> is_allowed_in_alternate_mem,
	int64 max_outstanding_async_copies = -1);

	// Returns the maximum number of outstanding asynchronous copies in the
	// module.
	static int64 CountMaximumOutstandingAsyncCopies(const HloModule& module);

	private:
	MemorySpaceAssignment(HloModule* module, int64 alternate_memory_space)
	: module_(module),
	alternate_memory_space_(alternate_memory_space),
	preset_assignments_(absl::make_unique<PresetAssignments>()) {}

	// Process calls Process methods of the allocations after the allocations have
	// been finalized.
	Status Process();

	// FixSchedule inserts asynchronous copies in the schedule.
	Status FixSchedule();

	// Insert an instruction to the schedule, and make sure its dependencies
	// (operands) are already in the schedule. If not, insert these operands
	// before the instruction.
	void EnsureInstructionAndOperandsInserted(
	HloInstruction* new_instruction, HloInstructionSequence* new_sequence,
	absl::flat_hash_set<HloInstruction> inserted_instructions) const;

	// Schedules asynchronous copies and ensures that the CopyStarts and their
	// corresponding CopyDones follow the same order.
	void ScheduleAsynchronousCopies();

	// Add the position to the pending positions that will be colored as alternate
	// memory.
	void AddPositionInAlternateMemorySpace(HloPosition position);

	HloModule* module_;
	int64 alternate_memory_space_;
	std::unique_ptr<HloLiveRange> hlo_live_range_;
	AllocationMap allocation_map_;
	std::unique_ptr<PresetAssignments> preset_assignments_;

	// These maps hold vectors of new instructions that need to be scheduled after
	// (or before) the instruction index in the key. FixSchedule uses these maps
	// to modify and fix the schedule.
	absl::flat_hash_map<int64, std::vector<HloInstruction*>> schedule_after_;
	absl::flat_hash_map<int64, std::vector<HloInstruction*>> schedule_before_;
	std::vector<HloPosition> pending_positions_in_alternate_mem_;
	};

	// This class inherits from GlobalDecreasingSizeBestFitHeap with a notion of
	// maximum size.
	class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap {
	public:
	using IsAllowedInAlternateMemoryFunction =
	std::function<bool(const HloValue&)>;
	using MemorySpace = MemorySpaceAssignment::MemorySpace;

	AlternateMemoryBestFitHeap(
	MemorySpaceAssignment::AllocationMap* allocation_map,
	int64 max_size_in_bytes, int64 min_prefetch_interval,
	int64 max_prefetch_interval, const HloAliasAnalysis& alias_analysis,
	const HloLiveRange& hlo_live_range, int64 alignment,
	GlobalDecreasingSizeBestFitHeap::Type type,
	IsAllowedInAlternateMemoryFunction is_allowed_in_alternate_mem,
	int64 max_outstanding_async_copies)
	: GlobalDecreasingSizeBestFitHeap(alignment, type),
	allocation_map_(allocation_map),
	max_size_in_bytes_(max_size_in_bytes),
	min_prefetch_interval_(min_prefetch_interval),
	max_prefetch_interval_(max_prefetch_interval),
	alias_analysis_(alias_analysis),
	hlo_live_range_(hlo_live_range),
	is_allowed_in_alternate_mem_(is_allowed_in_alternate_mem),
	max_outstanding_async_copies_(max_outstanding_async_copies) {}

	HeapSimulator::Result Finish() override;

	private:
	// Finds an allocation for the given interval. Internally, it will attempt to
	// find a suitable chunk candidate within the heap size and prefetch interval
	// limits, and append the new allocation(s) to allocations. The new
	// allocations can be in default or alternate memory spaces, or can be
	// prefetches or evictions. Returns true if successful.
	bool FindAllocation(int64 start_time, int64 end_time, int64 last_use_time,
	HloPosition defining_position, HloUse use,
	const HloValue* buffer, int64 size,
	MemorySpaceAssignment::AllocationSequence* allocations);

	// Try allocating in alternate memory without any copies. Returns true if
	// successful.
	bool TryAllocatingInAlternateMemoryNoCopy(
	int64 start_time, int64 end_time, int64 last_use_time,
	HloPosition defining_position, HloUse use,
	BufferInterval alternate_mem_interval,
	HloInstruction* non_bitcast_operand,
	MemorySpaceAssignment::AllocationSequence* allocations);

	// Given a buffer interval, returns the colocated intervals. Unlike the
	// similar GlobalDecreasingSizeBestFitHeap::GetTransitiveColocations, it
	// returns the colocated intervals sorted by scheduled time.
	std::vector<const BufferInterval*> GetSortedColocatedIntervals(
	const BufferInterval& interval) const;

	// Since the allocations are recorded to the AllocationMap, we don't maintain
	// result_ in GlobalDecreasingSizeBestFitHeap. Override AddToChunkMap to avoid
	// unnecessarily adding the chunk to the chunk map.
	void AddToChunkMap(const HloValue* buffer, Chunk chunk) override {}

	// Returns true if the addition of an asynchronous copy in the given time
	// interval would violate the maximum number of asynchronous copies.
	bool ViolatesMaximumOutstandingAsyncCopies(int64 start_time,
	int64 end_time) const;

	// Adds an asynchronous copy to the allocations.
	void AddAsyncCopy(const MemorySpaceAssignment::Allocation& prev_allocation,
	MemorySpace memory_space, Chunk chunk, int64 start_time,
	int64 end_time,
	MemorySpaceAssignment::AllocationSequence* allocations);

	// These methods are used for delaying committing the chunk candidate until
	// the entire live range of the buffer has been considered.
	void AddToPendingChunks(const BufferInterval& buffer_interval,
	const ChunkCandidate& chunk_candidate);
	void CommitPendingChunks();

	MemorySpaceAssignment::AllocationMap* allocation_map_;
	int64 max_size_in_bytes_;
	// The min and max prefetch intervals decribe the number of independent HLOs
	// overlapped while a value is being prefetched into the alternate memory
	// (between CopyStart and CopyDone HLO instructions). max_prefetch_interval
	// attempts to prevent bringing tensors into the alternate memory too eagerly
	// and hence occupying the space for other tensors which might use it.
	// min_prefetch_interval attempts to prevent cases where tensors are
	// prefetched into the alternate memory without sufficient time for the copy
	// to take place. In those cases, it's just better to keep the tensor in the
	// default memory instead of hurting the critical path with this copy that
	// likely won't finish in time.
	// TODO(berkin): Explore heuristics that take into account the cost of copying
	// tensors between alternate and default memories.
	int64 min_prefetch_interval_;
	int64 max_prefetch_interval_;
	const HloAliasAnalysis& alias_analysis_;
	const HloLiveRange& hlo_live_range_;
	IsAllowedInAlternateMemoryFunction is_allowed_in_alternate_mem_;
	// We use a interval tree to keep track of the number of outstanding
	// asynchronous copies.
	BufferIntervalTree async_copy_interval_tree_;
	int64 max_outstanding_async_copies_;
	std::vector<std::pair<BufferInterval, ChunkCandidate>> pending_chunks_;
	std::vector<std::pair<int64, int64>> pending_async_copies_;
	};

	} // namespace xla

	#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_H_