/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <algorithm>
#include <cmath>
#include <iterator>
#include <list>
#include <memory>
#include <utility>
#include <vector>
#include "mlir-hlo/Analysis/userange_analysis.h"
#include "mlir-hlo/Transforms/PassDetail.h"
#include "mlir-hlo/Transforms/passes.h"
#include "mlir-hlo/utils/hlo_utils.h"
#include "mlir/Analysis/BufferViewFlowAnalysis.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/Bufferization/Transforms/BufferUtils.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/Operation.h"
#include "mlir/Pass/Pass.h"
namespace mlir {
namespace {
/// Returns the length of a userange interval.
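/// For example, the interval [2, 5] covers the ids 2, 3, 4 and 5, so its
/// length is 4.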
size_t computeUserangeSize(const UseInterval &interval) {
return interval.end - interval.start + 1;
}
/// Compute the byte size of a given Value.
size_t computeByteSize(const Value &v) {
auto type = v.getType().cast<ShapedType>();
return type.getSizeInBits() / 8;
}
/// Compute the number of 64-byte aligned segments a given Value occupies.
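/// For example, a buffer of 100 bytes occupies ceil(100 / 64) = 2 segments.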
size_t computeAlignedSegments(const Value &v) {
size_t padding = 64;
size_t bytes = computeByteSize(v);
return std::ceil(bytes / (double)padding);
}
/// The buffer offset information.
struct AllocBufferOffset {
AllocBufferOffset(Value source, size_t offset)
: source(source), offset(offset) {}
Value source;
size_t offset;
};
/// Contains the information needed to create a new buffer that is used to
/// pack other buffers.
struct PackedBuffer {
PackedBuffer(size_t numSegments,
std::vector<AllocBufferOffset> &packedBuffers)
: numSegments(numSegments), allocBufferOffsets(packedBuffers) {}
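/// The total size of the packed buffer. Note that the only call site passes a
/// byte count (numSegments * 64), which is later used directly as the size of
/// the packed i8 buffer.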
size_t numSegments;
std::vector<AllocBufferOffset> allocBufferOffsets;
};
/// Contains the information about a buffer's allocation for sorting and
/// checking if it fits into other buffers and vice versa.
/// This structure contains the allocation value, the first and last userange
/// id of a buffer, the window id, the number of aligned 64-byte segments, and
/// all userange intervals.
struct AllocationInfo {
AllocationInfo(Value alloc, size_t allocUserangeId, size_t firstUse,
size_t lastUse, size_t numSegments, size_t windowId,
const UseInterval::Vector *userangeIntervals)
: alloc(alloc),
allocUserangeId(allocUserangeId),
firstUse(firstUse),
lastUse(lastUse),
numSegments(numSegments),
windowId(windowId),
userangeIntervals(userangeIntervals) {}
/// The allocation value.
Value alloc;
/// The id of allocation based on the Userange Analysis.
size_t allocUserangeId;
/// The first use of the buffer.
size_t firstUse;
/// The last use of the buffer based on the Userange Analysis.
size_t lastUse;
/// The number of aligned 64-byte segments of contiguous memory.
size_t numSegments;
/// The window id of the allocation position.
size_t windowId;
/// The userange intervals of the buffer.
const UseInterval::Vector *userangeIntervals;
/// Compute the gaps of the alloc userange, each paired with the number of
/// segments that are free in that gap (initially all of the buffer's
/// segments). The maxUserangeId is used to add a dummy gap from the last use
/// id to the maxUserangeId. By default the maxUserangeId is zero and no dummy
/// gap is added.
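/// For example, the use intervals {[2, 3], [6, 8]} with maxUserangeId = 10
/// produce the gaps [0, 1], [4, 5] and [9, 10].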
std::list<std::pair<UseInterval, size_t>> computeGaps(
size_t maxUserangeId = 0) {
std::list<std::pair<UseInterval, size_t>> gaps;
// The previous gap ending, initially set to 0.
size_t gapEnd = 0;
for (const auto *useRangeIter = userangeIntervals->begin();
useRangeIter < userangeIntervals->end(); ++useRangeIter) {
// Add a gap if the end is not equal to the start.
if (gapEnd < useRangeIter->start)
gaps.emplace_back(UseInterval(gapEnd, useRangeIter->start - 1),
numSegments);
gapEnd = useRangeIter->end + 1;
}
// Add a dummy gap behind the last use of the buffer.
if (gapEnd < maxUserangeId) {
gaps.emplace_back(UseInterval(gapEnd, maxUserangeId), numSegments);
}
return gaps;
}
/// Compute the userange size.
size_t getUserangeSize() const { return lastUse - firstUse + 1; }
};
// Comparator to sort allocation infos by window id, then by userange id, and
// finally by the number of memory segments.
class AllocInfoWinIdComparator {
public:
bool operator()(const AllocationInfo &a, const AllocationInfo &b) {
if (a.windowId == b.windowId) {
if (a.allocUserangeId == b.allocUserangeId)
return a.numSegments > b.numSegments;
return a.allocUserangeId > b.allocUserangeId;
}
return a.windowId < b.windowId;
}
};
// Comparator to sort allocation infos by the number of segments.
class AllocInfoMemSizeCompare {
public:
bool operator()(const AllocationInfo &a, const AllocationInfo &b) {
return a.numSegments > b.numSegments;
}
};
/// This strategy computes a list of allocation infos and sorts it by a given
/// comparator. Iterating from top to bottom, the algorithm tries to fill
/// userange gaps with appropriate buffers that follow in the list, in order
/// to reduce memory consumption. It is a bin packing approach.
template <typename CompareT>
class SortedPackingStrategy {
public:
using AllocInfoList = std::vector<AllocationInfo>;
/// Constructs the sorted packing strategy. The window size is used as the
/// sliding window size. Allocation userange positions that fall into the
/// same window are mapped to the same window id, so the exact starting
/// position of an allocation is blurred.
SortedPackingStrategy(size_t windowSize, CompareT compare)
: windowSize(windowSize), compare(compare) {}
/// Optimize the buffer allocations.
void optimize(const mlir::bufferization::BufferPlacementAllocs &allocs,
const UserangeAnalysis &userangeAnalysis,
std::vector<PackedBuffer> &packedBuffers) {
AllocInfoList allocInfos;
allocInfos.reserve(std::distance(allocs.begin(), allocs.end()));
// Compute the allocation infos and the maximal userange id.
size_t maxUserangeId =
computeAllocationInfos(allocInfos, userangeAnalysis, allocs);
// Sort the allocation infos.
std::sort(allocInfos.begin(), allocInfos.end(), compare);
for (auto currentIter = allocInfos.begin(); currentIter != allocInfos.end();
++currentIter) {
std::vector<AllocBufferOffset> allocBufferOffsets{
AllocBufferOffset(currentIter->alloc, 0)};
// Compute userange gaps.
std::list<std::pair<UseInterval, size_t>> gaps =
currentIter->computeGaps(maxUserangeId);
if (gaps.empty()) continue;
for (auto checkedAllocInfoIter = std::next(currentIter);
checkedAllocInfoIter != allocInfos.end();) {
// Check if a gap exists to pack the memory into.
// If not continue.
if (!findGapAndUpdate(gaps, allocBufferOffsets, *checkedAllocInfoIter,
*currentIter)) {
++checkedAllocInfoIter;
continue;
}
checkedAllocInfoIter = allocInfos.erase(checkedAllocInfoIter);
}
// Add the packed buffer with its total byte size (numSegments * 64) and the
// collected buffer offsets.
packedBuffers.emplace_back(currentIter->numSegments * 64,
allocBufferOffsets);
}
}
private:
const size_t windowSize;
const CompareT compare;
/// Try to find an appropriate userange gap to pack the buffer into. If one is
/// found, only the gaps and the buffer offsets are updated.
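/// For example, packing a buffer with userange [4, 5] and one segment into
/// the gap ([2, 8], 2 free segments) leaves the gaps ([2, 3], 2), ([4, 5], 1)
/// and ([6, 8], 2).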
bool findGapAndUpdate(std::list<std::pair<UseInterval, size_t>> &gaps,
std::vector<AllocBufferOffset> &allocBufferOffsets,
const AllocationInfo &allocToPack,
const AllocationInfo &allocToPackInto) {
// Check if the buffer to pack into has enough memory.
if (allocToPackInto.numSegments < allocToPack.numSegments) return false;
for (auto gapIter = gaps.begin(); gapIter != gaps.end();) {
// The list is sorted, so we can break here.
if (gapIter->first.start > allocToPack.firstUse) break;
// Skip the gap if it does not provide enough free contiguous segments or if
// the buffer's userange does not fit inside it.
if (gapIter->second < allocToPack.numSegments ||
allocToPack.firstUse < gapIter->first.start ||
allocToPack.lastUse > gapIter->first.end) {
++gapIter;
continue;
}
// Record the packed buffer together with its byte offset.
allocBufferOffsets.emplace_back(
allocToPack.alloc,
(allocToPackInto.numSegments - gapIter->second) * 64);
// Update the gap's free segments; the gap will be removed later if no free
// contiguous memory remains. The interval has to be split if the buffer does
// not use the full gap.
size_t freeContiguousMemory = gapIter->second;
gapIter->second = freeContiguousMemory - allocToPack.numSegments;
// Check if the gap must be split. If so, the current gap is trimmed to the
// packed buffer's userange, and new gaps are created before and after it as
// needed.
if (computeUserangeSize(gapIter->first) > allocToPack.getUserangeSize()) {
size_t oldStart = gapIter->first.start;
size_t oldEnd = gapIter->first.end;
gapIter->first.end = allocToPack.lastUse;
gapIter->first.start = allocToPack.firstUse;
// Insert a new gap behind.
if (allocToPack.lastUse < oldEnd)
gaps.insert(
std::next(gapIter),
std::make_pair(UseInterval(allocToPack.lastUse + 1, oldEnd),
freeContiguousMemory));
// Insert a new gap before.
if (allocToPack.firstUse > oldStart)
gaps.insert(
gapIter,
std::make_pair(UseInterval(oldStart, allocToPack.firstUse - 1),
freeContiguousMemory));
}
// If the gap interval has no free contiguous memory left, erase it from the
// list.
if (gapIter->second == 0) gapIter = gaps.erase(gapIter);
return true;
}
return false;
}
/// Aggregates the allocation infos of the allocs and returns the maximal
/// userange id.
size_t computeAllocationInfos(
AllocInfoList &allocInfos, const UserangeAnalysis &userangeAnalysis,
const mlir::bufferization::BufferPlacementAllocs &allocs) {
// Create the allocation infos and store them in allocInfos.
size_t maxUserangeId = 0;
for (auto &allocEntry : allocs) {
Value v = std::get<0>(allocEntry);
auto userangeIntervals = userangeAnalysis.getUserangeInterval(v);
if (!userangeIntervals) continue;
// Computes the userange id of the allocation.
size_t allocUserangeId = userangeAnalysis.computeId(v, v.getDefiningOp());
// Computes the last use of the allocated buffer.
size_t lastUse = std::prev((*userangeIntervals.getValue()).end())->end;
// Computes the first use of the allocated buffer.
size_t firstUse = (*userangeIntervals.getValue()).begin()->start;
// Computes the number of aligned segments of the buffer.
size_t numSegments = computeAlignedSegments(v);
maxUserangeId = std::max(maxUserangeId, lastUse);
allocInfos.emplace_back(v, allocUserangeId, firstUse, lastUse,
numSegments, 0, userangeIntervals.getValue());
}
// If the window size is zero, no window sorting is needed.
if (windowSize == 0) return maxUserangeId;
// Sort the allocation infos by userange id to compute the window ids. The
// window id blurs the userange starting position of an allocation.
std::sort(allocInfos.begin(), allocInfos.end(),
[](const AllocationInfo &a, const AllocationInfo &b) {
return a.allocUserangeId < b.allocUserangeId;
});
// Assign window ids: a new window starts whenever the alloc userange id
// jumps by more than windowSize.
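// For example, with windowSize = 3 and sorted alloc userange ids
// {1, 2, 5, 9}, the assigned window ids are {0, 0, 0, 1}; only the step from
// 5 to 9 exceeds the window size.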
size_t windowId = 0;
size_t lastAllocUserangeId = 0;
for (auto &allocationInfo : allocInfos) {
if (allocationInfo.allocUserangeId > lastAllocUserangeId + windowSize)
++windowId;
lastAllocUserangeId = allocationInfo.allocUserangeId;
allocationInfo.windowId = windowId;
}
return maxUserangeId;
}
};
/// Pass to pack buffers together to optimize memory consumption and to save
/// allocation operations. A strategy must be passed as a template argument.
class BufferPacking : bufferization::BufferPlacementTransformationBase {
public:
template <typename StrategyT>
BufferPacking(Operation *op, StrategyT strategy)
: BufferPlacementTransformationBase(op),
userangeAnalysis(op, allocs, aliases),
dominators(op) {
std::vector<PackedBuffer> packedBuffers;
strategy.optimize(allocs, userangeAnalysis, packedBuffers);
for (auto &packedBuffer : packedBuffers) {
// Find the common dominator block of all packed allocations.
Block *block = findAllocationsDominator(packedBuffer.allocBufferOffsets);
// Create the packed buffer at the front of the dominator block.
mlir::OpBuilder packBuilder(&(block->front()));
auto location = block->front().getLoc();
auto memrefType =
MemRefType::get({static_cast<int64_t>(packedBuffer.numSegments)},
packBuilder.getIntegerType(8));
Value targetBuffer =
packBuilder.create<memref::AllocOp>(location, memrefType);
for (auto &packInfo : packedBuffer.allocBufferOffsets) {
Value currentAlloc = packInfo.source;
size_t offset = packInfo.offset;
Operation *viewDefOp = currentAlloc.getDefiningOp();
Location loc = viewDefOp->getLoc();
mlir::OpBuilder viewBuilder(viewDefOp);
// Create an arith::ConstantOp holding the aligned offset.
Value constantOp = viewBuilder.create<mlir::arith::ConstantOp>(
loc, viewBuilder.getIndexType(),
viewBuilder.getIntegerAttr(viewBuilder.getIndexType(), offset));
// Operands for the ViewOp: the packed buffer and the byte offset.
SmallVector<Value, 4> newOperands{targetBuffer, constantOp};
auto shape = currentAlloc.getType().cast<MemRefType>();
// Create a ViewOp with the shape of the old alloc and use the created
// packed alloc and the constant for the operands.
Value viewOp =
viewBuilder.create<memref::ViewOp>(loc, shape, newOperands);
// Replace all references to the old alloc with the created ViewOp and
// afterwards erase the old alloc.
currentAlloc.replaceAllUsesWith(viewOp);
viewDefOp->erase();
}
}
}
private:
UserangeAnalysis userangeAnalysis;
/// The current dominance info.
DominanceInfo dominators;
/// Find the block that dominates all buffer allocations.
Block *findAllocationsDominator(
const std::vector<AllocBufferOffset> &packingInfos) {
SmallPtrSet<Value, 16> allocValues;
for (auto &packInfo : packingInfos) {
allocValues.insert(packInfo.source);
}
// Find common dominators.
return findCommonDominator(packingInfos.begin()->source, allocValues,
dominators);
}
};
/// Tries to pack allocated buffers together to save allocation operations and
/// memory. The window size is used as the sliding window size. Allocation
/// userange positions that fall into the same window are mapped to the same
/// window id, so the exact starting position of an allocation is blurred.
struct BufferPackingPass : public BufferPackingBase<BufferPackingPass> {
explicit BufferPackingPass(unsigned windowSize) {
this->window_size_ = windowSize;
}
void runOnOperation() override {
if (window_size_ == 0) {
SortedPackingStrategy<AllocInfoMemSizeCompare> strategy(
window_size_, AllocInfoMemSizeCompare());
BufferPacking packing(getOperation(), strategy);
} else {
SortedPackingStrategy<AllocInfoWinIdComparator> strategy(
window_size_, AllocInfoWinIdComparator());
BufferPacking packing(getOperation(), strategy);
}
}
};
/// Pass to find all allocations and to compute memory usage.
struct MemoryCountPass : MemoryCountBase<MemoryCountPass> {
void runOnOperation() override {
Operation *op = getOperation();
std::vector<Value> allocs;
op->walk([&](MemoryEffectOpInterface opInterface) {
// Try to find a single allocation result.
SmallVector<MemoryEffects::EffectInstance, 2> effects;
opInterface.getEffects(effects);
SmallVector<MemoryEffects::EffectInstance, 2> allocateResultEffects;
llvm::copy_if(
effects, std::back_inserter(allocateResultEffects),
[=](MemoryEffects::EffectInstance &it) {
Value value = it.getValue();
return isa<MemoryEffects::Allocate>(it.getEffect()) && value &&
value.isa<OpResult>() &&
it.getResource() !=
SideEffects::AutomaticAllocationScopeResource::get();
});
if (allocateResultEffects.size() != 1) return;
// Insert allocation.
allocs.push_back(allocateResultEffects[0].getValue());
});
auto output = mlir::hlo::computeMemory(allocs);
llvm::outs() << "Memory Count Pass:\n"
<< output.first << ";" << output.second << "\n";
}
};
} // namespace
std::unique_ptr<OperationPass<func::FuncOp>> createBufferPackingPass(
unsigned window_size) {
return std::make_unique<BufferPackingPass>(window_size);
}
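// Minimal usage sketch (hypothetical driver code; assumes `module` is a
// ModuleOp that was parsed elsewhere):
//   mlir::PassManager pm(module.getContext());
//   pm.addNestedPass<func::FuncOp>(createBufferPackingPass(/*window_size=*/2));
//   if (failed(pm.run(module))) {
//     // Handle the pass failure.
//   }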
std::unique_ptr<OperationPass<func::FuncOp>> createMemoryCountPass() {
return std::make_unique<MemoryCountPass>();
}
} // namespace mlir