blob: b02655ca4176dba31952106207827db7d360850d [file] [log] [blame]
// Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "bsdiff/suffix_array_index.h"
#include <limits>
#include <vector>
#include <divsufsort.h>
#include <divsufsort64.h>
#include "bsdiff/logging.h"
namespace {
// libdivsufsort C++ overloaded functions used to allow calling the right
// implementation based on the pointer size.
int CallDivSufSort(const uint8_t* text, saidx_t* sa, size_t n) {
return divsufsort(text, sa, n);
}
int CallDivSufSort(const uint8_t* text, saidx64_t* sa, size_t n) {
return divsufsort64(text, sa, n);
}
saidx_t CallSaSearch(const uint8_t* text,
size_t text_size,
const uint8_t* pattern,
size_t pattern_size,
const saidx_t* sa,
size_t sa_size,
saidx_t* left) {
return sa_search(text, text_size, pattern, pattern_size, sa, sa_size, left);
}
saidx64_t CallSaSearch(const uint8_t* text,
size_t text_size,
const uint8_t* pattern,
size_t pattern_size,
const saidx64_t* sa,
size_t sa_size,
saidx64_t* left) {
return sa_search64(text, text_size, pattern, pattern_size, sa, sa_size, left);
}
} // namespace
namespace bsdiff {
// The SAIDX template type must be either saidx_t or saidx64_t, which will
// depend on the maximum size of the input text needed.
template <typename SAIDX>
class SuffixArrayIndex : public SuffixArrayIndexInterface {
public:
SuffixArrayIndex() = default;
// Initialize and construct the suffix array of the |text| string of length
// |n|. The memory pointed by |text| must be kept alive. Returns whether the
// construction succeeded.
bool Init(const uint8_t* text, size_t n);
// SuffixArrayIndexInterface overrides.
void SearchPrefix(const uint8_t* target,
size_t length,
size_t* out_length,
uint64_t* out_pos) const override;
private:
const uint8_t* text_{nullptr}; // Owned by the caller.
size_t n_{0};
std::vector<SAIDX> sa_;
};
template <typename SAIDX>
bool SuffixArrayIndex<SAIDX>::Init(const uint8_t* text, size_t n) {
if (!sa_.empty()) {
// Already initialized.
LOG(ERROR) << "SuffixArray already initialized";
return false;
}
if (static_cast<uint64_t>(n) >
static_cast<uint64_t>(std::numeric_limits<SAIDX>::max())) {
LOG(ERROR) << "Input too big (" << n << ") for this implementation";
return false;
}
text_ = text;
n_ = n;
sa_.resize(n + 1);
if (n > 0 && CallDivSufSort(text_, sa_.data(), n) != 0) {
LOG(ERROR) << "divsufsrot() failed";
return false;
}
return true;
}
template <typename SAIDX>
void SuffixArrayIndex<SAIDX>::SearchPrefix(const uint8_t* target,
size_t length,
size_t* out_length,
uint64_t* out_pos) const {
SAIDX suf_left;
SAIDX count =
CallSaSearch(text_, n_, target, length, sa_.data(), n_, &suf_left);
if (count > 0) {
// This is the simple case where we found the whole |target| string was
// found.
*out_pos = sa_[suf_left];
*out_length = length;
return;
}
// In this case, |suf_left| points to the first suffix array position such
// that the suffix at that position is lexicographically larger than |target|.
// We only need to check whether the previous entry or the current entry is a
// longer match.
size_t prev_suffix_len = 0;
if (suf_left > 0) {
const size_t prev_max_len =
std::min(n_ - static_cast<size_t>(sa_[suf_left - 1]), length);
const uint8_t* prev_suffix = text_ + sa_[suf_left - 1];
prev_suffix_len =
std::mismatch(target, target + prev_max_len, prev_suffix).first -
target;
}
size_t next_suffix_len = 0;
if (static_cast<size_t>(suf_left) < n_) {
const uint8_t* next_suffix = text_ + sa_[suf_left];
const size_t next_max_len =
std::min(n_ - static_cast<size_t>(sa_[suf_left]), length);
next_suffix_len =
std::mismatch(target, target + next_max_len, next_suffix).first -
target;
}
*out_length = std::max(next_suffix_len, prev_suffix_len);
if (!*out_length) {
*out_pos = 0;
} else if (next_suffix_len >= prev_suffix_len) {
*out_pos = sa_[suf_left];
} else {
*out_pos = sa_[suf_left - 1];
}
}
std::unique_ptr<SuffixArrayIndexInterface> CreateSuffixArrayIndex(
const uint8_t* text,
size_t n) {
// The maximum supported size when using the suffix array based on the 32-bit
// saidx_t type. We limit this to something a bit smaller (16 bytes smaller)
// than the maximum representable number so references like "n + 1" are don't
// overflow.
const size_t kMaxSaidxSize = std::numeric_limits<saidx_t>::max() - 16;
std::unique_ptr<SuffixArrayIndexInterface> ret;
if (n > kMaxSaidxSize) {
SuffixArrayIndex<saidx64_t>* sa_ptr = new SuffixArrayIndex<saidx64_t>();
ret.reset(sa_ptr);
if (!sa_ptr->Init(text, n))
return nullptr;
} else {
SuffixArrayIndex<saidx_t>* sa_ptr = new SuffixArrayIndex<saidx_t>();
ret.reset(sa_ptr);
if (!sa_ptr->Init(text, n))
return nullptr;
}
return ret;
}
} // namespace bsdiff