blob: f9803c851460185141a6eb615160bf282e0adb51 [file] [log] [blame]
/*
* Copyright 2017 The Kythe Authors. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
syntax = "proto3";
package kythe.storage;
option go_package = "entryset_go_proto";
// An EntrySet represents a compact collection of entries.
// Compaction is achieved by encoding node names and string-valued data as
// table offsets, and by sorting and prefix-encoding all the strings.
//
// Even without more sophisticated compression this provides a fairly large
// savings over fully-separate entries, even when encoded with protobuf
// wire-format overhead.
//
// The format is somewhat expensive to construct, but not asymptotically bad,
// and decoding is both simpler and less memory-intensive than encoding.
message EntrySet {
// The order of these fields reflects the standard ordering for vnames.
// Each of the fields is an index into the symbol table.
message Node {
int32 corpus = 1;
int32 language = 2;
int32 path = 3;
int32 root = 4;
int32 signature = 5;
}
// TODO(fromberger): The standard ordering is actually defined in terms of
// tickets rather than vnames, from back when entries carried tickets. This
// definition uses the same field ordering so as long as the ticket is in
// canonical form I think it should be equivalent.
// One entry for each unique node named in the entry set. The index of a
// node in this field is its id.
//
// If the nodes are stored in canonical vname order, the EntrySet is also
// said to be in canonical order. However, an EntrySet is valid whether or
// not this applies.
repeated Node nodes = 1;
message Fact {
int32 name = 1; // symbol
int32 value = 2; // symbol
}
message FactGroup {
repeated Fact facts = 1;
}
// One entry for each node in the entry set. The index of a group in this
// field matches the id of its corresponding node.
repeated FactGroup fact_groups = 2;
message Edge {
int32 kind = 1; // symbol
int32 target = 2; // node
}
message EdgeGroup {
repeated Edge edges = 1;
}
// One entry for each node in the entry set. The index of a group in this
// field matches the id of its corresponding source node.
repeated EdgeGroup edge_groups = 3;
message String {
int32 prefix = 1; // length of common prefix with predecessor (expanded)
bytes suffix = 2; // the unshared suffix
}
// A prefix-coded table of all the symbols referenced by the messages above.
// The entries in this field are lexicographically ordered. The string table
// always implicitly contains the empty string as its first entry, but it is
// not represented explicitly in the message.
repeated String symbols = 4;
}
// TODO(fromberger): Some additional tricks to consider.
//
// Static dictionaries for common schema types (fact names, edge kinds). This
// makes ordering harder, though, and doesn't seem to save much, though
// amortized over many edge sets it might help more.
//
// Separate blobs for large values, e.g., files. This also makes ordering a
// little harder, and large blobs are rarely repeated.