kythe/proto/entryset.proto - platform/external/kythe - Git at Google

 /*
  * Copyright 2017 The Kythe Authors. All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 syntax = "proto3";

 package kythe.storage;

 option go_package = "entryset_go_proto";

 // An EntrySet represents a compact collection of entries.
 // Compaction is achieved by encoding node names and string-valued data as
 // table offsets, and by sorting and prefix-encoding all the strings.
 //
 // Even without more sophisticated compression this provides a fairly large
 // savings over fully-separate entries, even when encoded with protobuf
 // wire-format overhead.
 //
 // The format is somewhat expensive to construct, but not asymptotically bad,
 // and decoding is both simpler and less memory-intensive than encoding.
 message EntrySet {
   // The order of these fields reflects the standard ordering for vnames.
   // Each of the fields is an index into the symbol table.
   message Node {
     int32 corpus = 1;
     int32 language = 2;
     int32 path = 3;
     int32 root = 4;
     int32 signature = 5;
   }

   // TODO(fromberger): The standard ordering is actually defined in terms of
   // tickets rather than vnames, from back when entries carried tickets.  This
   // definition uses the same field ordering so as long as the ticket is in
   // canonical form I think it should be equivalent.

   // One entry for each unique node named in the entry set.  The index of a
   // node in this field is its id.
   //
   // If the nodes are stored in canonical vname order, the EntrySet is also
   // said to be in canonical order. However, an EntrySet is valid whether or
   // not this applies.
   repeated Node nodes = 1;

   message Fact {
     int32 name = 1;   // symbol
     int32 value = 2;  // symbol
   }
   message FactGroup {
     repeated Fact facts = 1;
   }

   // One entry for each node in the entry set. The index of a group in this
   // field matches the id of its corresponding node.
   repeated FactGroup fact_groups = 2;

   message Edge {
     int32 kind = 1;    // symbol
     int32 target = 2;  // node
   }
   message EdgeGroup {
     repeated Edge edges = 1;
   }

   // One entry for each node in the entry set. The index of a group in this
   // field matches the id of its corresponding source node.
   repeated EdgeGroup edge_groups = 3;

   message String {
     int32 prefix = 1;  // length of common prefix with predecessor (expanded)
     bytes suffix = 2;  // the unshared suffix
   }

   // A prefix-coded table of all the symbols referenced by the messages above.
   // The entries in this field are lexicographically ordered.  The string table
   // always implicitly contains the empty string as its first entry, but it is
   // not represented explicitly in the message.
   repeated String symbols = 4;
 }

 // TODO(fromberger): Some additional tricks to consider.
 //
 // Static dictionaries for common schema types (fact names, edge kinds). This
 // makes ordering harder, though, and doesn't seem to save much, though
 // amortized over many edge sets it might help more.
 //
 // Separate blobs for large values, e.g., files. This also makes ordering a
 // little harder, and large blobs are rarely repeated.
	/*
	* Copyright 2017 The Kythe Authors. All rights reserved.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	syntax = "proto3";

	package kythe.storage;

	option go_package = "entryset_go_proto";

	// An EntrySet represents a compact collection of entries.
	// Compaction is achieved by encoding node names and string-valued data as
	// table offsets, and by sorting and prefix-encoding all the strings.
	//
	// Even without more sophisticated compression this provides a fairly large
	// savings over fully-separate entries, even when encoded with protobuf
	// wire-format overhead.
	//
	// The format is somewhat expensive to construct, but not asymptotically bad,
	// and decoding is both simpler and less memory-intensive than encoding.
	message EntrySet {
	// The order of these fields reflects the standard ordering for vnames.
	// Each of the fields is an index into the symbol table.
	message Node {
	int32 corpus = 1;
	int32 language = 2;
	int32 path = 3;
	int32 root = 4;
	int32 signature = 5;
	}

	// TODO(fromberger): The standard ordering is actually defined in terms of
	// tickets rather than vnames, from back when entries carried tickets. This
	// definition uses the same field ordering so as long as the ticket is in
	// canonical form I think it should be equivalent.

	// One entry for each unique node named in the entry set. The index of a
	// node in this field is its id.
	//
	// If the nodes are stored in canonical vname order, the EntrySet is also
	// said to be in canonical order. However, an EntrySet is valid whether or
	// not this applies.
	repeated Node nodes = 1;

	message Fact {
	int32 name = 1; // symbol
	int32 value = 2; // symbol
	}
	message FactGroup {
	repeated Fact facts = 1;
	}

	// One entry for each node in the entry set. The index of a group in this
	// field matches the id of its corresponding node.
	repeated FactGroup fact_groups = 2;

	message Edge {
	int32 kind = 1; // symbol
	int32 target = 2; // node
	}
	message EdgeGroup {
	repeated Edge edges = 1;
	}

	// One entry for each node in the entry set. The index of a group in this
	// field matches the id of its corresponding source node.
	repeated EdgeGroup edge_groups = 3;

	message String {
	int32 prefix = 1; // length of common prefix with predecessor (expanded)
	bytes suffix = 2; // the unshared suffix
	}

	// A prefix-coded table of all the symbols referenced by the messages above.
	// The entries in this field are lexicographically ordered. The string table
	// always implicitly contains the empty string as its first entry, but it is
	// not represented explicitly in the message.
	repeated String symbols = 4;
	}

	// TODO(fromberger): Some additional tricks to consider.
	//
	// Static dictionaries for common schema types (fact names, edge kinds). This
	// makes ordering harder, though, and doesn't seem to save much, though
	// amortized over many edge sets it might help more.
	//
	// Separate blobs for large values, e.g., files. This also makes ordering a
	// little harder, and large blobs are rarely repeated.