kythe/typescript/utf8.ts - platform/external/kythe - Git at Google

 /*
  * Copyright 2017 The Kythe Authors. All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 /**
  * This module defines an "OffsetTable", which maps UTF-16 offsets (used by
  * TypeScript) to byte offsets (used by Kythe).
  *
  * This module assumes inputs are already valid UTF-8.
  */

 /**
  * Scanner scans a UTF-8 Buffer across Unicode codepoints one by one.  It
  * exposes a public offset member to see the current (byte) offset.
  */
 class Scanner {
   constructor(private buf: Buffer, public ofs = 0) {}

   /** Scans forward one codepoint and returns the delta in UTF-16 offset. */
   scan(): number {
     const byte = this.buf[this.ofs++];

     // This code interprets the bytes following the bit patterns found in
     //   https://en.wikipedia.org/wiki/UTF-8#Description
     // to scan forward by one code point.

     if ((byte & 0b10000000) === 0) {
       // Common case: ASCII.
       return 1;
     }

     if ((byte & 0b11100000) === 0b11000000) {
       this.ofs += 1;
       return 1;
     }

     if ((byte & 0b11110000) === 0b11100000) {
       this.ofs += 2;
       return 1;
     }

     if ((byte & 0b11111000) === 0b11110000) {
       this.ofs += 3;
       // A surrogate pair is length 2 in Node.js.
       return 2;
     }

     throw new Error(`unhandled UTF-8 byte 0x${byte.toString(16)}`);
   }
 }

 /**
  * OffsetTable caches a UTF-16 -> UTF-8 offset mapping.
  */
 export class OffsetTable {
   /**
    * Holds [utf16 offset, byte offset] pairs, with each entry at least spanSize
    * after the one before.
    *
    * Hypothetically if spanSize was 1, then the table would hold
    *   [0, 0]
    *   [1, byte offset of first character]
    *   [2, byte offset of second character]
    * and so on.
    *
    * When spanSize is greater than 1, we skip intermediate entries,
    * so the first entry after zero is [spanSize, ...] and the second is
    * [spanSize*2, ...].
    *
    * There is an exception when the input UTF-16 offset is a surrogate pair.
    * Assume the UTF-16 offset is x. Since the length of a surrogate pair in
    * Node.js in 2, x+1 is still within the surrogate pair so we will skip x+1.
    * If x+1 is spanSize*n use x+2 as the start of the span instead.
    *
    * To look up the byte offset of an input UTF-16 offset, we find the first
    * span occurring before the queried offset (which we can compute using simple
    * math using spanSize) and then repeat the scan forwards to find the offset
    * within the span.
    *
    * A larger spanSize saves memory (fewer entries in the table) at the cost
    * of more CPU (need to do more scanning to find an offset).  In practice
    * it doesn't really matter that much because our input files are pretty
    * small, but the table is at least nice to prevent an O(1) scan from the
    * beginning of the file for every requested offset.
    */
   offsets: Array<[number, number]> = [];

   constructor(public buf: Buffer, private spanSize = 128) {
     this.build(buf);
   }

   private build(bytes: Buffer) {
     this.offsets = [];
     const scanner = new Scanner(bytes);
     let ofs = 0;
     let lastEntry = 0;
     this.offsets.push([ofs, scanner.ofs]);
     while (scanner.ofs < bytes.length) {
       ofs += scanner.scan();
       if (ofs - lastEntry >= this.spanSize) {
         this.offsets.push([ofs, scanner.ofs]);
         lastEntry += this.spanSize;
       }
     }
   }

   lookup(findOfs: number): number {
     const offset = this.offsets[Math.floor(findOfs / this.spanSize)];
     let u16 = offset[0];
     const byte = offset[1];
     // u16 and byte is a pair of UTF-16 and UTF-8 offsets. Scan forward to find
     // the offset to lookup for.
     const scanner = new Scanner(this.buf, byte);
     while (u16 < findOfs) {
       u16 += scanner.scan();
     }
     // We are scanning UTF-16 offsets one by one. If it skips findOfs it means
     // that the findOfs is in the middle of a surrogate pair, which is invalid
     // to lookup for.
     if (u16 > findOfs) {
       throw new Error(`The lookup offset is invalid`);
     }
     return scanner.ofs;
   }
 }
	/*
	* Copyright 2017 The Kythe Authors. All rights reserved.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/**
	* This module defines an "OffsetTable", which maps UTF-16 offsets (used by
	* TypeScript) to byte offsets (used by Kythe).
	*
	* This module assumes inputs are already valid UTF-8.
	*/

	/**
	* Scanner scans a UTF-8 Buffer across Unicode codepoints one by one. It
	* exposes a public offset member to see the current (byte) offset.
	*/
	class Scanner {
	constructor(private buf: Buffer, public ofs = 0) {}

	/** Scans forward one codepoint and returns the delta in UTF-16 offset. */
	scan(): number {
	const byte = this.buf[this.ofs++];

	// This code interprets the bytes following the bit patterns found in
	// https://en.wikipedia.org/wiki/UTF-8#Description
	// to scan forward by one code point.

	if ((byte & 0b10000000) === 0) {
	// Common case: ASCII.
	return 1;
	}

	if ((byte & 0b11100000) === 0b11000000) {
	this.ofs += 1;
	return 1;
	}

	if ((byte & 0b11110000) === 0b11100000) {
	this.ofs += 2;
	return 1;
	}

	if ((byte & 0b11111000) === 0b11110000) {
	this.ofs += 3;
	// A surrogate pair is length 2 in Node.js.
	return 2;
	}

	throw new Error(`unhandled UTF-8 byte 0x${byte.toString(16)}`);
	}
	}

	/**
	* OffsetTable caches a UTF-16 -> UTF-8 offset mapping.
	*/
	export class OffsetTable {
	/**
	* Holds [utf16 offset, byte offset] pairs, with each entry at least spanSize
	* after the one before.
	*
	* Hypothetically if spanSize was 1, then the table would hold
	* [0, 0]
	* [1, byte offset of first character]
	* [2, byte offset of second character]
	* and so on.
	*
	* When spanSize is greater than 1, we skip intermediate entries,
	* so the first entry after zero is [spanSize, ...] and the second is
	* [spanSize*2, ...].
	*
	* There is an exception when the input UTF-16 offset is a surrogate pair.
	* Assume the UTF-16 offset is x. Since the length of a surrogate pair in
	* Node.js in 2, x+1 is still within the surrogate pair so we will skip x+1.
	* If x+1 is spanSize*n use x+2 as the start of the span instead.
	*
	* To look up the byte offset of an input UTF-16 offset, we find the first
	* span occurring before the queried offset (which we can compute using simple
	* math using spanSize) and then repeat the scan forwards to find the offset
	* within the span.
	*
	* A larger spanSize saves memory (fewer entries in the table) at the cost
	* of more CPU (need to do more scanning to find an offset). In practice
	* it doesn't really matter that much because our input files are pretty
	* small, but the table is at least nice to prevent an O(1) scan from the
	* beginning of the file for every requested offset.
	*/
	offsets: Array<[number, number]> = [];

	constructor(public buf: Buffer, private spanSize = 128) {
	this.build(buf);
	}

	private build(bytes: Buffer) {
	this.offsets = [];
	const scanner = new Scanner(bytes);
	let ofs = 0;
	let lastEntry = 0;
	this.offsets.push([ofs, scanner.ofs]);
	while (scanner.ofs < bytes.length) {
	ofs += scanner.scan();
	if (ofs - lastEntry >= this.spanSize) {
	this.offsets.push([ofs, scanner.ofs]);
	lastEntry += this.spanSize;
	}
	}
	}

	lookup(findOfs: number): number {
	const offset = this.offsets[Math.floor(findOfs / this.spanSize)];
	let u16 = offset[0];
	const byte = offset[1];
	// u16 and byte is a pair of UTF-16 and UTF-8 offsets. Scan forward to find
	// the offset to lookup for.
	const scanner = new Scanner(this.buf, byte);
	while (u16 < findOfs) {
	u16 += scanner.scan();
	}
	// We are scanning UTF-16 offsets one by one. If it skips findOfs it means
	// that the findOfs is in the middle of a surrogate pair, which is invalid
	// to lookup for.
	if (u16 > findOfs) {
	throw new Error(`The lookup offset is invalid`);
	}
	return scanner.ofs;
	}
	}