binary/float16.go - platform/tools/gpu - Git at Google

 // Copyright (C) 2014 The Android Open Source Project
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 package binary

 import "unsafe"

 // Float16 represents a 16-bit floating point number, containing a single sign bit, 5 exponent bits
 // and 10 fractional bits. This corresponds to IEEE 754-2008 binary16 (or half precision float) type.
 //
 //    MSB                                                                         LSB
 //   ╔════╦════╤════╤════╤════╤════╦════╤════╤════╤════╤════╤════╤════╤════╤════╤════╗
 //   ║Sign║ E₄ │ E₃ │ E₂ │ E₁ │ E₀ ║ F₉ │ F₈ │ F₇ │ F₆ │ F₅ │ F₄ │ F₃ │ F₂ │ F₁ │ F₀ ║
 //   ╚════╩════╧════╧════╧════╧════╩════╧════╧════╧════╧════╧════╧════╧════╧════╧════╝
 //   Where E is the exponent bits and F is the fractional bits.
 type Float16 uint16

 const (
 	float16ExpMask  Float16 = 0x7c00
 	float16ExpBias  uint32  = 0xf
 	float16ExpShift uint32  = 10
 	float16FracMask Float16 = 0x03ff
 	float16SignMask Float16 = 0x8000
 	float32ExpMask  uint32  = 0x7f800000
 	float32ExpBias  uint32  = 0x7f
 	float32ExpShift uint32  = 23
 	float32FracMask uint32  = 0x007fffff
 )

 // Float32 returns the Float16 value expanded to a float32. Infinities and NaNs are expanded as
 // such.
 func (f Float16) Float32() float32 {
 	u32 := expandF16ToF32(f)
 	ptr := unsafe.Pointer(&u32)
 	f32 := *(*float32)(ptr)
 	return f32
 }

 // IsNaN reports whether f is an “not-a-number” value.
 func (f Float16) IsNaN() bool { return (f&float16ExpMask == float16ExpMask) && (f&float16FracMask != 0) }

 // IsInf reports whether f is an infinity, according to sign. If sign > 0, IsInf reports whether
 // f is positive infinity. If sign < 0, IsInf reports whether f is negative infinity. If sign ==
 // 0, IsInf reports whether f is either infinity.
 func (f Float16) IsInf(sign int) bool {
 	return ((f == float16ExpMask) && sign >= 0) ||
 		(f == (float16SignMask|float16ExpMask) && sign <= 0)
 }

 // Float16NaN returns an “not-a-number” value.
 func NewFloat16NaN() Float16 { return float16ExpMask | float16FracMask }

 // Float16Inf returns positive infinity if sign >= 0, negative infinity if sign < 0.
 func NewFloat16Inf(sign int) Float16 {
 	if sign >= 0 {
 		return float16ExpMask
 	} else {
 		return float16SignMask | float16ExpMask
 	}
 }

 // NewFloat16 returns a Float16 encoding of a 32-bit floating point number. Infinities and NaNs
 // are encoded as such. Very large and very small numbers get rounded to infinity and zero
 // respectively.
 func NewFloat16(f32 float32) Float16 {
 	ptr := unsafe.Pointer(&f32)
 	u32 := *(*uint32)(ptr)

 	sign := Float16(u32>>16) & float16SignMask
 	exp := (u32 & float32ExpMask) >> float32ExpShift
 	frac := u32 & 0x7fffff

 	if exp == 0xff {
 		// NaN or Infinity
 		if frac != 0 { // NaN
 			frac = 0x3f
 		}

 		return sign | float16ExpMask | Float16(frac)
 	}

 	if exp+float16ExpBias <= float32ExpBias {
 		// Exponent is too small to represent in a Float16 (or a zero). We need to output
 		// denormalized numbers (possibly rounding very small numbers to zero).
 		denorm := float32ExpBias - exp - 1
 		frac += 1 << float32ExpShift
 		frac >>= denorm
 		return sign | Float16(frac)
 	}

 	if exp > float32ExpBias+float16ExpBias {
 		// Number too large to represent in a Float16 => round to Infinity.
 		return sign | float16ExpMask
 	}

 	// General case.
 	return sign | Float16(((exp+float16ExpBias-float32ExpBias)<<float16ExpShift)|(frac>>13))
 }

 func expandF16ToF32(in Float16) uint32 {
 	sign := uint32(in&float16SignMask) << 16
 	frac := uint32(in&float16FracMask) << 13
 	exp := uint32(in&float16ExpMask) >> float16ExpShift

 	if exp == 0x1f {
 		// NaN of Infinity
 		return sign | float32ExpMask | frac
 	}

 	if exp == 0 {
 		if frac == 0 {
 			// Zero
 			return sign
 		}
 		// Denormalized number. In a float32 it must be stored in a normalized form, so
 		// we normalize it.
 		exp++
 		for frac&float32ExpMask == 0 {
 			frac <<= 1
 			exp--
 		}
 		frac &= float32FracMask
 	}

 	exp += (float32ExpBias - float16ExpBias)

 	return sign | (exp << float32ExpShift) | frac
 }
	// Copyright (C) 2014 The Android Open Source Project
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	package binary

	import "unsafe"

	// Float16 represents a 16-bit floating point number, containing a single sign bit, 5 exponent bits
	// and 10 fractional bits. This corresponds to IEEE 754-2008 binary16 (or half precision float) type.
	//
	// MSB LSB
	// ╔════╦════╤════╤════╤════╤════╦════╤════╤════╤════╤════╤════╤════╤════╤════╤════╗
	// ║Sign║ E₄ │ E₃ │ E₂ │ E₁ │ E₀ ║ F₉ │ F₈ │ F₇ │ F₆ │ F₅ │ F₄ │ F₃ │ F₂ │ F₁ │ F₀ ║
	// ╚════╩════╧════╧════╧════╧════╩════╧════╧════╧════╧════╧════╧════╧════╧════╧════╝
	// Where E is the exponent bits and F is the fractional bits.
	type Float16 uint16

	const (
	float16ExpMask Float16 = 0x7c00
	float16ExpBias uint32 = 0xf
	float16ExpShift uint32 = 10
	float16FracMask Float16 = 0x03ff
	float16SignMask Float16 = 0x8000
	float32ExpMask uint32 = 0x7f800000
	float32ExpBias uint32 = 0x7f
	float32ExpShift uint32 = 23
	float32FracMask uint32 = 0x007fffff
	)

	// Float32 returns the Float16 value expanded to a float32. Infinities and NaNs are expanded as
	// such.
	func (f Float16) Float32() float32 {
	u32 := expandF16ToF32(f)
	ptr := unsafe.Pointer(&u32)
	f32 := (float32)(ptr)
	return f32
	}

	// IsNaN reports whether f is an “not-a-number” value.
	func (f Float16) IsNaN() bool { return (f&float16ExpMask == float16ExpMask) && (f&float16FracMask != 0) }

	// IsInf reports whether f is an infinity, according to sign. If sign > 0, IsInf reports whether
	// f is positive infinity. If sign < 0, IsInf reports whether f is negative infinity. If sign ==
	// 0, IsInf reports whether f is either infinity.
	func (f Float16) IsInf(sign int) bool {
	return ((f == float16ExpMask) && sign >= 0) \|\|
	(f == (float16SignMask\|float16ExpMask) && sign <= 0)
	}

	// Float16NaN returns an “not-a-number” value.
	func NewFloat16NaN() Float16 { return float16ExpMask \| float16FracMask }

	// Float16Inf returns positive infinity if sign >= 0, negative infinity if sign < 0.
	func NewFloat16Inf(sign int) Float16 {
	if sign >= 0 {
	return float16ExpMask
	} else {
	return float16SignMask \| float16ExpMask
	}
	}

	// NewFloat16 returns a Float16 encoding of a 32-bit floating point number. Infinities and NaNs
	// are encoded as such. Very large and very small numbers get rounded to infinity and zero
	// respectively.
	func NewFloat16(f32 float32) Float16 {
	ptr := unsafe.Pointer(&f32)
	u32 := (uint32)(ptr)

	sign := Float16(u32>>16) & float16SignMask
	exp := (u32 & float32ExpMask) >> float32ExpShift
	frac := u32 & 0x7fffff

	if exp == 0xff {
	// NaN or Infinity
	if frac != 0 { // NaN
	frac = 0x3f
	}

	return sign \| float16ExpMask \| Float16(frac)
	}

	if exp+float16ExpBias <= float32ExpBias {
	// Exponent is too small to represent in a Float16 (or a zero). We need to output
	// denormalized numbers (possibly rounding very small numbers to zero).
	denorm := float32ExpBias - exp - 1
	frac += 1 << float32ExpShift
	frac >>= denorm
	return sign \| Float16(frac)
	}

	if exp > float32ExpBias+float16ExpBias {
	// Number too large to represent in a Float16 => round to Infinity.
	return sign \| float16ExpMask
	}

	// General case.
	return sign \| Float16(((exp+float16ExpBias-float32ExpBias)<<float16ExpShift)\|(frac>>13))
	}

	func expandF16ToF32(in Float16) uint32 {
	sign := uint32(in&float16SignMask) << 16
	frac := uint32(in&float16FracMask) << 13
	exp := uint32(in&float16ExpMask) >> float16ExpShift

	if exp == 0x1f {
	// NaN of Infinity
	return sign \| float32ExpMask \| frac
	}

	if exp == 0 {
	if frac == 0 {
	// Zero
	return sign
	}
	// Denormalized number. In a float32 it must be stored in a normalized form, so
	// we normalize it.
	exp++
	for frac&float32ExpMask == 0 {
	frac <<= 1
	exp--
	}
	frac &= float32FracMask
	}

	exp += (float32ExpBias - float16ExpBias)

	return sign \| (exp << float32ExpShift) \| frac
	}