Source code for gfloat.round

# Copyright (c) 2024 Graphcore Ltd. All rights reserved.

import math

import numpy as np

from .types import FormatInfo, RoundMode


def _isodd(v: int) -> bool:
    return v & 0x1 == 1


def round_float(
    fi: FormatInfo, v: float, rnd: RoundMode = RoundMode.TiesToEven, sat: bool = False
) -> float:
    """
    Round input to the given :py:class:`FormatInfo`, given rounding mode and
    saturation flag

    An input NaN will convert to a NaN in the target.
    An input Infinity will convert to the largest-magnitude float of the same
    sign if :paramref:`sat`, otherwise to an Inf, if present, otherwise to a NaN.
    Negative zero will be returned if the format has negative zero, otherwise
    zero.

    Nonzero inputs smaller in magnitude than half the smallest subnormal round
    to zero, except under the directed modes that move away from zero
    (`TowardPositive` for positive inputs, `TowardNegative` for negative
    inputs), where they round to the smallest subnormal of the same sign.

    Args:
      fi (FormatInfo): Describes the target format
      v (float): Input value to be rounded
      rnd (RoundMode): Rounding mode to use
      sat (bool): Saturation flag: if True, round overflowed values to `fi.max`
        (or to `fi.min` for negative values)

    Returns:
      A float which is one of the values in the format.

    Raises:
       ValueError: The target format cannot represent the input
             (e.g. converting a `NaN`, or an `Inf` when the target has no
             `NaN` or `Inf`, and :paramref:`sat` is false)
    """

    # Constants
    p = fi.precision
    bias = fi.expBias
    t = p - 1

    if np.isnan(v):
        if fi.num_nans == 0:
            raise ValueError(f"No NaN in format {fi}")

        # Note that this does not preserve the NaN payload
        return np.nan

    # Extract sign
    sign = np.signbit(v) and fi.is_signed
    vpos = -v if sign else v

    # True when a directed rounding mode pushes an inexact magnitude upward,
    # i.e. away from zero for this input's sign.
    away_from_zero = (rnd == RoundMode.TowardPositive and not sign) or (
        rnd == RoundMode.TowardNegative and sign
    )

    if np.isinf(vpos):
        result = np.inf

    elif vpos == 0:
        # Handle zero explicitly: frexp(0) yields a zero significand, which
        # would violate the normalisation assumed by the general path below.
        result = 0.0

    elif fi.has_subnormals and vpos < fi.smallest_subnormal / 2:
        # Test against smallest_subnormal to avoid subnormals in frexp below
        # Note that this restricts us to types narrower than float64
        if vpos > 0 and away_from_zero:
            # Directed rounding away from zero never collapses a nonzero
            # value to zero; the nearest candidate is the smallest subnormal.
            result = fi.smallest_subnormal
        else:
            result = 0.0

    else:
        # Extract significand (mantissa) and exponent
        fsignificand, expval = np.frexp(vpos)

        assert fsignificand >= 0.5 and fsignificand < 1.0
        # Bring significand into range [1.0, 2.0)
        fsignificand *= 2
        expval -= 1

        # Effective precision, accounting for right shift for subnormal values
        biased_exp = expval + bias
        if fi.has_subnormals:
            effective_precision = t + min(biased_exp - 1, 0)
        else:
            effective_precision = t

        # Lift to "integer * 2^e"
        fsignificand *= 2.0**effective_precision
        expval -= effective_precision

        # Round
        isignificand = math.floor(fsignificand)
        if isignificand != fsignificand:
            # Need to round
            if rnd == RoundMode.TowardZero:
                pass
            elif rnd == RoundMode.TowardPositive or rnd == RoundMode.TowardNegative:
                if away_from_zero:
                    isignificand += 1
            else:
                # Round to nearest
                d = fsignificand - isignificand
                if d > 0.5:
                    isignificand += 1
                elif d == 0.5:
                    # Tie
                    if rnd == RoundMode.TiesToAway:
                        isignificand += 1
                    else:
                        # All other modes tie to even
                        if fi.precision == 1:
                            # No significand bits, so parity is decided by the
                            # exponent field instead of the significand
                            assert (isignificand == 1) or (isignificand == 0)
                            if _isodd(biased_exp):
                                expval += 1
                        else:
                            if _isodd(isignificand):
                                isignificand += 1

        result = isignificand * (2.0**expval)

    if result == 0:
        if sign and fi.has_nz:
            return -0.0
        else:
            return 0.0

    # Overflow: compare and saturate against the largest magnitude available
    # for this sign, so negative values clamp to fi.min rather than -fi.max.
    amax = -fi.min if sign else fi.max
    if result > amax:
        if sat:
            result = amax
        else:
            if fi.has_infs:
                result = np.inf
            elif fi.num_nans > 0:
                result = np.nan
            else:
                raise ValueError(f"No Infs or NaNs in format {fi}, and sat=False")

    # Set sign
    if sign:
        result = -result

    return result
def encode_float(fi: FormatInfo, v: float) -> int:
    """
    Encode input to the given :py:class:`FormatInfo`.

    Will round toward zero if :paramref:`v` is not in the value set.
    Will saturate to `Inf`, `NaN`, `fi.max` in order of precedence.
    Encode -0 to 0 if not `fi.has_nz`

    For other roundings and saturations, call :func:`round_float` first.

    Args:
      fi (FormatInfo): Describes the target format
      v (float): The value to be encoded.

    Returns:
      The integer code point, as a Python ``int``
    """

    # Format Constants
    k = fi.bits
    p = fi.precision
    t = p - 1

    # Encode
    if np.isnan(v):
        return fi.code_of_nan

    # Overflow/underflow
    if v > fi.max:
        if fi.has_infs:
            return fi.code_of_posinf
        if fi.num_nans > 0:
            return fi.code_of_nan
        return fi.code_of_max

    if v < fi.min:
        if fi.has_infs:
            return fi.code_of_neginf
        if fi.num_nans > 0:
            return fi.code_of_nan
        return fi.code_of_min

    # Finite values
    # Convert to a Python bool so the packing below is done in Python ints
    sign = bool(fi.is_signed and np.signbit(v))
    vpos = -v if sign else v

    if fi.has_subnormals and vpos <= fi.smallest_subnormal / 2:
        isig = 0
        biased_exp = 0
    else:
        assert fi.bits < 64  # TODO: check implementation if fi is binary64
        sig, exp = np.frexp(vpos)
        # np.frexp returns a fixed-width numpy integer exponent; use a Python
        # int so later shifts cannot wrap and the return type is a plain int
        exp = int(exp)
        # sig in range [0.5, 1)
        sig *= 2
        exp -= 1
        # now sig in range [1, 2)

        biased_exp = exp + fi.expBias
        if biased_exp < 1 and fi.has_subnormals:
            # subnormal
            sig *= 2.0 ** (biased_exp - 1)
            biased_exp = 0
            assert vpos == sig * 2 ** (1 - fi.expBias)
        else:
            # Drop the implicit leading one of a normal significand
            if sig > 0:
                sig -= 1.0

        # Truncate to t fractional bits (round toward zero)
        isig = math.floor(sig * 2**t)

    # Zero
    if isig == 0 and biased_exp == 0 and fi.has_zero:
        if sign and fi.has_nz:
            return fi.code_of_negzero
        else:
            return fi.code_of_zero

    # Nonzero
    assert isig < 2**t
    assert biased_exp < 2**fi.expBits or fi.is_twos_complement

    # Handle two's complement encoding
    if fi.is_twos_complement and sign:
        isig = (1 << t) - isig

    # Pack values into a single integer
    ival = (int(sign) << (k - 1)) | (int(biased_exp) << t) | (int(isig) << 0)

    return ival