/* Copyright (C) 2007 Free Software Foundation, Inc.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.
In addition to the permissions in the GNU General Public License, the
Free Software Foundation gives you unlimited permission to link the
compiled version of this file into combinations with other programs,
and to distribute those combinations without any restriction coming
from the use of this file. (The General Public License restrictions
do apply in other respects; for example, they cover modification of
the file, and distribution when not linked into a combine
executable.)
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING. If not, write to the Free
Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA. */
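/* This file implements the BID64/BID128 decimal floating-point addition
   and subtraction operations. The mixed-format wrappers (bid64dq_add,
   bid64qd_add, bid128dd_add, bid128dq_add, bid128qd_add, and the _sub
   variants) widen 64-bit operands to 128 bits with bid64_to_bid128 and
   delegate to bid64qqq_fma or to bid128_add/bid128_sub; bid128_add below
   does the real work. Depending on DECIMAL_CALL_BY_REFERENCE each entry
   point passes operands and results either by pointer or by value, e.g.
   (sketch only; the rounding-mode and status arguments are assembled by
   the _RND_MODE_* and _EXC_* macros):
     bid128_add (&res, &x, &y ...);   // call by reference
     res = bid128_add (x, y ...);     // call by value  */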
#include "bid_internal.h"
#if DECIMAL_CALL_BY_REFERENCE
void
bid64dq_add (UINT64 * pres, UINT64 * px, UINT128 * py
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
UINT64 x = *px;
#if !DECIMAL_GLOBAL_ROUNDING
unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT64
bid64dq_add (UINT64 x, UINT128 y
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
#endif
UINT64 res = 0xbaddbaddbaddbaddull;
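// (0xbaddbaddbaddbadd is just a recognizable "uninitialized" placeholder;
// every path below overwrites res before BID_RETURN)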
UINT128 x1;
#if DECIMAL_CALL_BY_REFERENCE
bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
bid64qq_add (&res, &x1, py
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#else
x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
res = bid64qq_add (x1, y
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#endif
BID_RETURN (res);
}
#if DECIMAL_CALL_BY_REFERENCE
void
bid64qd_add (UINT64 * pres, UINT128 * px, UINT64 * py
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
UINT64 y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT64
bid64qd_add (UINT128 x, UINT64 y
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
#endif
UINT64 res = 0xbaddbaddbaddbaddull;
UINT128 y1;
#if DECIMAL_CALL_BY_REFERENCE
bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
bid64qq_add (&res, px, &y1
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#else
y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
res = bid64qq_add (x, y1
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#endif
BID_RETURN (res);
}
#if DECIMAL_CALL_BY_REFERENCE
void
bid64qq_add (UINT64 * pres, UINT128 * px, UINT128 * py
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
UINT128 x = *px, y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT64
bid64qq_add (UINT128 x, UINT128 y
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
#endif
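// 'one' is +1 in BID128 format: coefficient 1 with biased exponent 6176
// (the decimal128 bias), i.e. true exponent 0; 6176 = 0x1820 shifted left
// 49 bits gives the 0x3040000000000000 pattern in the high word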
UINT128 one = { {0x0000000000000001ull, 0x3040000000000000ull}
};
UINT64 res = 0xbaddbaddbaddbaddull;
BID_SWAP128 (one);
#if DECIMAL_CALL_BY_REFERENCE
bid64qqq_fma (&res, &one, &x, &y
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#else
res = bid64qqq_fma (one, x, y
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#endif
BID_RETURN (res);
}
#if DECIMAL_CALL_BY_REFERENCE
void
bid128dd_add (UINT128 * pres, UINT64 * px, UINT64 * py
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
UINT64 x = *px, y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT128
bid128dd_add (UINT64 x, UINT64 y
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
#endif
UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
};
UINT128 x1, y1;
#if DECIMAL_CALL_BY_REFERENCE
bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
bid128_add (&res, &x1, &y1
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#else
x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
res = bid128_add (x1, y1
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#endif
BID_RETURN (res);
}
#if DECIMAL_CALL_BY_REFERENCE
void
bid128dq_add (UINT128 * pres, UINT64 * px, UINT128 * py
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
UINT64 x = *px;
#if !DECIMAL_GLOBAL_ROUNDING
unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT128
bid128dq_add (UINT64 x, UINT128 y
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
#endif
UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
};
UINT128 x1;
#if DECIMAL_CALL_BY_REFERENCE
bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
bid128_add (&res, &x1, py
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#else
x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
res = bid128_add (x1, y
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#endif
BID_RETURN (res);
}
#if DECIMAL_CALL_BY_REFERENCE
void
bid128qd_add (UINT128 * pres, UINT128 * px, UINT64 * py
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
UINT64 y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT128
bid128qd_add (UINT128 x, UINT64 y
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
#endif
UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
};
UINT128 y1;
#if DECIMAL_CALL_BY_REFERENCE
bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
bid128_add (&res, px, &y1
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#else
y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
res = bid128_add (x, y1
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#endif
BID_RETURN (res);
}
// bid128_add stands for bid128qq_add
/*****************************************************************************
* BID64/BID128 sub
****************************************************************************/
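// Subtraction is implemented as addition with the sign of the second
// operand flipped (x - y == x + (-y)); the sign bit is left untouched
// when y is NaN so that NaN payloads propagate unchanged (see
// bid64qq_sub below).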
#if DECIMAL_CALL_BY_REFERENCE
void
bid64dq_sub (UINT64 * pres, UINT64 * px, UINT128 * py
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
UINT64 x = *px;
#if !DECIMAL_GLOBAL_ROUNDING
unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT64
bid64dq_sub (UINT64 x, UINT128 y
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
#endif
UINT64 res = 0xbaddbaddbaddbaddull;
UINT128 x1;
#if DECIMAL_CALL_BY_REFERENCE
bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
bid64qq_sub (&res, &x1, py
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#else
x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
res = bid64qq_sub (x1, y
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#endif
BID_RETURN (res);
}
#if DECIMAL_CALL_BY_REFERENCE
void
bid64qd_sub (UINT64 * pres, UINT128 * px, UINT64 * py
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
UINT64 y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT64
bid64qd_sub (UINT128 x, UINT64 y
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
#endif
UINT64 res = 0xbaddbaddbaddbaddull;
UINT128 y1;
#if DECIMAL_CALL_BY_REFERENCE
bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
bid64qq_sub (&res, px, &y1
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#else
y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
res = bid64qq_sub (x, y1
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#endif
BID_RETURN (res);
}
#if DECIMAL_CALL_BY_REFERENCE
void
bid64qq_sub (UINT64 * pres, UINT128 * px, UINT128 * py
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
UINT128 x = *px, y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT64
bid64qq_sub (UINT128 x, UINT128 y
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
#endif
UINT128 one = { {0x0000000000000001ull, 0x3040000000000000ull}
};
UINT64 res = 0xbaddbaddbaddbaddull;
UINT64 y_sign;
BID_SWAP128 (one);
if ((y.w[HIGH_128W] & MASK_NAN) != MASK_NAN) { // y is not NAN
// change its sign
y_sign = y.w[HIGH_128W] & MASK_SIGN; // 0 for positive, MASK_SIGN for negative
if (y_sign)
y.w[HIGH_128W] = y.w[HIGH_128W] & 0x7fffffffffffffffull;
else
y.w[HIGH_128W] = y.w[HIGH_128W] | 0x8000000000000000ull;
}
#if DECIMAL_CALL_BY_REFERENCE
bid64qqq_fma (&res, &one, &x, &y
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#else
res = bid64qqq_fma (one, x, y
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#endif
BID_RETURN (res);
}
#if DECIMAL_CALL_BY_REFERENCE
void
bid128dd_sub (UINT128 * pres, UINT64 * px, UINT64 * py
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
UINT64 x = *px, y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT128
bid128dd_sub (UINT64 x, UINT64 y
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
#endif
UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
};
UINT128 x1, y1;
#if DECIMAL_CALL_BY_REFERENCE
bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
bid128_sub (&res, &x1, &y1
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#else
x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
res = bid128_sub (x1, y1
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#endif
BID_RETURN (res);
}
#if DECIMAL_CALL_BY_REFERENCE
void
bid128dq_sub (UINT128 * pres, UINT64 * px, UINT128 * py
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
UINT64 x = *px;
#if !DECIMAL_GLOBAL_ROUNDING
unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT128
bid128dq_sub (UINT64 x, UINT128 y
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
#endif
UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
};
UINT128 x1;
#if DECIMAL_CALL_BY_REFERENCE
bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
bid128_sub (&res, &x1, py
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#else
x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
res = bid128_sub (x1, y
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#endif
BID_RETURN (res);
}
#if DECIMAL_CALL_BY_REFERENCE
void
bid128qd_sub (UINT128 * pres, UINT128 * px, UINT64 * py
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
UINT64 y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT128
bid128qd_sub (UINT128 x, UINT64 y
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
#endif
UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
};
UINT128 y1;
#if DECIMAL_CALL_BY_REFERENCE
bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
bid128_sub (&res, px, &y1
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#else
y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
res = bid128_sub (x, y1
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#endif
BID_RETURN (res);
}
#if DECIMAL_CALL_BY_REFERENCE
void
bid128_add (UINT128 * pres, UINT128 * px, UINT128 * py
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
UINT128 x = *px, y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT128
bid128_add (UINT128 x, UINT128 y
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
#endif
UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
};
UINT64 x_sign, y_sign, tmp_sign;
UINT64 x_exp, y_exp, tmp_exp; // e1 = x_exp, e2 = y_exp
UINT64 C1_hi, C2_hi, tmp_signif_hi;
UINT64 C1_lo, C2_lo, tmp_signif_lo;
// Note: C1.w[1], C1.w[0] represent C1_hi, C1_lo (all UINT64)
// Note: C2.w[1], C2.w[0] represent C2_hi, C2_lo (all UINT64)
UINT64 tmp64, tmp64A, tmp64B;
BID_UI64DOUBLE tmp1, tmp2;
int x_nr_bits, y_nr_bits;
int q1, q2, delta, scale, x1, ind, shift, tmp_inexact = 0;
UINT64 halfulp64;
UINT128 halfulp128;
UINT128 C1, C2;
UINT128 ten2m1;
UINT128 highf2star; // top 128 bits in f2*; low 128 bits in R256[1], R256[0]
UINT256 P256, Q256, R256;
int is_inexact = 0, is_midpoint_lt_even = 0, is_midpoint_gt_even = 0;
int is_inexact_lt_midpoint = 0, is_inexact_gt_midpoint = 0;
int second_pass = 0;
BID_SWAP128 (x);
BID_SWAP128 (y);
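// after BID_SWAP128 the high 64 bits of each operand are in w[1] and the
// low 64 bits in w[0], regardless of the platform's 128-bit word order;
// all of the field extractions below rely on this layout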
x_sign = x.w[1] & MASK_SIGN; // 0 for positive, MASK_SIGN for negative
y_sign = y.w[1] & MASK_SIGN; // 0 for positive, MASK_SIGN for negative
// check for NaN or Infinity
if (((x.w[1] & MASK_SPECIAL) == MASK_SPECIAL)
|| ((y.w[1] & MASK_SPECIAL) == MASK_SPECIAL)) {
// x is special or y is special
if ((x.w[1] & MASK_NAN) == MASK_NAN) { // x is NAN
// check first for non-canonical NaN payload
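// (canonical decimal128 NaN payloads are at most 10^33 - 1, i.e.
// 0x0000314dc6448d93_38c15b09ffffffff; larger payloads are replaced by 0)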
if (((x.w[1] & 0x00003fffffffffffull) > 0x0000314dc6448d93ull) ||
(((x.w[1] & 0x00003fffffffffffull) == 0x0000314dc6448d93ull)
&& (x.w[0] > 0x38c15b09ffffffffull))) {
x.w[1] = x.w[1] & 0xffffc00000000000ull;
x.w[0] = 0x0ull;
}
if ((x.w[1] & MASK_SNAN) == MASK_SNAN) { // x is SNAN
// set invalid flag
*pfpsf |= INVALID_EXCEPTION;
// return quiet (x)
res.w[1] = x.w[1] & 0xfc003fffffffffffull;
// clear out also G[6]-G[16]
res.w[0] = x.w[0];
} else { // x is QNaN
// return x
res.w[1] = x.w[1] & 0xfc003fffffffffffull;
// clear out G[6]-G[16]
res.w[0] = x.w[0];
// if y = SNaN signal invalid exception
if ((y.w[1] & MASK_SNAN) == MASK_SNAN) {
// set invalid flag
*pfpsf |= INVALID_EXCEPTION;
}
}
BID_SWAP128 (res);
BID_RETURN (res);
} else if ((y.w[1] & MASK_NAN) == MASK_NAN) { // y is NAN
// check first for non-canonical NaN payload
if (((y.w[1] & 0x00003fffffffffffull) > 0x0000314dc6448d93ull) ||
(((y.w[1] & 0x00003fffffffffffull) == 0x0000314dc6448d93ull)
&& (y.w[0] > 0x38c15b09ffffffffull))) {
y.w[1] = y.w[1] & 0xffffc00000000000ull;
y.w[0] = 0x0ull;
}
if ((y.w[1] & MASK_SNAN) == MASK_SNAN) { // y is SNAN
// set invalid flag
*pfpsf |= INVALID_EXCEPTION;
// return quiet (y)
res.w[1] = y.w[1] & 0xfc003fffffffffffull;
// clear out also G[6]-G[16]
res.w[0] = y.w[0];
} else { // y is QNaN
// return y
res.w[1] = y.w[1] & 0xfc003fffffffffffull;
// clear out G[6]-G[16]
res.w[0] = y.w[0];
}
BID_SWAP128 (res);
BID_RETURN (res);
} else { // neither x nor y is NaN; at least one is infinity
if ((x.w[1] & MASK_ANY_INF) == MASK_INF) { // x is infinity
if ((y.w[1] & MASK_ANY_INF) == MASK_INF) { // y is infinity
// if same sign, return either of them
if ((x.w[1] & MASK_SIGN) == (y.w[1] & MASK_SIGN)) {
res.w[1] = x_sign | MASK_INF;
res.w[0] = 0x0ull;
} else { // x and y are infinities of opposite signs
// set invalid flag
*pfpsf |= INVALID_EXCEPTION;
// return QNaN Indefinite
res.w[1] = 0x7c00000000000000ull;
res.w[0] = 0x0000000000000000ull;
}
} else { // y is 0 or finite
// return x
res.w[1] = x_sign | MASK_INF;
res.w[0] = 0x0ull;
}
} else { // x is not NaN or infinity, so y must be infinity
res.w[1] = y_sign | MASK_INF;
res.w[0] = 0x0ull;
}
BID_SWAP128 (res);
BID_RETURN (res);
}
}
// unpack the arguments
// unpack x
C1_hi = x.w[1] & MASK_COEFF;
C1_lo = x.w[0];
// test for non-canonical values:
// - values whose encoding begins with x00, x01, or x10 and whose
// coefficient is larger than 10^34 - 1, or
// - values whose encoding begins with x1100, x1101, x1110 (if NaNs
// and infinities have already been eliminated this test reduces to
// checking for x11x)
// x is not infinity; check for non-canonical values - treated as zero
if ((x.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull) {
// G0_G1=11; non-canonical
x_exp = (x.w[1] << 2) & MASK_EXP; // biased and shifted left 49 bits
C1_hi = 0; // significand high
C1_lo = 0; // significand low
} else { // G0_G1 != 11
x_exp = x.w[1] & MASK_EXP; // biased and shifted left 49 bits
if (C1_hi > 0x0001ed09bead87c0ull ||
(C1_hi == 0x0001ed09bead87c0ull
&& C1_lo > 0x378d8e63ffffffffull)) {
// x is non-canonical if coefficient is larger than 10^34 -1
C1_hi = 0;
C1_lo = 0;
} else { // canonical
;
}
}
// unpack y
C2_hi = y.w[1] & MASK_COEFF;
C2_lo = y.w[0];
// y is not infinity; check for non-canonical values - treated as zero
if ((y.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull) {
// G0_G1=11; non-canonical
y_exp = (y.w[1] << 2) & MASK_EXP; // biased and shifted left 49 bits
C2_hi = 0; // significand high
C2_lo = 0; // significand low
} else { // G0_G1 != 11
y_exp = y.w[1] & MASK_EXP; // biased and shifted left 49 bits
if (C2_hi > 0x0001ed09bead87c0ull ||
(C2_hi == 0x0001ed09bead87c0ull
&& C2_lo > 0x378d8e63ffffffffull)) {
// y is non-canonical if coefficient is larger than 10^34 -1
C2_hi = 0;
C2_lo = 0;
} else { // canonical
;
}
}
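// zero operands follow; for exact results IEEE 754-2008 prescribes the
// preferred exponent min(Q(x), Q(y)), which is why the code below picks
// the smaller exponent and rescales the nonzero operand where possible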
if ((C1_hi == 0x0ull) && (C1_lo == 0x0ull)) {
// x is 0 and y is not special
// if y is 0 return 0 with the smaller exponent
if ((C2_hi == 0x0ull) && (C2_lo == 0x0ull)) {
if (x_exp < y_exp)
res.w[1] = x_exp;
else
res.w[1] = y_exp;
if (x_sign && y_sign)
res.w[1] = res.w[1] | x_sign; // both negative
else if (rnd_mode == ROUNDING_DOWN && x_sign != y_sign)
res.w[1] = res.w[1] | 0x8000000000000000ull; // -0
// else; // res = +0
res.w[0] = 0;
} else {
// for 0 + y return y, with the preferred exponent
if (y_exp <= x_exp) {
res.w[1] = y.w[1];
res.w[0] = y.w[0];
} else { // if y_exp > x_exp
// return (C2 * 10^scale) * 10^(y_exp - scale)
// where scale = min (P34-q2, y_exp-x_exp)
// determine q2 = nr. of decimal digits in y
// determine first the nr. of bits in y (y_nr_bits)
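// (the bit count is obtained by converting the significand to double --
// exact below 2^53, hence the 32-bit split above that -- and reading the
// unbiased binary exponent from the double's bit pattern; nr_digits[]
// then maps the bit count to a digit count, with a threshold comparison
// when two digit counts are possible for the same bit count)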
if (C2_hi == 0) { // y_bits is the nr. of bits in C2_lo
if (C2_lo >= 0x0020000000000000ull) { // y >= 2^53
// split the 64-bit value in two 32-bit halves to avoid
// rounding errors
if (C2_lo >= 0x0000000100000000ull) { // y >= 2^32
tmp2.d = (double) (C2_lo >> 32); // exact conversion
y_nr_bits =
32 +
((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
} else { // y < 2^32
tmp2.d = (double) (C2_lo); // exact conversion
y_nr_bits =
((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
}
} else { // if y < 2^53
tmp2.d = (double) C2_lo; // exact conversion
y_nr_bits =
((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
}
} else { // C2_hi != 0 => nr. bits = 64 + nr_bits (C2_hi)
tmp2.d = (double) C2_hi; // exact conversion
y_nr_bits =
64 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
}
q2 = nr_digits[y_nr_bits].digits;
if (q2 == 0) {
q2 = nr_digits[y_nr_bits].digits1;
if (C2_hi > nr_digits[y_nr_bits].threshold_hi ||
(C2_hi == nr_digits[y_nr_bits].threshold_hi &&
C2_lo >= nr_digits[y_nr_bits].threshold_lo))
q2++;
}
// return (C2 * 10^scale) * 10^(y_exp - scale)
// where scale = min (P34-q2, y_exp-x_exp)
scale = P34 - q2;
ind = (y_exp - x_exp) >> 49;
if (ind < scale)
scale = ind;
if (scale == 0) {
res.w[1] = y.w[1];
res.w[0] = y.w[0];
} else if (q2 <= 19) { // y fits in 64 bits
if (scale <= 19) { // 10^scale fits in 64 bits
// 64 x 64 C2_lo * ten2k64[scale]
__mul_64x64_to_128MACH (res, C2_lo, ten2k64[scale]);
} else { // 10^scale fits in 128 bits
// 64 x 128 C2_lo * ten2k128[scale - 20]
__mul_128x64_to_128 (res, C2_lo, ten2k128[scale - 20]);
}
} else { // y fits in 128 bits, but 10^scale must fit in 64 bits
// 64 x 128 ten2k64[scale] * C2
C2.w[1] = C2_hi;
C2.w[0] = C2_lo;
__mul_128x64_to_128 (res, ten2k64[scale], C2);
}
// subtract scale from the exponent
y_exp = y_exp - ((UINT64) scale << 49);
res.w[1] = res.w[1] | y_sign | y_exp;
}
}
BID_SWAP128 (res);
BID_RETURN (res);
} else if ((C2_hi == 0x0ull) && (C2_lo == 0x0ull)) {
// y is 0 and x is not special, and not zero
// for x + 0 return x, with the preferred exponent
if (x_exp <= y_exp) {
res.w[1] = x.w[1];
res.w[0] = x.w[0];
} else { // if x_exp > y_exp
// return (C1 * 10^scale) * 10^(x_exp - scale)
// where scale = min (P34-q1, x_exp-y_exp)
// determine q1 = nr. of decimal digits in x
// determine first the nr. of bits in x
if (C1_hi == 0) { // x_bits is the nr. of bits in C1_lo
if (C1_lo >= 0x0020000000000000ull) { // x >= 2^53
// split the 64-bit value in two 32-bit halves to avoid
// rounding errors
if (C1_lo >= 0x0000000100000000ull) { // x >= 2^32
tmp1.d = (double) (C1_lo >> 32); // exact conversion
x_nr_bits =
32 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) -
0x3ff);
} else { // x < 2^32
tmp1.d = (double) (C1_lo); // exact conversion
x_nr_bits =
((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
}
} else { // if x < 2^53
tmp1.d = (double) C1_lo; // exact conversion
x_nr_bits =
((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
}
} else { // C1_hi != 0 => nr. bits = 64 + nr_bits (C1_hi)
tmp1.d = (double) C1_hi; // exact conversion
x_nr_bits =
64 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
}
q1 = nr_digits[x_nr_bits].digits;
if (q1 == 0) {
q1 = nr_digits[x_nr_bits].digits1;
if (C1_hi > nr_digits[x_nr_bits].threshold_hi ||
(C1_hi == nr_digits[x_nr_bits].threshold_hi &&
C1_lo >= nr_digits[x_nr_bits].threshold_lo))
q1++;
}
// return (C1 * 10^scale) * 10^(x_exp - scale)
// where scale = min (P34-q1, x_exp-y_exp)
scale = P34 - q1;
ind = (x_exp - y_exp) >> 49;
if (ind < scale)
scale = ind;
if (scale == 0) {
res.w[1] = x.w[1];
res.w[0] = x.w[0];
} else if (q1 <= 19) { // x fits in 64 bits
if (scale <= 19) { // 10^scale fits in 64 bits
// 64 x 64 C1_lo * ten2k64[scale]
__mul_64x64_to_128MACH (res, C1_lo, ten2k64[scale]);
} else { // 10^scale fits in 128 bits
// 64 x 128 C1_lo * ten2k128[scale - 20]
__mul_128x64_to_128 (res, C1_lo, ten2k128[scale - 20]);
}
} else { // x fits in 128 bits, but 10^scale must fit in 64 bits
// 64 x 128 ten2k64[scale] * C1
C1.w[1] = C1_hi;
C1.w[0] = C1_lo;
__mul_128x64_to_128 (res, ten2k64[scale], C1);
}
// subtract scale from the exponent
x_exp = x_exp - ((UINT64) scale << 49);
res.w[1] = res.w[1] | x_sign | x_exp;
}
BID_SWAP128 (res);
BID_RETURN (res);
} else { // x and y are not canonical, not special, and are not zero
// note that the result may still be zero, and then it has to have the
// preferred exponent
if (x_exp < y_exp) { // if exp_x < exp_y then swap x and y
tmp_sign = x_sign;
tmp_exp = x_exp;
tmp_signif_hi = C1_hi;
tmp_signif_lo = C1_lo;
x_sign = y_sign;
x_exp = y_exp;
C1_hi = C2_hi;
C1_lo = C2_lo;
y_sign = tmp_sign;
y_exp = tmp_exp;
C2_hi = tmp_signif_hi;
C2_lo = tmp_signif_lo;
}
// q1 = nr. of decimal digits in x
// determine first the nr. of bits in x
if (C1_hi == 0) { // x_bits is the nr. of bits in C1_lo
if (C1_lo >= 0x0020000000000000ull) { // x >= 2^53
//split the 64-bit value in two 32-bit halves to avoid rounding errors
if (C1_lo >= 0x0000000100000000ull) { // x >= 2^32
tmp1.d = (double) (C1_lo >> 32); // exact conversion
x_nr_bits =
32 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
} else { // x < 2^32
tmp1.d = (double) (C1_lo); // exact conversion
x_nr_bits =
((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
}
} else { // if x < 2^53
tmp1.d = (double) C1_lo; // exact conversion
x_nr_bits =
((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
}
} else { // C1_hi != 0 => nr. bits = 64 + nr_bits (C1_hi)
tmp1.d = (double) C1_hi; // exact conversion
x_nr_bits =
64 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
}
q1 = nr_digits[x_nr_bits].digits;
if (q1 == 0) {
q1 = nr_digits[x_nr_bits].digits1;
if (C1_hi > nr_digits[x_nr_bits].threshold_hi ||
(C1_hi == nr_digits[x_nr_bits].threshold_hi &&
C1_lo >= nr_digits[x_nr_bits].threshold_lo))
q1++;
}
// q2 = nr. of decimal digits in y
// determine first the nr. of bits in y (y_nr_bits)
if (C2_hi == 0) { // y_bits is the nr. of bits in C2_lo
if (C2_lo >= 0x0020000000000000ull) { // y >= 2^53
//split the 64-bit value in two 32-bit halves to avoid rounding errors
if (C2_lo >= 0x0000000100000000ull) { // y >= 2^32
tmp2.d = (double) (C2_lo >> 32); // exact conversion
y_nr_bits =
32 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
} else { // y < 2^32
tmp2.d = (double) (C2_lo); // exact conversion
y_nr_bits =
((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
}
} else { // if y < 2^53
tmp2.d = (double) C2_lo; // exact conversion
y_nr_bits =
((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
}
} else { // C2_hi != 0 => nr. bits = 64 + nr_bits (C2_hi)
tmp2.d = (double) C2_hi; // exact conversion
y_nr_bits =
64 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
}
q2 = nr_digits[y_nr_bits].digits;
if (q2 == 0) {
q2 = nr_digits[y_nr_bits].digits1;
if (C2_hi > nr_digits[y_nr_bits].threshold_hi ||
(C2_hi == nr_digits[y_nr_bits].threshold_hi &&
C2_lo >= nr_digits[y_nr_bits].threshold_lo))
q2++;
}
delta = q1 + (int) (x_exp >> 49) - q2 - (int) (y_exp >> 49);
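// delta measures how far apart the most significant decimal digits of
// x and y are; e.g. x = 123 * 10^5 (q1 = 3, e1 = 5) and y = 45 * 10^0
// (q2 = 2, e2 = 0) give delta = 3 + 5 - 2 - 0 = 6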
if (delta >= P34) {
// round the result directly because 0 < C2 < ulp (C1 * 10^(x_exp-e2))
// n = C1 * 10^e1 or n = (C1 +/- 10^(q1-P34)) * 10^e1
// the result is inexact; the preferred exponent is the least possible
if (delta >= P34 + 1) {
// for RN the result is the operand with the larger magnitude,
// possibly scaled up by 10^(P34-q1)
// an overflow cannot occur in this case (rounding to nearest)
if (q1 < P34) { // scale C1 up by 10^(P34-q1)
// Note: because delta >= P34+1 it is certain that
// x_exp - ((UINT64)scale << 49) will stay above e_min
scale = P34 - q1;
if (q1 <= 19) { // C1 fits in 64 bits
// 1 <= q1 <= 19 => 15 <= scale <= 33
if (scale <= 19) { // 10^scale fits in 64 bits
__mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
} else { // if 20 <= scale <= 33
// C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
// (C1 * 10^(scale-19)) fits in 64 bits
C1_lo = C1_lo * ten2k64[scale - 19];
__mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
}
} else { //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
// => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
C1.w[1] = C1_hi;
C1.w[0] = C1_lo;
// C1 = ten2k64[P34 - q1] * C1
__mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
}
x_exp = x_exp - ((UINT64) scale << 49);
C1_hi = C1.w[1];
C1_lo = C1.w[0];
}
// some special cases arise: if delta = P34 + 1 and C1 = 10^(P34-1)
// (after scaling) and x_sign != y_sign and C2 > 5*10^(q2-1) =>
// subtract 1 ulp
// Note: do this only for rounding to nearest; for other rounding
// modes the correction will be applied next
if ((rnd_mode == ROUNDING_TO_NEAREST
     || rnd_mode == ROUNDING_TIES_AWAY) && delta == (P34 + 1)
    && C1_hi == 0x0000314dc6448d93ull
    && C1_lo == 0x38c15b0a00000000ull && x_sign != y_sign
    && ((q2 <= 19 && C2_lo > midpoint64[q2 - 1])
        || (q2 >= 20
            && (C2_hi > midpoint128[q2 - 20].w[1]
                || (C2_hi == midpoint128[q2 - 20].w[1]
                    && C2_lo > midpoint128[q2 - 20].w[0]))))) {
// C1 = 10^34 - 1 and decrement x_exp by 1 (no underflow possible)
C1_hi = 0x0001ed09bead87c0ull;
C1_lo = 0x378d8e63ffffffffull;
x_exp = x_exp - EXP_P1;
}
if (rnd_mode != ROUNDING_TO_NEAREST) {
if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
(rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
// add 1 ulp and then check for overflow
C1_lo = C1_lo + 1;
if (C1_lo == 0) { // rounding overflow in the low 64 bits
C1_hi = C1_hi + 1;
}
if (C1_hi == 0x0001ed09bead87c0ull
&& C1_lo == 0x378d8e6400000000ull) {
// C1 = 10^34 => rounding overflow
C1_hi = 0x0000314dc6448d93ull;
C1_lo = 0x38c15b0a00000000ull; // 10^33
x_exp = x_exp + EXP_P1;
if (x_exp == EXP_MAX_P1) { // overflow
C1_hi = 0x7800000000000000ull; // +inf
C1_lo = 0x0ull;
x_exp = 0; // x_sign is preserved
// set overflow flag (the inexact flag was set too)
*pfpsf |= OVERFLOW_EXCEPTION;
}
}
} else if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign) ||
(rnd_mode == ROUNDING_UP && x_sign && !y_sign) ||
(rnd_mode == ROUNDING_TO_ZERO
&& x_sign != y_sign)) {
// subtract 1 ulp from C1
// Note: because delta >= P34 + 1 the result cannot be zero
C1_lo = C1_lo - 1;
if (C1_lo == 0xffffffffffffffffull)
C1_hi = C1_hi - 1;
// if the coefficient is 10^33 - 1 then make it 10^34 - 1 and
// decrease the exponent by 1 (because delta >= P34 + 1 the
// exponent will not become less than e_min)
// 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
// 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
if (C1_hi == 0x0000314dc6448d93ull
&& C1_lo == 0x38c15b09ffffffffull) {
// make C1 = 10^34 - 1
C1_hi = 0x0001ed09bead87c0ull;
C1_lo = 0x378d8e63ffffffffull;
x_exp = x_exp - EXP_P1;
}
} else {
; // the result is already correct
}
}
// set the inexact flag
*pfpsf |= INEXACT_EXCEPTION;
// assemble the result
res.w[1] = x_sign | x_exp | C1_hi;
res.w[0] = C1_lo;
} else { // delta = P34
// in most cases, the smaller operand may be < or = or > 1/2 ulp of the
// larger operand
// however, the case C1 = 10^(q1-1) and x_sign != y_sign is special due
// to accuracy loss after subtraction, and will be treated separately
if (x_sign == y_sign || (q1 <= 20
&& (C1_hi != 0
|| C1_lo != ten2k64[q1 - 1]))
|| (q1 >= 21 && (C1_hi != ten2k128[q1 - 21].w[1]
|| C1_lo != ten2k128[q1 - 21].w[0]))) {
// if x_sign == y_sign or C1 != 10^(q1-1)
// compare C2 with 1/2 ulp = 5 * 10^(q2-1), the latter read from table
// Note: cases q1<=19 and q1>=20 can be coalesced at some latency cost
if (q2 <= 19) { // C2 and 5*10^(q2-1) both fit in 64 bits
halfulp64 = midpoint64[q2 - 1]; // 5 * 10^(q2-1)
if (C2_lo < halfulp64) { // n2 < 1/2 ulp (n1)
// for RN the result is the operand with the larger magnitude,
// possibly scaled up by 10^(P34-q1)
// an overflow cannot occur in this case (rounding to nearest)
if (q1 < P34) { // scale C1 up by 10^(P34-q1)
// Note: because delta = P34 it is certain that
// x_exp - ((UINT64)scale << 49) will stay above e_min
scale = P34 - q1;
if (q1 <= 19) { // C1 fits in 64 bits
// 1 <= q1 <= 19 => 15 <= scale <= 33
if (scale <= 19) { // 10^scale fits in 64 bits
__mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
} else { // if 20 <= scale <= 33
// C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
// (C1 * 10^(scale-19)) fits in 64 bits
C1_lo = C1_lo * ten2k64[scale - 19];
__mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
}
} else { //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
// => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
C1.w[1] = C1_hi;
C1.w[0] = C1_lo;
// C1 = ten2k64[P34 - q1] * C1
__mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
}
x_exp = x_exp - ((UINT64) scale << 49);
C1_hi = C1.w[1];
C1_lo = C1.w[0];
}
if (rnd_mode != ROUNDING_TO_NEAREST) {
if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
(rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
// add 1 ulp and then check for overflow
C1_lo = C1_lo + 1;
if (C1_lo == 0) { // rounding overflow in the low 64 bits
C1_hi = C1_hi + 1;
}
if (C1_hi == 0x0001ed09bead87c0ull
&& C1_lo == 0x378d8e6400000000ull) {
// C1 = 10^34 => rounding overflow
C1_hi = 0x0000314dc6448d93ull;
C1_lo = 0x38c15b0a00000000ull; // 10^33
x_exp = x_exp + EXP_P1;
if (x_exp == EXP_MAX_P1) { // overflow
C1_hi = 0x7800000000000000ull; // +inf
C1_lo = 0x0ull;
x_exp = 0; // x_sign is preserved
// set overflow flag (the inexact flag was set too)
*pfpsf |= OVERFLOW_EXCEPTION;
}
}
} else if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
           || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
           || (rnd_mode == ROUNDING_TO_ZERO && x_sign != y_sign)) {
// subtract 1 ulp from C1
// Note: because delta = P34 the result cannot be zero
C1_lo = C1_lo - 1;
if (C1_lo == 0xffffffffffffffffull)
C1_hi = C1_hi - 1;
// if the coefficient is 10^33-1 then make it 10^34-1 and
// decrease the exponent by 1 (because delta = P34 the
// exponent will not become less than e_min)
// 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
// 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
if (C1_hi == 0x0000314dc6448d93ull
&& C1_lo == 0x38c15b09ffffffffull) {
// make C1 = 10^34 - 1
C1_hi = 0x0001ed09bead87c0ull;
C1_lo = 0x378d8e63ffffffffull;
x_exp = x_exp - EXP_P1;
}
} else {
; // the result is already correct
}
}
// set the inexact flag
*pfpsf |= INEXACT_EXCEPTION;
// assemble the result
res.w[1] = x_sign | x_exp | C1_hi;
res.w[0] = C1_lo;
} else if ((C2_lo == halfulp64)
&& (q1 < P34 || ((C1_lo & 0x1) == 0))) {
// n2 = 1/2 ulp (n1) and C1 is even
// the result is the operand with the larger magnitude,
// possibly scaled up by 10^(P34-q1)
// an overflow cannot occur in this case (rounding to nearest)
if (q1 < P34) { // scale C1 up by 10^(P34-q1)
// Note: because delta = P34 it is certain that
// x_exp - ((UINT64)scale << 49) will stay above e_min
scale = P34 - q1;
if (q1 <= 19) { // C1 fits in 64 bits
// 1 <= q1 <= 19 => 15 <= scale <= 33
if (scale <= 19) { // 10^scale fits in 64 bits
__mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
} else { // if 20 <= scale <= 33
// C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
// (C1 * 10^(scale-19)) fits in 64 bits
C1_lo = C1_lo * ten2k64[scale - 19];
__mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
}
} else { //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
// => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
C1.w[1] = C1_hi;
C1.w[0] = C1_lo;
// C1 = ten2k64[P34 - q1] * C1
__mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
}
x_exp = x_exp - ((UINT64) scale << 49);
C1_hi = C1.w[1];
C1_lo = C1.w[0];
}
if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign == y_sign
     && (C1_lo & 0x01))
    || (rnd_mode == ROUNDING_TIES_AWAY && x_sign == y_sign)
    || (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)
    || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)) {
// add 1 ulp and then check for overflow
C1_lo = C1_lo + 1;
if (C1_lo == 0) { // rounding overflow in the low 64 bits
C1_hi = C1_hi + 1;
}
if (C1_hi == 0x0001ed09bead87c0ull
&& C1_lo == 0x378d8e6400000000ull) {
// C1 = 10^34 => rounding overflow
C1_hi = 0x0000314dc6448d93ull;
C1_lo = 0x38c15b0a00000000ull; // 10^33
x_exp = x_exp + EXP_P1;
if (x_exp == EXP_MAX_P1) { // overflow
C1_hi = 0x7800000000000000ull; // +inf
C1_lo = 0x0ull;
x_exp = 0; // x_sign is preserved
// set overflow flag (the inexact flag was set too)
*pfpsf |= OVERFLOW_EXCEPTION;
}
}
} else if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign
            && (C1_lo & 0x01))
           || (rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
           || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
           || (rnd_mode == ROUNDING_TO_ZERO && x_sign != y_sign)) {
// subtract 1 ulp from C1
// Note: because delta = P34 the result cannot be zero
C1_lo = C1_lo - 1;
if (C1_lo == 0xffffffffffffffffull)
C1_hi = C1_hi - 1;
// if the coefficient is 10^33 - 1 then make it 10^34 - 1
// and decrease the exponent by 1 (because delta = P34
// the exponent will not become less than e_min)
// 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
// 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
if (C1_hi == 0x0000314dc6448d93ull
&& C1_lo == 0x38c15b09ffffffffull) {
// make C1 = 10^34 - 1
C1_hi = 0x0001ed09bead87c0ull;
C1_lo = 0x378d8e63ffffffffull;
x_exp = x_exp - EXP_P1;
}
} else {
; // the result is already correct
}
// set the inexact flag
*pfpsf |= INEXACT_EXCEPTION;
// assemble the result
res.w[1] = x_sign | x_exp | C1_hi;
res.w[0] = C1_lo;
} else { // if C2_lo > halfulp64 ||
// (C2_lo == halfulp64 && q1 == P34 && ((C1_lo & 0x1) == 1)), i.e.
// 1/2 ulp(n1) < n2 < 1 ulp(n1) or n2 = 1/2 ulp(n1) and C1 odd
// res = x+1 ulp if n1*n2 > 0 and res = x-1 ulp if n1*n2 < 0
if (q1 < P34) { // then 1 ulp = 10^(e1+q1-P34) < 10^e1
// Note: if (q1 == P34) then 1 ulp = 10^(e1+q1-P34) = 10^e1
// because q1 < P34 we must first replace C1 by
// C1 * 10^(P34-q1), and must decrease the exponent by
// (P34-q1) (it will still be at least e_min)
scale = P34 - q1;
if (q1 <= 19) { // C1 fits in 64 bits
// 1 <= q1 <= 19 => 15 <= scale <= 33
if (scale <= 19) { // 10^scale fits in 64 bits
__mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
} else { // if 20 <= scale <= 33
// C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
// (C1 * 10^(scale-19)) fits in 64 bits
C1_lo = C1_lo * ten2k64[scale - 19];
__mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
}
} else { //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
// => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
C1.w[1] = C1_hi;
C1.w[0] = C1_lo;
// C1 = ten2k64[P34 - q1] * C1
__mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
}
x_exp = x_exp - ((UINT64) scale << 49);
C1_hi = C1.w[1];
C1_lo = C1.w[0];
// check for rounding overflow
if (C1_hi == 0x0001ed09bead87c0ull
&& C1_lo == 0x378d8e6400000000ull) {
// C1 = 10^34 => rounding overflow
C1_hi = 0x0000314dc6448d93ull;
C1_lo = 0x38c15b0a00000000ull; // 10^33
x_exp = x_exp + EXP_P1;
}
}
if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign)
|| (rnd_mode == ROUNDING_TIES_AWAY && x_sign != y_sign
&& C2_lo != halfulp64)
|| (rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
|| (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
|| (rnd_mode == ROUNDING_TO_ZERO
&& x_sign != y_sign)) {
// the result is x - 1
// for RN n1 * n2 < 0; underflow not possible
C1_lo = C1_lo - 1;
if (C1_lo == 0xffffffffffffffffull)
C1_hi--;
// check if we crossed into the lower decade
if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
C1_hi = 0x0001ed09bead87c0ull; // 10^34 - 1
C1_lo = 0x378d8e63ffffffffull;
x_exp = x_exp - EXP_P1; // no underflow, because n1 >> n2
}
} else if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign == y_sign)
           || (rnd_mode == ROUNDING_TIES_AWAY && x_sign == y_sign)
           || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)
           || (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
// the result is x + 1
// for RN x_sign = y_sign, i.e. n1*n2 > 0
C1_lo = C1_lo + 1;
if (C1_lo == 0) { // rounding overflow in the low 64 bits
C1_hi = C1_hi + 1;
}
if (C1_hi == 0x0001ed09bead87c0ull
&& C1_lo == 0x378d8e6400000000ull) {
// C1 = 10^34 => rounding overflow
C1_hi = 0x0000314dc6448d93ull;
C1_lo = 0x38c15b0a00000000ull; // 10^33
x_exp = x_exp + EXP_P1;
if (x_exp == EXP_MAX_P1) { // overflow
C1_hi = 0x7800000000000000ull; // +inf
C1_lo = 0x0ull;
x_exp = 0; // x_sign is preserved
// set the overflow flag
*pfpsf |= OVERFLOW_EXCEPTION;
}
}
} else {
; // the result is x
}
// set the inexact flag
*pfpsf |= INEXACT_EXCEPTION;
// assemble the result
res.w[1] = x_sign | x_exp | C1_hi;
res.w[0] = C1_lo;
}
} else { // if q2 >= 20 then 5*10^(q2-1), and in most cases C2,
// require more than 64 bits
halfulp128 = midpoint128[q2 - 20]; // 5 * 10^(q2-1)
if ((C2_hi < halfulp128.w[1])
|| (C2_hi == halfulp128.w[1]
&& C2_lo < halfulp128.w[0])) {
// n2 < 1/2 ulp (n1)
// the result is the operand with the larger magnitude,
// possibly scaled up by 10^(P34-q1)
// an overflow cannot occur in this case (rounding to nearest)
if (q1 < P34) { // scale C1 up by 10^(P34-q1)
// Note: because delta = P34 it is certain that
// x_exp - ((UINT64)scale << 49) will stay above e_min
scale = P34 - q1;
if (q1 <= 19) { // C1 fits in 64 bits
// 1 <= q1 <= 19 => 15 <= scale <= 33
if (scale <= 19) { // 10^scale fits in 64 bits
__mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
} else { // if 20 <= scale <= 33
// C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
// (C1 * 10^(scale-19)) fits in 64 bits
C1_lo = C1_lo * ten2k64[scale - 19];
__mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
}
} else { //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
// => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
C1.w[1] = C1_hi;
C1.w[0] = C1_lo;
// C1 = ten2k64[P34 - q1] * C1
__mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
}
C1_hi = C1.w[1];
C1_lo = C1.w[0];
x_exp = x_exp - ((UINT64) scale << 49);
}
if (rnd_mode != ROUNDING_TO_NEAREST) {
if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
(rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
// add 1 ulp and then check for overflow
C1_lo = C1_lo + 1;
if (C1_lo == 0) { // rounding overflow in the low 64 bits
C1_hi = C1_hi + 1;
}
if (C1_hi == 0x0001ed09bead87c0ull
&& C1_lo == 0x378d8e6400000000ull) {
// C1 = 10^34 => rounding overflow
C1_hi = 0x0000314dc6448d93ull;
C1_lo = 0x38c15b0a00000000ull; // 10^33
x_exp = x_exp + EXP_P1;
if (x_exp == EXP_MAX_P1) { // overflow
C1_hi = 0x7800000000000000ull; // +inf
C1_lo = 0x0ull;
x_exp = 0; // x_sign is preserved
// set overflow flag (the inexact flag was set too)
*pfpsf |= OVERFLOW_EXCEPTION;
}
}
} else if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
           || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
           || (rnd_mode == ROUNDING_TO_ZERO && x_sign != y_sign)) {
// subtract 1 ulp from C1
// Note: because delta = P34 the result cannot be zero
C1_lo = C1_lo - 1;
if (C1_lo == 0xffffffffffffffffull)
C1_hi = C1_hi - 1;
// if the coefficient is 10^33-1 then make it 10^34-1 and
// decrease the exponent by 1 (because delta = P34 the
// exponent will not become less than e_min)
// 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
// 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
if (C1_hi == 0x0000314dc6448d93ull
&& C1_lo == 0x38c15b09ffffffffull) {
// make C1 = 10^34 - 1
C1_hi = 0x0001ed09bead87c0ull;
C1_lo = 0x378d8e63ffffffffull;
x_exp = x_exp - EXP_P1;
}
} else {
; // the result is already correct
}
}
// set the inexact flag
*pfpsf |= INEXACT_EXCEPTION;
// assemble the result
res.w[1] = x_sign | x_exp | C1_hi;
res.w[0] = C1_lo;
} else if ((C2_hi == halfulp128.w[1]
&& C2_lo == halfulp128.w[0])
&& (q1 < P34 || ((C1_lo & 0x1) == 0))) {
// midpoint & lsb in C1 is 0
// n2 = 1/2 ulp (n1) and C1 is even
// the result is the operand with the larger magnitude,
// possibly scaled up by 10^(P34-q1)
// an overflow cannot occur in this case (rounding to nearest)
if (q1 < P34) { // scale C1 up by 10^(P34-q1)
// Note: because delta = P34 it is certain that
// x_exp - ((UINT64)scale << 49) will stay above e_min
scale = P34 - q1;
if (q1 <= 19) { // C1 fits in 64 bits
// 1 <= q1 <= 19 => 15 <= scale <= 33
if (scale <= 19) { // 10^scale fits in 64 bits
__mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
} else { // if 20 <= scale <= 33
// C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
// (C1 * 10^(scale-19)) fits in 64 bits
C1_lo = C1_lo * ten2k64[scale - 19];
__mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
}
} else { //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
// => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
C1.w[1] = C1_hi;
C1.w[0] = C1_lo;
// C1 = ten2k64[P34 - q1] * C1
__mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
}
x_exp = x_exp - ((UINT64) scale << 49);
C1_hi = C1.w[1];
C1_lo = C1.w[0];
}
if (rnd_mode != ROUNDING_TO_NEAREST) {
if ((rnd_mode == ROUNDING_TIES_AWAY && x_sign == y_sign)
|| (rnd_mode == ROUNDING_UP && !y_sign)) {
// add 1 ulp and then check for overflow
C1_lo = C1_lo + 1;
if (C1_lo == 0) { // rounding overflow in the low 64 bits
C1_hi = C1_hi + 1;
}
if (C1_hi == 0x0001ed09bead87c0ull
&& C1_lo == 0x378d8e6400000000ull) {
// C1 = 10^34 => rounding overflow
C1_hi = 0x0000314dc6448d93ull;
C1_lo = 0x38c15b0a00000000ull; // 10^33
x_exp = x_exp + EXP_P1;
if (x_exp == EXP_MAX_P1) { // overflow
C1_hi = 0x7800000000000000ull; // +inf
C1_lo = 0x0ull;
x_exp = 0; // x_sign is preserved
// set overflow flag (the inexact flag was set too)
*pfpsf |= OVERFLOW_EXCEPTION;
}
}
} else if ((rnd_mode == ROUNDING_DOWN && y_sign)
|| (rnd_mode == ROUNDING_TO_ZERO
&& x_sign != y_sign)) {
// subtract 1 ulp from C1
// Note: because delta = P34 the result cannot be zero
C1_lo = C1_lo - 1;
if (C1_lo == 0xffffffffffffffffull)
C1_hi = C1_hi - 1;
// if the coefficient is 10^33 - 1 then make it 10^34 - 1
// and decrease the exponent by 1 (because delta = P34
// the exponent will not become less than e_min)
// 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
// 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
if (C1_hi == 0x0000314dc6448d93ull
&& C1_lo == 0x38c15b09ffffffffull) {
// make C1 = 10^34 - 1
C1_hi = 0x0001ed09bead87c0ull;
C1_lo = 0x378d8e63ffffffffull;
x_exp = x_exp - EXP_P1;
}
} else {
; // the result is already correct
}
}
// set the inexact flag
*pfpsf |= INEXACT_EXCEPTION;
// assemble the result
res.w[1] = x_sign | x_exp | C1_hi;
res.w[0] = C1_lo;
} else { // if C2 > halfulp128 ||
// (C2 == halfulp128 && q1 == P34 && ((C1 & 0x1) == 1)), i.e.
// 1/2 ulp(n1) < n2 < 1 ulp(n1) or n2 = 1/2 ulp(n1) and C1 odd
// res = x+1 ulp if n1*n2 > 0 and res = x-1 ulp if n1*n2 < 0
if (q1 < P34) { // then 1 ulp = 10^(e1+q1-P34) < 10^e1
// Note: if (q1 == P34) then 1 ulp = 10^(e1+q1-P34) = 10^e1
// because q1 < P34 we must first replace C1 by C1*10^(P34-q1),
// and must decrease the exponent by (P34-q1) (it will still be
// at least e_min)
scale = P34 - q1;
if (q1 <= 19) { // C1 fits in 64 bits
// 1 <= q1 <= 19 => 15 <= scale <= 33
if (scale <= 19) { // 10^scale fits in 64 bits
__mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
} else { // if 20 <= scale <= 33
// C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
// (C1 * 10^(scale-19)) fits in 64 bits
C1_lo = C1_lo * ten2k64[scale - 19];
__mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
}
} else { //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
// => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
C1.w[1] = C1_hi;
C1.w[0] = C1_lo;
// C1 = ten2k64[P34 - q1] * C1
__mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
}
C1_hi = C1.w[1];
C1_lo = C1.w[0];
x_exp = x_exp - ((UINT64) scale << 49);
}
if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign)
|| (rnd_mode == ROUNDING_TIES_AWAY && x_sign != y_sign
&& (C2_hi != halfulp128.w[1]
|| C2_lo != halfulp128.w[0]))
|| (rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
|| (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
|| (rnd_mode == ROUNDING_TO_ZERO
&& x_sign != y_sign)) {
// the result is x - 1
// for RN n1 * n2 < 0; underflow not possible
C1_lo = C1_lo - 1;
if (C1_lo == 0xffffffffffffffffull)
C1_hi--;
// check if we crossed into the lower decade
if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
C1_hi = 0x0001ed09bead87c0ull; // 10^34 - 1
C1_lo = 0x378d8e63ffffffffull;
x_exp = x_exp - EXP_P1; // no underflow, because n1 >> n2
}
} else if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign == y_sign)
           || (rnd_mode == ROUNDING_TIES_AWAY && x_sign == y_sign)
           || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)
           || (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
// the result is x + 1
// for RN x_sign = y_sign, i.e. n1*n2 > 0
C1_lo = C1_lo + 1;
if (C1_lo == 0) { // rounding overflow in the low 64 bits
C1_hi = C1_hi + 1;
}
if (C1_hi == 0x0001ed09bead87c0ull
&& C1_lo == 0x378d8e6400000000ull) {
// C1 = 10^34 => rounding overflow
C1_hi = 0x0000314dc6448d93ull;
C1_lo = 0x38c15b0a00000000ull; // 10^33
x_exp = x_exp + EXP_P1;
if (x_exp == EXP_MAX_P1) { // overflow
C1_hi = 0x7800000000000000ull; // +inf
C1_lo = 0x0ull;
x_exp = 0; // x_sign is preserved
// set the overflow flag
*pfpsf |= OVERFLOW_EXCEPTION;
}
}
} else {
; // the result is x
}
// set the inexact flag
*pfpsf |= INEXACT_EXCEPTION;
// assemble the result
res.w[1] = x_sign | x_exp | C1_hi;
res.w[0] = C1_lo;
}
} // end q1 >= 20
// end case where C1 != 10^(q1-1)
} else { // C1 = 10^(q1-1) and x_sign != y_sign
// instead of C' = (C1 * 10^(e1-e2) + C2)rnd,P34
// calculate C' = C1 * 10^(e1-e2-x1) + (C2 * 10^(-x1))rnd,P34
// where x1 = q2 - 1, 0 <= x1 <= P34 - 1
// Because C1 = 10^(q1-1) and x_sign != y_sign, C' will have P34
// digits and n = C' * 10^(e2+x1)
// If the result has P34+1 digits, redo the steps above with x1+1
// If the result has P34-1 digits or less, redo the steps above with
// x1-1 but only if initially x1 >= 1
// NOTE: these two steps can be improved, e.g we could guess if
// P34+1 or P34-1 digits will be obtained by adding/subtracting
// just the top 64 bits of the two operands
// The result cannot be zero, and it cannot overflow
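// Concretely: C1 * 10^(e1-e2-x1) = 10^(q1-1) * 10^(P34+1-q1) = 10^P34,
// and subtracting the one-digit rounding of C2 * 10^(-x1) (a value in
// [1, 10]) leaves a result just below 10^P34, i.e. with P34 digits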
x1 = q2 - 1; // 0 <= x1 <= P34-1
// Calculate C1 * 10^(e1-e2-x1) where 1 <= e1-e2-x1 <= P34
// scale = (int)(e1 >> 49) - (int)(e2 >> 49) - x1; 0 <= scale <= P34-1
scale = P34 - q1 + 1; // scale=e1-e2-x1 = P34+1-q1; 1<=scale<=P34
// either C1 or 10^(e1-e2-x1) may not fit in 64 bits,
// but their product fits with certainty in 128 bits
if (scale >= 20) { //10^(e1-e2-x1) doesn't fit in 64 bits, but C1 does
__mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
} else { // if (scale >= 1)
// if 1 <= scale <= 19 then 10^(e1-e2-x1) fits in 64 bits
if (q1 <= 19) { // C1 fits in 64 bits
__mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
} else { // q1 >= 20
C1.w[1] = C1_hi;
C1.w[0] = C1_lo;
__mul_128x64_to_128 (C1, ten2k64[scale], C1);
}
}
tmp64 = C1.w[0]; // C1.w[1], C1.w[0] contains C1 * 10^(e1-e2-x1)
// now round C2 to q2-x1 = 1 decimal digit
// C2' = C2 + 1/2 * 10^x1 = C2 + 5 * 10^(x1-1)
ind = x1 - 1; // -1 <= ind <= P34 - 2
if (ind >= 0) { // if (x1 >= 1)
C2.w[0] = C2_lo;
C2.w[1] = C2_hi;
if (ind <= 18) {
C2.w[0] = C2.w[0] + midpoint64[ind];
if (C2.w[0] < C2_lo)
C2.w[1]++;
} else { // 19 <= ind <= 32
C2.w[0] = C2.w[0] + midpoint128[ind - 19].w[0];
C2.w[1] = C2.w[1] + midpoint128[ind - 19].w[1];
if (C2.w[0] < C2_lo)
C2.w[1]++;
}
// the approximation of 10^(-x1) was rounded up to 118 bits
__mul_128x128_to_256 (R256, C2, ten2mk128[ind]); // R256 = C2*, f2*
// calculate C2* and f2*
// C2* is actually floor(C2*) in this case
// C2* and f2* need shifting and masking, as shown by
// shiftright128[] and maskhigh128[]
// the top Ex bits of 10^(-x1) are T* = ten2mk128trunc[ind], e.g.
// if x1=1, T*=ten2mk128trunc[0]=0x19999999999999999999999999999999
// if (0 < f2* < 10^(-x1)) then
// if floor(C1+C2*) is even then C2* = floor(C2*) - logical right
// shift; C2* has p decimal digits, correct by Prop. 1)
// else if floor(C1+C2*) is odd C2* = floor(C2*)-1 (logical right
// shift; C2* has p decimal digits, correct by Pr. 1)
// else
// C2* = floor(C2*) (logical right shift; C has p decimal digits,
// correct by Property 1)
// n = C2* * 10^(e2+x1)
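// (this is the usual rounding-by-reciprocal-multiplication scheme: C2
// plus half an ulp is multiplied by an over-approximation of 10^(-x1),
// and the 256-bit product splits into an integer part C2* and a
// fraction f2*, whose comparison with 1/2 and with the approximation
// error T* decides exactness and midpoint status below)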
if (ind <= 2) {
highf2star.w[1] = 0x0;
highf2star.w[0] = 0x0; // low f2* ok
} else if (ind <= 21) {
highf2star.w[1] = 0x0;
highf2star.w[0] = R256.w[2] & maskhigh128[ind]; // low f2* ok
} else {
highf2star.w[1] = R256.w[3] & maskhigh128[ind];
highf2star.w[0] = R256.w[2]; // low f2* is ok
}
// shift right C2* by Ex-128 = shiftright128[ind]
if (ind >= 3) {
shift = shiftright128[ind];
if (shift < 64) { // 3 <= shift <= 63
R256.w[2] =
(R256.w[2] >> shift) | (R256.w[3] << (64 - shift));
R256.w[3] = (R256.w[3] >> shift);
} else { // 66 <= shift <= 102
R256.w[2] = (R256.w[3] >> (shift - 64));
R256.w[3] = 0x0ULL;
}
}
// redundant
is_inexact_lt_midpoint = 0;
is_inexact_gt_midpoint = 0;
is_midpoint_lt_even = 0;
is_midpoint_gt_even = 0;
// determine inexactness of the rounding of C2*
// (cannot be followed by a second rounding)
// if (0 < f2* - 1/2 < 10^(-x1)) then
// the result is exact
// else (if f2* - 1/2 > T* then)
//   the result is inexact
if (ind <= 2) {
if (R256.w[1] > 0x8000000000000000ull ||
(R256.w[1] == 0x8000000000000000ull
&& R256.w[0] > 0x0ull)) {
// f2* > 1/2 and the result may be exact
tmp64A = R256.w[1] - 0x8000000000000000ull; // f* - 1/2
if ((tmp64A > ten2mk128trunc[ind].w[1]
|| (tmp64A == ten2mk128trunc[ind].w[1]
&& R256.w[0] >= ten2mk128trunc[ind].w[0]))) {
// set the inexact flag
*pfpsf |= INEXACT_EXCEPTION;
// this rounding is applied to C2 only!
// x_sign != y_sign
is_inexact_gt_midpoint = 1;
} // else the result is exact
// rounding down, unless a midpoint in [ODD, EVEN]
} else { // the result is inexact; f2* <= 1/2
// set the inexact flag
*pfpsf |= INEXACT_EXCEPTION;
// this rounding is applied to C2 only!
// x_sign != y_sign
is_inexact_lt_midpoint = 1;
}
} else if (ind <= 21) { // if 3 <= ind <= 21
if (highf2star.w[1] > 0x0
    || (highf2star.w[1] == 0x0 && highf2star.w[0] > onehalf128[ind])
    || (highf2star.w[1] == 0x0 && highf2star.w[0] == onehalf128[ind]
        && (R256.w[1] || R256.w[0]))) {
// f2* > 1/2 and the result may be exact
// Calculate f2* - 1/2
tmp64A = highf2star.w[0] - onehalf128[ind];
tmp64B = highf2star.w[1];
if (tmp64A > highf2star.w[0])
tmp64B--;
if (tmp64B || tmp64A
|| R256.w[1] > ten2mk128trunc[ind].w[1]
|| (R256.w[1] == ten2mk128trunc[ind].w[1]
&& R256.w[0] > ten2mk128trunc[ind].w[0])) {
// set the inexact flag
*pfpsf |= INEXACT_EXCEPTION;
// this rounding is applied to C2 only!
// x_sign != y_sign
is_inexact_gt_midpoint = 1;
} // else the result is exact
} else { // the result is inexact; f2* <= 1/2
// set the inexact flag
*pfpsf |= INEXACT_EXCEPTION;
// this rounding is applied to C2 only!
// x_sign != y_sign
is_inexact_lt_midpoint = 1;
}
} else { // if 22 <= ind <= 33
if (highf2star.w[1] > onehalf128[ind]
|| (highf2star.w[1] == onehalf128[ind]
&& (highf2star.w[0] || R256.w[1]
|| R256.w[0]))) {
// f2* > 1/2 and the result may be exact
// Calculate f2* - 1/2
// tmp64A = highf2star.w[0];
tmp64B = highf2star.w[1] - onehalf128[ind];
if (tmp64B || highf2star.w[0]
|| R256.w[1] > ten2mk128trunc[ind].w[1]
|| (R256.w[1] == ten2mk128trunc[ind].w[1]
&& R256.w[0] > ten2mk128trunc[ind].w[0])) {
// set the inexact flag
*pfpsf |= INEXACT_EXCEPTION;
// this rounding is applied to C2 only!
// x_sign != y_sign
is_inexact_gt_midpoint = 1;
} // else the result is exact
} else { // the result is inexact; f2* <= 1/2
// set the inexact flag
*pfpsf |= INEXACT_EXCEPTION;
// this rounding is applied to C2 only!
// x_sign != y_sign
is_inexact_lt_midpoint = 1;
}
}
// check for midpoints after determining inexactness
if ((R256.w[1] || R256.w[0]) && (highf2star.w[1] == 0)
&& (highf2star.w[0] == 0)
&& (R256.w[1] < ten2mk128trunc[ind].w[1]
|| (R256.w[1] == ten2mk128trunc[ind].w[1]
&& R256.w[0] <= ten2mk128trunc[ind].w[0]))) {
// the result is a midpoint
if ((tmp64 + R256.w[2]) & 0x01) { // MP in [EVEN, ODD]
// if floor(C2*) is odd C = floor(C2*) - 1; the result may be 0
R256.w[2]--;
if (R256.w[2] == 0xffffffffffffffffull)
R256.w[3]--;
// this rounding is applied to C2 only!
// x_sign != y_sign
is_midpoint_lt_even = 1;
is_inexact_lt_midpoint = 0;
is_inexact_gt_midpoint = 0;
} else {
// else MP in [ODD, EVEN]
// this rounding is applied to C2 only!
// x_sign != y_sign
is_midpoint_gt_even = 1;
is_inexact_lt_midpoint = 0;
is_inexact_gt_midpoint = 0;
}
}
} else { // if (ind == -1) only when x1 = 0
R256.w[2] = C2_lo;
R256.w[3] = C2_hi;
is_midpoint_lt_even = 0;
is_midpoint_gt_even = 0;
is_inexact_lt_midpoint = 0;
is_inexact_gt_midpoint = 0;
}
// and now subtract C1 * 10^(e1-e2-x1) - (C2 * 10^(-x1))rnd,P34
// because x_sign != y_sign this last operation is exact
C1.w[0] = C1.w[0] - R256.w[2];
C1.w[1] = C1.w[1] - R256.w[3];
if (C1.w[0] > tmp64)
C1.w[1]--; // borrow
if (C1.w[1] >= 0x8000000000000000ull) { // negative coefficient!
C1.w[0] = ~C1.w[0];
C1.w[0]++;
C1.w[1] = ~C1.w[1];
if (C1.w[0] == 0x0)
C1.w[1]++;
tmp_sign = y_sign; // the result will have the sign of y
} else {
tmp_sign = x_sign;
}
// the difference has exactly P34 digits
x_sign = tmp_sign;
if (x1 >= 1)
y_exp = y_exp + ((UINT64) x1 << 49);
C1_hi = C1.w[1];
C1_lo = C1.w[0];
// general correction from RN to RA, RM, RP, RZ; result uses y_exp
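// (the subtraction above produced the round-to-nearest result together
// with the is_midpoint_*/is_inexact_* indicators; for directed rounding
// modes those indicators tell whether the correctly rounded result lies
// one ulp above or below the RN value)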
if (rnd_mode != ROUNDING_TO_NEAREST) {
if ((!x_sign
     && ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint)
         || ((rnd_mode == ROUNDING_TIES_AWAY || rnd_mode == ROUNDING_UP)
             && is_midpoint_gt_even)))
    || (x_sign
        && ((rnd_mode == ROUNDING_DOWN && is_inexact_lt_midpoint)
            || ((rnd_mode == ROUNDING_TIES_AWAY || rnd_mode == ROUNDING_DOWN)
                && is_midpoint_gt_even)))) {
// C1 = C1 + 1
C1_lo = C1_lo + 1;
if (C1_lo == 0) { // rounding overflow in the low 64 bits
C1_hi = C1_hi + 1;
}
if (C1_hi == 0x0001ed09bead87c0ull
&& C1_lo == 0x378d8e6400000000ull) {
// C1 = 10^34 => rounding overflow
C1_hi = 0x0000314dc6448d93ull;
C1_lo = 0x38c15b0a00000000ull; // 10^33
y_exp = y_exp + EXP_P1;
}
} else if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
           && ((x_sign && (rnd_mode == ROUNDING_UP
                           || rnd_mode == ROUNDING_TO_ZERO))
               || (!x_sign && (rnd_mode == ROUNDING_DOWN
                               || rnd_mode == ROUNDING_TO_ZERO)))) {
// C1 = C1 - 1
C1_lo = C1_lo - 1;
if (C1_lo == 0xffffffffffffffffull)
C1_hi--;
// check if we crossed into the lower decade
if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
C1_hi = 0x0001ed09bead87c0ull; // 10^34 - 1
C1_lo = 0x378d8e63ffffffffull;
y_exp = y_exp - EXP_P1;
// no underflow, because delta + q2 >= P34 + 1
}
} else {
; // exact, the result is already correct
}
}
// assemble the result
res.w[1] = x_sign | y_exp | C1_hi;
res.w[0] = C1_lo;
}
} // end delta = P34
} else { // if (|delta| <= P34 - 1)
if (delta >= 0) { // if (0 <= delta <= P34 - 1)
if (delta <= P34 - 1 - q2) {
// calculate C' directly; the result is exact
// in this case 1<=q1<=P34-1, 1<=q2<=P34-1 and 0 <= e1-e2 <= P34-2
// The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
// exponent is e2; either C1 or 10^(e1-e2) may not fit in 64 bits,
// but their product fits with certainty in 128 bits (actually in 113)
scale = delta - q1 + q2; // scale = (int)(e1 >> 49) - (int)(e2 >> 49)
if (scale >= 20) { // 10^(e1-e2) does not fit in 64 bits, but C1 does
__mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
C1_hi = C1.w[1];
C1_lo = C1.w[0];
} else if (scale >= 1) {
// if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
if (q1 <= 19) { // C1 fits in 64 bits
__mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
} else { // q1 >= 20
C1.w[1] = C1_hi;
C1.w[0] = C1_lo;
__mul_128x64_to_128 (C1, ten2k64[scale], C1);
}
C1_hi = C1.w[1];
C1_lo = C1.w[0];
} else { // if (scale == 0) C1 is unchanged
C1.w[0] = C1_lo; // C1.w[1] = C1_hi;
}
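// (The scaling above relies on the precomputed tables: ten2k64[k]
// holds 10^k for 0 <= k <= 19 and ten2k128[k] holds 10^(k+20); as
// noted above, the product C1 * 10^(e1-e2) is known to fit in
// 128 bits in this path.)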
// now add C2
if (x_sign == y_sign) {
// the result cannot overflow
C1_lo = C1_lo + C2_lo;
C1_hi = C1_hi + C2_hi;
if (C1_lo < C1.w[0])
C1_hi++;
} else { // if x_sign != y_sign
C1_lo = C1_lo - C2_lo;
C1_hi = C1_hi - C2_hi;
if (C1_lo > C1.w[0])
C1_hi--;
// the result can be zero, but it cannot overflow
if (C1_lo == 0 && C1_hi == 0) {
// assemble the result
if (x_exp < y_exp)
res.w[1] = x_exp;
else
res.w[1] = y_exp;
res.w[0] = 0;
if (rnd_mode == ROUNDING_DOWN) {
res.w[1] |= 0x8000000000000000ull;
}
BID_SWAP128 (res);
BID_RETURN (res);
}
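// (An exact zero result takes the smaller of the two exponents, and
// its sign is + in every rounding mode except rounding down, where
// IEEE 754-2008 prescribes -0; that is why the sign bit is set only
// for ROUNDING_DOWN above.)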
if (C1_hi >= 0x8000000000000000ull) { // negative coefficient!
C1_lo = ~C1_lo;
C1_lo++;
C1_hi = ~C1_hi;
if (C1_lo == 0x0)
C1_hi++;
x_sign = y_sign; // the result will have the sign of y
}
}
// assemble the result
res.w[1] = x_sign | y_exp | C1_hi;
res.w[0] = C1_lo;
} else if (delta == P34 - q2) {
// calculate C' directly; the result may be inexact if it requires
// P34+1 decimal digits; in this case the 'cutoff' point for addition
// is at the position of the lsb of C2, so 0 <= e1-e2 <= P34-1
// The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
// exponent is e2; either C1 or 10^(e1-e2) may not fit in 64 bits,
// but their product fits with certainty in 128 bits (actually in 113)
scale = delta - q1 + q2; // scale = (int)(e1 >> 49) - (int)(e2 >> 49)
if (scale >= 20) { // 10^(e1-e2) does not fit in 64 bits, but C1 does
__mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
} else if (scale >= 1) {
// if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
if (q1 <= 19) { // C1 fits in 64 bits
__mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
} else { // q1 >= 20
C1.w[1] = C1_hi;
C1.w[0] = C1_lo;
__mul_128x64_to_128 (C1, ten2k64[scale], C1);
}
} else { // if (scale == 0) C1 is unchanged
C1.w[1] = C1_hi;
C1.w[0] = C1_lo; // only the low part is necessary
}
C1_hi = C1.w[1];
C1_lo = C1.w[0];
// now add C2
if (x_sign == y_sign) {
// the result can overflow!
C1_lo = C1_lo + C2_lo;
C1_hi = C1_hi + C2_hi;
if (C1_lo < C1.w[0])
C1_hi++;
// test for overflow, possible only when C1 >= 10^34
if (C1_hi > 0x0001ed09bead87c0ull
    || (C1_hi == 0x0001ed09bead87c0ull
        && C1_lo >= 0x378d8e6400000000ull)) { // C1 >= 10^34
// in this case q = P34 + 1 and x = q - P34 = 1, so multiply
// C'' = C'+ 5 = C1 + 5 by k1 ~ 10^(-1) calculated for P34 + 1
// decimal digits
// Calculate C'' = C' + 1/2 * 10^x
if (C1_lo >= 0xfffffffffffffffbull) { // low half add has carry
C1_lo = C1_lo + 5;
C1_hi = C1_hi + 1;
} else {
C1_lo = C1_lo + 5;
}
// the approximation of 10^(-1) was rounded up to 118 bits
// 10^(-1) =~ 33333333333333333333333333333400 * 2^-129
// 10^(-1) =~ 19999999999999999999999999999a00 * 2^-128
C1.w[1] = C1_hi;
C1.w[0] = C1_lo; // C''
ten2m1.w[1] = 0x1999999999999999ull;
ten2m1.w[0] = 0x9999999999999a00ull;
__mul_128x128_to_256 (P256, C1, ten2m1); // P256 = C*, f*
// C* is actually floor(C*) in this case
// the top Ex = 128 bits of 10^(-1) are
// T* = 0x00199999999999999999999999999999
// if (0 < f* < 10^(-x)) then
// if floor(C*) is even then C = floor(C*) - logical right
// shift; C has p decimal digits, correct by Prop. 1)
// else if floor(C*) is odd C = floor(C*) - 1 (logical right
// shift; C has p decimal digits, correct by Pr. 1)
// else
// C = floor(C*) (logical right shift; C has p decimal digits,
// correct by Property 1)
// n = C * 10^(e2+x)
if ((P256.w[1] || P256.w[0])
&& (P256.w[1] < 0x1999999999999999ull
|| (P256.w[1] == 0x1999999999999999ull
&& P256.w[0] <= 0x9999999999999999ull))) {
// the result is a midpoint
if (P256.w[2] & 0x01) {
is_midpoint_gt_even = 1;
// if floor(C*) is odd C = floor(C*) - 1; the result is not 0
P256.w[2]--;
if (P256.w[2] == 0xffffffffffffffffull)
P256.w[3]--;
} else {
is_midpoint_lt_even = 1;
}
}
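// (Ties are broken to even: when floor(C*) is odd, the decrement
// above selects the even neighbor below the midpoint and
// is_midpoint_gt_even is set; when floor(C*) is already even it is
// kept, the midpoint lies below it, and is_midpoint_lt_even is set.)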
// n = Cstar * 10^(e2+1)
y_exp = y_exp + EXP_P1;
// C* != 10^P34 because C* has P34 digits
// check for overflow
if (y_exp == EXP_MAX_P1
&& (rnd_mode == ROUNDING_TO_NEAREST
|| rnd_mode == ROUNDING_TIES_AWAY)) {
// overflow for RN
res.w[1] = x_sign | 0x7800000000000000ull; // +/-inf
res.w[0] = 0x0ull;
// set the inexact flag
*pfpsf |= INEXACT_EXCEPTION;
// set the overflow flag
*pfpsf |= OVERFLOW_EXCEPTION;
BID_SWAP128 (res);
BID_RETURN (res);
}
// if (0 < f* - 1/2 < 10^(-x)) then
// the result of the addition is exact
// else
// the result of the addition is inexact
if (P256.w[1] > 0x8000000000000000ull
    || (P256.w[1] == 0x8000000000000000ull
        && P256.w[0] > 0x0ull)) { // the result may be exact
tmp64 = P256.w[1] - 0x8000000000000000ull; // f* - 1/2
if ((tmp64 > 0x1999999999999999ull
|| (tmp64 == 0x1999999999999999ull
&& P256.w[0] >= 0x9999999999999999ull))) {
// set the inexact flag
*pfpsf |= INEXACT_EXCEPTION;
is_inexact = 1;
} // else the result is exact
} else { // the result is inexact
// set the inexact flag
*pfpsf |= INEXACT_EXCEPTION;
is_inexact = 1;
}
C1_hi = P256.w[3];
C1_lo = P256.w[2];
if (!is_midpoint_gt_even && !is_midpoint_lt_even) {
is_inexact_lt_midpoint = is_inexact
&& (P256.w[1] & 0x8000000000000000ull);
is_inexact_gt_midpoint = is_inexact
&& !(P256.w[1] & 0x8000000000000000ull);
}
// general correction from RN to RA, RM, RP, RZ;
// result uses y_exp
if (rnd_mode != ROUNDING_TO_NEAREST) {
if ((!x_sign
     && ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint)
         || ((rnd_mode == ROUNDING_TIES_AWAY
              || rnd_mode == ROUNDING_UP)
             && is_midpoint_gt_even)))
    || (x_sign
        && ((rnd_mode == ROUNDING_DOWN && is_inexact_lt_midpoint)
            || ((rnd_mode == ROUNDING_TIES_AWAY
                 || rnd_mode == ROUNDING_DOWN)
                && is_midpoint_gt_even)))) {
// C1 = C1 + 1
C1_lo = C1_lo + 1;
if (C1_lo == 0) { // rounding overflow in the low 64 bits
C1_hi = C1_hi + 1;
}
if (C1_hi == 0x0001ed09bead87c0ull
&& C1_lo == 0x378d8e6400000000ull) {
// C1 = 10^34 => rounding overflow
C1_hi = 0x0000314dc6448d93ull;
C1_lo = 0x38c15b0a00000000ull; // 10^33
y_exp = y_exp + EXP_P1;
}
} else if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
           && ((x_sign
                && (rnd_mode == ROUNDING_UP
                    || rnd_mode == ROUNDING_TO_ZERO))
               || (!x_sign
                   && (rnd_mode == ROUNDING_DOWN
                       || rnd_mode == ROUNDING_TO_ZERO)))) {
// C1 = C1 - 1
C1_lo = C1_lo - 1;
if (C1_lo == 0xffffffffffffffffull)
C1_hi--;
// check if we crossed into the lower decade
if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
C1_hi = 0x0001ed09bead87c0ull; // 10^34 - 1
C1_lo = 0x378d8e63ffffffffull;
y_exp = y_exp - EXP_P1;
// no underflow, because delta + q2 >= P34 + 1
}
} else {
; // exact, the result is already correct
}
// in all cases check for overflow (RN and RA solved already)
if (y_exp == EXP_MAX_P1) { // overflow
if ((rnd_mode == ROUNDING_DOWN && x_sign) || // RM and res < 0
(rnd_mode == ROUNDING_UP && !x_sign)) { // RP and res > 0
C1_hi = 0x7800000000000000ull; // +inf
C1_lo = 0x0ull;
} else { // RM and res > 0, RP and res < 0, or RZ
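// largest finite number: coefficient 10^34 - 1 with the maximum
// exponent; the exponent field is already folded into C1_hi, which
// is why y_exp is zeroed just below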
C1_hi = 0x5fffed09bead87c0ull;
C1_lo = 0x378d8e63ffffffffull;
}
y_exp = 0; // x_sign is preserved
// set the inexact flag (in case the exact addition was exact)
*pfpsf |= INEXACT_EXCEPTION;
// set the overflow flag
*pfpsf |= OVERFLOW_EXCEPTION;
}
}
} // else if (C1 < 10^34) then C1 is the coeff.; the result is exact
} else { // if x_sign != y_sign the result is exact
C1_lo = C1_lo - C2_lo;
C1_hi = C1_hi - C2_hi;
if (C1_lo > C1.w[0])
C1_hi--;
// the result can be zero, but it cannot overflow
if (C1_lo == 0 && C1_hi == 0) {
// assemble the result
if (x_exp < y_exp)
res.w[1] = x_exp;
else
res.w[1] = y_exp;
res.w[0] = 0;
if (rnd_mode == ROUNDING_DOWN) {
res.w[1] |= 0x8000000000000000ull;
}
BID_SWAP128 (res);
BID_RETURN (res);
}
if (C1_hi >= 0x8000000000000000ull) { // negative coefficient!
C1_lo = ~C1_lo;
C1_lo++;
C1_hi = ~C1_hi;
if (C1_lo == 0x0)
C1_hi++;
x_sign = y_sign; // the result will have the sign of y
}
}
// assemble the result
res.w[1] = x_sign | y_exp | C1_hi;
res.w[0] = C1_lo;
} else { // if (delta >= P34 + 1 - q2)
// instead of C' = (C1 * 10^(e1-e2) + C2)rnd,P34
// calculate C' = C1 * 10^(e1-e2-x1) + (C2 * 10^(-x1))rnd,P34
// where x1 = q1 + e1 - e2 - P34, 1 <= x1 <= P34 - 1
// In most cases C' will have P34 digits, and n = C' * 10^(e2+x1)
// If the result has P34+1 digits, redo the steps above with x1+1
// If the result has P34-1 digits or less, redo the steps above with
// x1-1 but only if initially x1 >= 1
// NOTE: these two steps can be improved, e.g. we could guess if
// P34+1 or P34-1 digits will be obtained by adding/subtracting just
// the top 64 bits of the two operands
// The result cannot be zero, but it can overflow
x1 = delta + q2 - P34; // 1 <= x1 <= P34-1
roundC2:
// Calculate C1 * 10^(e1-e2-x1) where 0 <= e1-e2-x1 <= P34 - 1
// scale = (int)(e1 >> 49) - (int)(e2 >> 49) - x1; 0 <= scale <= P34-1
scale = delta - q1 + q2 - x1; // scale = e1 - e2 - x1 = P34 - q1
// either C1 or 10^(e1-e2-x1) may not fit in 64 bits,
// but their product fits with certainty in 128 bits (actually in 113)
if (scale >= 20) { //10^(e1-e2-x1) doesn't fit in 64 bits, but C1 does
__mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
} else if (scale >= 1) {
// if 1 <= scale <= 19 then 10^(e1-e2-x1) fits in 64 bits
if (q1 <= 19) { // C1 fits in 64 bits
__mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
} else { // q1 >= 20
C1.w[1] = C1_hi;
C1.w[0] = C1_lo;
__mul_128x64_to_128 (C1, ten2k64[scale], C1);
}
} else { // if (scale == 0) C1 is unchanged
C1.w[1] = C1_hi;
C1.w[0] = C1_lo;
}
tmp64 = C1.w[0]; // C1.w[1], C1.w[0] contains C1 * 10^(e1-e2-x1)
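// (tmp64 snapshots the low 64 bits of the scaled C1 so that the
// add/subtract further below can detect a carry or borrow by
// comparing the updated low word against it.)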
// now round C2 to q2-x1 decimal digits, where 1<=x1<=q2-1<=P34-1
// (but if we got here a second time after x1 = x1 - 1, then
// x1 >= 0; note that for x1 = 0 C2 is unchanged)
// C2' = C2 + 1/2 * 10^x1 = C2 + 5 * 10^(x1-1)
ind = x1 - 1; // 0 <= ind <= q2-2<=P34-2=32; but note that if x1 = 0
// during a second pass, then ind = -1
if (ind >= 0) { // if (x1 >= 1)
C2.w[0] = C2_lo;
C2.w[1] = C2_hi;
if (ind <= 18) {
C2.w[0] = C2.w[0] + midpoint64[ind];
if (C2.w[0] < C2_lo)
C2.w[1]++;
} else { // 19 <= ind <= 32
C2.w[0] = C2.w[0] + midpoint128[ind - 19].w[0];
C2.w[1] = C2.w[1] + midpoint128[ind - 19].w[1];
if (C2.w[0] < C2_lo)
C2.w[1]++;
}
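// (midpoint64[ind] and midpoint128[ind - 19] hold 5 * 10^ind, i.e.
// one half unit in the x1-th decimal position, so the additions above
// compute C2 + 1/2 * 10^x1 with explicit carry propagation.)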
// the approximation of 10^(-x1) was rounded up to 118 bits
__mul_128x128_to_256 (R256, C2, ten2mk128[ind]); // R256 = C2*, f2*
// calculate C2* and f2*
// C2* is actually floor(C2*) in this case
// C2* and f2* need shifting and masking, as shown by
// shiftright128[] and maskhigh128[]
// the top Ex bits of 10^(-x1) are T* = ten2mk128trunc[ind], e.g.
// if x1=1, T*=ten2mk128trunc[0]=0x19999999999999999999999999999999
// if (0 < f2* < 10^(-x1)) then
// if floor(C1+C2*) is even then C2* = floor(C2*) - logical right
// shift; C2* has p decimal digits, correct by Prop. 1)
// else if floor(C1+C2*) is odd C2* = floor(C2*)-1 (logical right
// shift; C2* has p decimal digits, correct by Pr. 1)
// else
// C2* = floor(C2*) (logical right shift; C has p decimal digits,
// correct by Property 1)
// n = C2* * 10^(e2+x1)
if (ind <= 2) {
highf2star.w[1] = 0x0;
highf2star.w[0] = 0x0; // low f2* ok
} else if (ind <= 21) {
highf2star.w[1] = 0x0;
highf2star.w[0] = R256.w[2] & maskhigh128[ind]; // low f2* ok
} else {
highf2star.w[1] = R256.w[3] & maskhigh128[ind];
highf2star.w[0] = R256.w[2]; // low f2* is ok
}
// shift right C2* by Ex-128 = shiftright128[ind]
if (ind >= 3) {
shift = shiftright128[ind];
if (shift < 64) { // 3 <= shift <= 63
R256.w[2] =
(R256.w[2] >> shift) | (R256.w[3] << (64 - shift));
R256.w[3] = (R256.w[3] >> shift);
} else { // 66 <= shift <= 102
R256.w[2] = (R256.w[3] >> (shift - 64));
R256.w[3] = 0x0ULL;
}
}
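// (After this shift R256.w[3]:R256.w[2] hold floor(C2 * 10^(-x1));
// the Ex - 128 = shiftright128[ind] fraction bits shifted out belong
// to f2* and were already captured above in highf2star and in
// R256.w[1]:R256.w[0].)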
if (second_pass) {
is_inexact_lt_midpoint = 0;
is_inexact_gt_midpoint = 0;
is_midpoint_lt_even = 0;
is_midpoint_gt_even = 0;
}
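// (On a second pass the classification flags from the first rounding
// are stale: the difference is recomputed with a smaller x1, so the
// midpoint/inexact status must be rebuilt from scratch below.)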
// determine inexactness of the rounding of C2* (this may be
// followed by a second rounding only if we get P34+1
// decimal digits)
// if (0 < f2* - 1/2 < 10^(-x1)) then
// the result is exact
// else (if f2* - 1/2 > T* then)
// the result is inexact
if (ind <= 2) {
if (R256.w[1] > 0x8000000000000000ull ||
(R256.w[1] == 0x8000000000000000ull
&& R256.w[0] > 0x0ull)) {
// f2* > 1/2 and the result may be exact
tmp64A = R256.w[1] - 0x8000000000000000ull; // f* - 1/2
if ((tmp64A > ten2mk128trunc[ind].w[1]
|| (tmp64A == ten2mk128trunc[ind].w[1]
&& R256.w[0] >= ten2mk128trunc[ind].w[0]))) {
// set the inexact flag
// *pfpsf |= INEXACT_EXCEPTION;
tmp_inexact = 1; // may be set again during a second pass
// this rounding is applied to C2 only!
if (x_sign == y_sign)
is_inexact_lt_midpoint = 1;
else // if (x_sign != y_sign)
is_inexact_gt_midpoint = 1;
} // else the result is exact
// rounding down, unless a midpoint in [ODD, EVEN]
} else { // the result is inexact; f2* <= 1/2
// set the inexact flag
// *pfpsf |= INEXACT_EXCEPTION;
tmp_inexact = 1; // just in case we will round a second time
// rounding up, unless a midpoint in [EVEN, ODD]
// this rounding is applied to C2 only!
if (x_sign == y_sign)
is_inexact_gt_midpoint = 1;
else // if (x_sign != y_sign)
is_inexact_lt_midpoint = 1;
}
} else if (ind <= 21) { // if 3 <= ind <= 21
if (highf2star.w[1] > 0x0 || (highf2star.w[1] == 0x0
&& highf2star.w[0] >
onehalf128[ind])
|| (highf2star.w[1] == 0x0
&& highf2star.w[0] == onehalf128[ind]
&& (R256.w[1] || R256.w[0]))) {
// f2* > 1/2 and the result may be exact
// Calculate f2* - 1/2
tmp64A = highf2star.w[0] - onehalf128[ind];
tmp64B = highf2star.w[1];
if (tmp64A > highf2star.w[0])
tmp64B--;
if (tmp64B || tmp64A
|| R256.w[1] > ten2mk128trunc[ind].w[1]
|| (R256.w[1] == ten2mk128trunc[ind].w[1]
&& R256.w[0] > ten2mk128trunc[ind].w[0])) {
// set the inexact flag
// *pfpsf |= INEXACT_EXCEPTION;
tmp_inexact = 1; // may be set again during a second pass
// this rounding is applied to C2 only!
if (x_sign == y_sign)
is_inexact_lt_midpoint = 1;
else // if (x_sign != y_sign)
is_inexact_gt_midpoint = 1;
} // else the result is exact
} else { // the result is inexact; f2* <= 1/2
// set the inexact flag
// *pfpsf |= INEXACT_EXCEPTION;
tmp_inexact = 1; // may be set again during a second pass
// rounding up, unless a midpoint in [EVEN, ODD]
// this rounding is applied to C2 only!
if (x_sign == y_sign)
is_inexact_gt_midpoint = 1;
else // if (x_sign != y_sign)
is_inexact_lt_midpoint = 1;
}
} else { // if 22 <= ind <= 33
if (highf2star.w[1] > onehalf128[ind]
|| (highf2star.w[1] == onehalf128[ind]
&& (highf2star.w[0] || R256.w[1]
|| R256.w[0]))) {
// f2* > 1/2 and the result may be exact
// Calculate f2* - 1/2
// tmp64A = highf2star.w[0];
tmp64B = highf2star.w[1] - onehalf128[ind];
if (tmp64B || highf2star.w[0]
|| R256.w[1] > ten2mk128trunc[ind].w[1]
|| (R256.w[1] == ten2mk128trunc[ind].w[1]
&& R256.w[0] > ten2mk128trunc[ind].w[0])) {
// set the inexact flag
// *pfpsf |= INEXACT_EXCEPTION;
tmp_inexact = 1; // may be set again during a second pass
// this rounding is applied to C2 only!
if (x_sign == y_sign)
is_inexact_lt_midpoint = 1;
else // if (x_sign != y_sign)
is_inexact_gt_midpoint = 1;
} // else the result is exact
} else { // the result is inexact; f2* <= 1/2
// set the inexact flag
// *pfpsf |= INEXACT_EXCEPTION;
tmp_inexact = 1; // may be set again during a second pass
// rounding up, unless a midpoint in [EVEN, ODD]
// this rounding is applied to C2 only!
if (x_sign == y_sign)
is_inexact_gt_midpoint = 1;
else // if (x_sign != y_sign)
is_inexact_lt_midpoint = 1;
}
}
// check for midpoints
if ((R256.w[1] || R256.w[0]) && (highf2star.w[1] == 0)
&& (highf2star.w[0] == 0)
&& (R256.w[1] < ten2mk128trunc[ind].w[1]
|| (R256.w[1] == ten2mk128trunc[ind].w[1]
&& R256.w[0] <= ten2mk128trunc[ind].w[0]))) {
// the result is a midpoint
if ((tmp64 + R256.w[2]) & 0x01) { // MP in [EVEN, ODD]
// if floor(C2*) is odd C = floor(C2*) - 1; the result may be 0
R256.w[2]--;
if (R256.w[2] == 0xffffffffffffffffull)
R256.w[3]--;
// this rounding is applied to C2 only!
if (x_sign == y_sign)
is_midpoint_gt_even = 1;
else // if (x_sign != y_sign)
is_midpoint_lt_even = 1;
is_inexact_lt_midpoint = 0;
is_inexact_gt_midpoint = 0;
} else {
// else MP in [ODD, EVEN]
// this rounding is applied to C2 only!
if (x_sign == y_sign)
is_midpoint_lt_even = 1;
else // if (x_sign != y_sign)
is_midpoint_gt_even = 1;
is_inexact_lt_midpoint = 0;
is_inexact_gt_midpoint = 0;
}
}
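// (The parity test above uses tmp64 + R256.w[2]: the parities of a
// sum and of a difference agree mod 2, so this is the parity of the
// final coefficient's low word in both the addition and the
// subtraction case, which is what round-half-even must examine.)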
// end if (ind >= 0)
} else { // if (ind == -1); only during a 2nd pass, and when x1 = 0
R256.w[2] = C2_lo;
R256.w[3] = C2_hi;
tmp_inexact = 0;
// to correct a possible setting to 1 from 1st pass
if (second_pass) {
is_midpoint_lt_even = 0;
is_midpoint_gt_even = 0;
is_inexact_lt_midpoint = 0;
is_inexact_gt_midpoint = 0;
}
}
// and now add/subtract C1 * 10^(e1-e2-x1) +/- (C2 * 10^(-x1))rnd,P34
if (x_sign == y_sign) { // addition; could overflow
// a second pass is not possible on this path (it can occur only
// when x_sign != y_sign)
C1.w[0] = C1.w[0] + R256.w[2];
C1.w[1] = C1.w[1] + R256.w[3];
if (C1.w[0] < tmp64)
C1.w[1]++; // carry
// if the sum has P34+1 digits, i.e. C1>=10^34 redo the calculation
// with x1=x1+1
if (C1.w[1] > 0x0001ed09bead87c0ull
    || (C1.w[1] == 0x0001ed09bead87c0ull
        && C1.w[0] >= 0x378d8e6400000000ull)) { // C1 >= 10^34
// chop off one more digit from the sum, but make sure there is
// no double-rounding error (see table - double rounding logic)
// now round C1 from P34+1 to P34 decimal digits
// C1' = C1 + 1/2 * 10 = C1 + 5
if (C1.w[0] >= 0xfffffffffffffffbull) { // low half add has carry
C1.w[0] = C1.w[0] + 5;
C1.w[1] = C1.w[1] + 1;
} else {
C1.w[0] = C1.w[0] + 5;
}
// the approximation of 10^(-1) was rounded up to 118 bits
__mul_128x128_to_256 (Q256, C1, ten2mk128[0]); // Q256 = C1*, f1*
// C1* is actually floor(C1*) in this case
// the top 128 bits of 10^(-1) are
// T* = ten2mk128trunc[0]=0x19999999999999999999999999999999
// if (0 < f1* < 10^(-1)) then
// if floor(C1*) is even then C1* = floor(C1*) - logical right
// shift; C1* has p decimal digits, correct by Prop. 1)
// else if floor(C1*) is odd C1* = floor(C1*) - 1 (logical right
// shift; C1* has p decimal digits, correct by Pr. 1)
// else
// C1* = floor(C1*) (logical right shift; C has p decimal digits
// correct by Property 1)
// n = C1* * 10^(e2+x1+1)
if ((Q256.w[1] || Q256.w[0])
&& (Q256.w[1] < ten2mk128trunc[0].w[1]
|| (Q256.w[1] == ten2mk128trunc[0].w[1]
&& Q256.w[0] <= ten2mk128trunc[0].w[0]))) {
// the result is a midpoint
if (is_inexact_lt_midpoint) { // for the 1st rounding
is_inexact_gt_midpoint = 1;
is_inexact_lt_midpoint = 0;
is_midpoint_gt_even = 0;
is_midpoint_lt_even = 0;
} else if (is_inexact_gt_midpoint) { // for the 1st rounding
Q256.w[2]--;
if (Q256.w[2] == 0xffffffffffffffffull)
Q256.w[3]--;
is_inexact_gt_midpoint = 0;
is_inexact_lt_midpoint = 1;
is_midpoint_gt_even = 0;
is_midpoint_lt_even = 0;
} else if (is_midpoint_gt_even) { // for the 1st rounding
// Note: cannot have is_midpoint_lt_even
is_inexact_gt_midpoint = 0;
is_inexact_lt_midpoint = 1;
is_midpoint_gt_even = 0;
is_midpoint_lt_even = 0;
} else { // the first rounding must have been exact
if (Q256.w[2] & 0x01) { // MP in [EVEN, ODD]
// the truncated result is correct
Q256.w[2]--;
if (Q256.w[2] == 0xffffffffffffffffull)
Q256.w[3]--;
is_inexact_gt_midpoint = 0;
is_inexact_lt_midpoint = 0;
is_midpoint_gt_even = 1;
is_midpoint_lt_even = 0;
} else { // MP in [ODD, EVEN]
is_inexact_gt_midpoint = 0;
is_inexact_lt_midpoint = 0;
is_midpoint_gt_even = 0;
is_midpoint_lt_even = 1;
}
}
tmp_inexact = 1; // in all cases
} else { // the result is not a midpoint
// determine inexactness of the rounding of C1 (the sum C1+C2*)
// if (0 < f1* - 1/2 < 10^(-1)) then
// the result is exact
// else (if f1* - 1/2 > T* then)
// the result is inexact
// ind = 0
if (Q256.w[1] > 0x8000000000000000ull
|| (Q256.w[1] == 0x8000000000000000ull
&& Q256.w[0] > 0x0ull)) {
// f1* > 1/2 and the result may be exact
Q256.w[1] = Q256.w[1] - 0x8000000000000000ull; // f1* - 1/2
if ((Q256.w[1] > ten2mk128trunc[0].w[1]
|| (Q256.w[1] == ten2mk128trunc[0].w[1]
&& Q256.w[0] > ten2mk128trunc[0].w[0]))) {
is_inexact_gt_midpoint = 0;
is_inexact_lt_midpoint = 1;
is_midpoint_gt_even = 0;
is_midpoint_lt_even = 0;
// set the inexact flag
tmp_inexact = 1;
// *pfpsf |= INEXACT_EXCEPTION;
} else { // else the result is exact for the 2nd rounding
if (tmp_inexact) { // if the previous rounding was inexact
if (is_midpoint_lt_even) {
is_inexact_gt_midpoint = 1;
is_midpoint_lt_even = 0;
} else if (is_midpoint_gt_even) {
is_inexact_lt_midpoint = 1;
is_midpoint_gt_even = 0;
} else {
; // no change
}
}
}
// rounding down, unless a midpoint in [ODD, EVEN]
} else { // the result is inexact; f1* <= 1/2
is_inexact_gt_midpoint = 1;
is_inexact_lt_midpoint = 0;
is_midpoint_gt_even = 0;
is_midpoint_lt_even = 0;
// set the inexact flag
tmp_inexact = 1;
// *pfpsf |= INEXACT_EXCEPTION;
}
} // end 'the result is not a midpoint'
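// (The block above reconciles double rounding: the flags from the
// first rounding of C2 and from this second rounding of the P34+1
// digit sum are merged so that they describe the exact result as if
// a single rounding to P34 digits had taken place.)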
// n = C1 * 10^(e2+x1)
C1.w[1] = Q256.w[3];
C1.w[0] = Q256.w[2];
y_exp = y_exp + ((UINT64) (x1 + 1) << 49);
} else { // C1 < 10^34
// C1.w[1] and C1.w[0] already set
// n = C1 * 10^(e2+x1)
y_exp = y_exp + ((UINT64) x1 << 49);
}
// check for overflow
if (y_exp == EXP_MAX_P1
&& (rnd_mode == ROUNDING_TO_NEAREST
|| rnd_mode == ROUNDING_TIES_AWAY)) {
res.w[1] = 0x7800000000000000ull | x_sign; // +/-inf
res.w[0] = 0x0ull;
// set the inexact flag
*pfpsf |= INEXACT_EXCEPTION;
// set the overflow flag
*pfpsf |= OVERFLOW_EXCEPTION;
BID_SWAP128 (res);
BID_RETURN (res);
} // else no overflow
} else { // if x_sign != y_sign the result of this subtract. is exact
C1.w[0] = C1.w[0] - R256.w[2];
C1.w[1] = C1.w[1] - R256.w[3];
if (C1.w[0] > tmp64)
C1.w[1]--; // borrow
if (C1.w[1] >= 0x8000000000000000ull) { // negative coefficient!
C1.w[0] = ~C1.w[0];
C1.w[0]++;
C1.w[1] = ~C1.w[1];
if (C1.w[0] == 0x0)
C1.w[1]++;
tmp_sign = y_sign;
// the result will have the sign of y (if this is the final pass)
} else {
tmp_sign = x_sign;
}
// if the difference has P34-1 digits or less, i.e. C1 < 10^33 then
// redo the calculation with x1=x1-1;
// redo the calculation also if C1 = 10^33 and
// (is_inexact_gt_midpoint or is_midpoint_lt_even);
// (the last part should have really been
// (is_inexact_lt_midpoint or is_midpoint_gt_even) from
// the rounding of C2, but the position flags have been reversed)
// 10^33 = 0x0000314dc6448d93 0x38c15b0a00000000
if ((C1.w[1] < 0x0000314dc6448d93ull
     || (C1.w[1] == 0x0000314dc6448d93ull
         && C1.w[0] < 0x38c15b0a00000000ull)) // C1 < 10^33
    || (C1.w[1] == 0x0000314dc6448d93ull
        && C1.w[0] == 0x38c15b0a00000000ull
        && (is_inexact_gt_midpoint || is_midpoint_lt_even))) { // C1 = 10^33
x1 = x1 - 1; // x1 >= 0
if (x1 >= 0) {
// clear position flags and tmp_inexact
is_midpoint_lt_even = 0;
is_midpoint_gt_even = 0;
is_inexact_lt_midpoint = 0;
is_inexact_gt_midpoint = 0;
tmp_inexact = 0;
second_pass = 1;
goto roundC2; // else result has less than P34 digits
}
}
// if the coefficient of the result is 10^34 it means that this
// must be the second pass, and we are done
if (C1.w[1] == 0x0001ed09bead87c0ull && C1.w[0] == 0x378d8e6400000000ull) { // if C1 = 10^34
C1.w[1] = 0x0000314dc6448d93ull; // C1 = 10^33
C1.w[0] = 0x38c15b0a00000000ull;
y_exp = y_exp + ((UINT64) 1 << 49);
}
x_sign = tmp_sign;
if (x1 >= 1)
y_exp = y_exp + ((UINT64) x1 << 49);
// x1 = -1 is possible at the end of a second pass when the
// first pass started with x1 = 1
}
C1_hi = C1.w[1];
C1_lo = C1.w[0];
// general correction from RN to RA, RM, RP, RZ; result uses y_exp
if (rnd_mode != ROUNDING_TO_NEAREST) {
if ((!x_sign
     && ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint)
         || ((rnd_mode == ROUNDING_TIES_AWAY
              || rnd_mode == ROUNDING_UP)
             && is_midpoint_gt_even)))
    || (x_sign
        && ((rnd_mode == ROUNDING_DOWN && is_inexact_lt_midpoint)
            || ((rnd_mode == ROUNDING_TIES_AWAY
                 || rnd_mode == ROUNDING_DOWN)
                && is_midpoint_gt_even)))) {
// C1 = C1 + 1
C1_lo = C1_lo + 1;
if (C1_lo == 0) { // rounding overflow in the low 64 bits
C1_hi = C1_hi + 1;
}
if (C1_hi == 0x0001ed09bead87c0ull
&& C1_lo == 0x378d8e6400000000ull) {
// C1 = 10^34 => rounding overflow
C1_hi = 0x0000314dc6448d93ull;
C1_lo = 0x38c15b0a00000000ull; // 10^33
y_exp = y_exp + EXP_P1;
}
} else if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
&&
((x_sign
&& (rnd_mode == ROUNDING_UP
|| rnd_mode == ROUNDING_TO_ZERO))
|| (!x_sign
&& (rnd_mode == ROUNDING_DOWN
|| rnd_mode == ROUNDING_TO_ZERO)))) {
// C1 = C1 - 1
C1_lo = C1_lo - 1;
if (C1_lo == 0xffffffffffffffffull)
C1_hi--;
// check if we crossed into the lower decade
if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
C1_hi = 0x0001ed09bead87c0ull; // 10^34 - 1
C1_lo = 0x378d8e63ffffffffull;
y_exp = y_exp - EXP_P1;
// no underflow, because delta + q2 >= P34 + 1
}
} else {
; // exact, the result is already correct
}
// in all cases check for overflow (RN and RA solved already)
if (y_exp == EXP_MAX_P1) { // overflow
if ((rnd_mode == ROUNDING_DOWN && x_sign) || // RM and res < 0
(rnd_mode == ROUNDING_UP && !x_sign)) { // RP and res > 0
C1_hi = 0x7800000000000000ull; // +inf
C1_lo = 0x0ull;
} else { // RM and res > 0, RP and res < 0, or RZ
C1_hi = 0x5fffed09bead87c0ull;
C1_lo = 0x378d8e63ffffffffull;
}
y_exp = 0; // x_sign is preserved
// set the inexact flag (in case the exact addition was exact)
*pfpsf |= INEXACT_EXCEPTION;
// set the overflow flag
*pfpsf |= OVERFLOW_EXCEPTION;
}
}
// assemble the result
res.w[1] = x_sign | y_exp | C1_hi;
res.w[0] = C1_lo;
if (tmp_inexact)
*pfpsf |= INEXACT_EXCEPTION;
}
} else { // if (-P34 + 1 <= delta <= -1) <=> 1 <= -delta <= P34 - 1
// NOTE: the following, up to "} else { // if x_sign != y_sign
// the result is exact" is identical to "else if (delta == P34 - q2) {"
// from above; also, the code is not symmetric: a+b and b+a may take
// different paths (need to unify eventually!)
// calculate C' = C2 + C1 * 10^(e1-e2) directly; the result may be
// inexact if it requires P34 + 1 decimal digits; in either case the
// 'cutoff' point for addition is at the position of the lsb of C2
// The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
// exponent is e2; either C1 or 10^(e1-e2) may not fit in 64 bits,
// but their product fits with certainty in 128 bits (actually in 113)
// Note that 0 <= e1 - e2 <= P34 - 2
// -P34 + 1 <= delta <= -1 <=>
// -P34 + 1 <= q1 + e1 - q2 - e2 <= -1 <=>
// q2 - q1 - P34 + 1 <= e1 - e2 <= q2 - q1 - 1, so in particular
// e1 - e2 <= P34 - 2; together with e1 >= e2 (the operands were
// ordered earlier so that e1 >= e2) this gives 0 <= e1-e2 <= P34 - 2
scale = delta - q1 + q2; // scale = (int)(e1 >> 49) - (int)(e2 >> 49)
if (scale >= 20) { // 10^(e1-e2) does not fit in 64 bits, but C1 does
__mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
} else if (scale >= 1) {
// if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
if (q1 <= 19) { // C1 fits in 64 bits
__mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
} else { // q1 >= 20
C1.w[1] = C1_hi;
C1.w[0] = C1_lo;
__mul_128x64_to_128 (C1, ten2k64[scale], C1);
}
} else { // if (scale == 0) C1 is unchanged
C1.w[1] = C1_hi;
C1.w[0] = C1_lo; // only the low part is necessary
}
C1_hi = C1.w[1];
C1_lo = C1.w[0];
// now add C2
if (x_sign == y_sign) {
// the result can overflow!
C1_lo = C1_lo + C2_lo;
C1_hi = C1_hi + C2_hi;
if (C1_lo < C1.w[0])
C1_hi++;
// test for overflow, possible only when C1 >= 10^34
if (C1_hi > 0x0001ed09bead87c0ull
    || (C1_hi == 0x0001ed09bead87c0ull
        && C1_lo >= 0x378d8e6400000000ull)) { // C1 >= 10^34
// in this case q = P34 + 1 and x = q - P34 = 1, so multiply
// C'' = C'+ 5 = C1 + 5 by k1 ~ 10^(-1) calculated for P34 + 1
// decimal digits
// Calculate C'' = C' + 1/2 * 10^x
if (C1_lo >= 0xfffffffffffffffbull) { // low half add has carry
C1_lo = C1_lo + 5;
C1_hi = C1_hi + 1;
} else {
C1_lo = C1_lo + 5;
}
// the approximation of 10^(-1) was rounded up to 118 bits
// 10^(-1) =~ 33333333333333333333333333333400 * 2^-129
// 10^(-1) =~ 19999999999999999999999999999a00 * 2^-128
C1.w[1] = C1_hi;
C1.w[0] = C1_lo; // C''
ten2m1.w[1] = 0x1999999999999999ull;
ten2m1.w[0] = 0x9999999999999a00ull;
__mul_128x128_to_256 (P256, C1, ten2m1); // P256 = C*, f*
// C* is actually floor(C*) in this case
// the top Ex = 128 bits of 10^(-1) are
// T* = 0x00199999999999999999999999999999
// if (0 < f* < 10^(-x)) then
// if floor(C*) is even then C = floor(C*) - logical right
// shift; C has p decimal digits, correct by Prop. 1)
// else if floor(C*) is odd C = floor(C*) - 1 (logical right
// shift; C has p decimal digits, correct by Pr. 1)
// else
// C = floor(C*) (logical right shift; C has p decimal digits,
// correct by Property 1)
// n = C * 10^(e2+x)
if ((P256.w[1] || P256.w[0])
&& (P256.w[1] < 0x1999999999999999ull
|| (P256.w[1] == 0x1999999999999999ull
&& P256.w[0] <= 0x9999999999999999ull))) {
// the result is a midpoint
if (P256.w[2] & 0x01) {
is_midpoint_gt_even = 1;
// if floor(C*) is odd C = floor(C*) - 1; the result is not 0
P256.w[2]--;
if (P256.w[2] == 0xffffffffffffffffull)
P256.w[3]--;
} else {
is_midpoint_lt_even = 1;
}
}
// n = Cstar * 10^(e2+1)
y_exp = y_exp + EXP_P1;
// C* != 10^P34 because C* has P34 digits
// check for overflow
if (y_exp == EXP_MAX_P1
&& (rnd_mode == ROUNDING_TO_NEAREST
|| rnd_mode == ROUNDING_TIES_AWAY)) {
// overflow for RN
res.w[1] = x_sign | 0x7800000000000000ull; // +/-inf
res.w[0] = 0x0ull;
// set the inexact flag
*pfpsf |= INEXACT_EXCEPTION;
// set the overflow flag
*pfpsf |= OVERFLOW_EXCEPTION;
BID_SWAP128 (res);
BID_RETURN (res);
}
// if (0 < f* - 1/2 < 10^(-x)) then
// the result of the addition is exact
// else
// the result of the addition is inexact
if (P256.w[1] > 0x8000000000000000ull
    || (P256.w[1] == 0x8000000000000000ull
        && P256.w[0] > 0x0ull)) { // the result may be exact
tmp64 = P256.w[1] - 0x8000000000000000ull; // f* - 1/2
if ((tmp64 > 0x1999999999999999ull
|| (tmp64 == 0x1999999999999999ull
&& P256.w[0] >= 0x9999999999999999ull))) {
// set the inexact flag
*pfpsf |= INEXACT_EXCEPTION;
is_inexact = 1;
} // else the result is exact
} else { // the result is inexact
// set the inexact flag
*pfpsf |= INEXACT_EXCEPTION;
is_inexact = 1;
}
C1_hi = P256.w[3];
C1_lo = P256.w[2];
if (!is_midpoint_gt_even && !is_midpoint_lt_even) {
is_inexact_lt_midpoint = is_inexact
&& (P256.w[1] & 0x8000000000000000ull);
is_inexact_gt_midpoint = is_inexact
&& !(P256.w[1] & 0x8000000000000000ull);
}
// general correction from RN to RA, RM, RP, RZ; result uses y_exp
if (rnd_mode != ROUNDING_TO_NEAREST) {
if ((!x_sign
     && ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint)
         || ((rnd_mode == ROUNDING_TIES_AWAY
              || rnd_mode == ROUNDING_UP)
             && is_midpoint_gt_even)))
    || (x_sign
        && ((rnd_mode == ROUNDING_DOWN && is_inexact_lt_midpoint)
            || ((rnd_mode == ROUNDING_TIES_AWAY
                 || rnd_mode == ROUNDING_DOWN)
                && is_midpoint_gt_even)))) {
// C1 = C1 + 1
C1_lo = C1_lo + 1;
if (C1_lo == 0) { // rounding overflow in the low 64 bits
C1_hi = C1_hi + 1;
}
if (C1_hi == 0x0001ed09bead87c0ull
&& C1_lo == 0x378d8e6400000000ull) {
// C1 = 10^34 => rounding overflow
C1_hi = 0x0000314dc6448d93ull;
C1_lo = 0x38c15b0a00000000ull; // 10^33
y_exp = y_exp + EXP_P1;
}
} else if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
           && ((x_sign && (rnd_mode == ROUNDING_UP
                           || rnd_mode == ROUNDING_TO_ZERO))
               || (!x_sign && (rnd_mode == ROUNDING_DOWN
                               || rnd_mode == ROUNDING_TO_ZERO)))) {
// C1 = C1 - 1
C1_lo = C1_lo - 1;
if (C1_lo == 0xffffffffffffffffull)
C1_hi--;
// check if we crossed into the lower decade
if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
C1_hi = 0x0001ed09bead87c0ull; // 10^34 - 1
C1_lo = 0x378d8e63ffffffffull;
y_exp = y_exp - EXP_P1;
// no underflow, because delta + q2 >= P34 + 1
}
} else {
; // exact, the result is already correct
}
// in all cases check for overflow (RN and RA solved already)
if (y_exp == EXP_MAX_P1) { // overflow
if ((rnd_mode == ROUNDING_DOWN && x_sign) || // RM and res < 0
(rnd_mode == ROUNDING_UP && !x_sign)) { // RP and res > 0
C1_hi = 0x7800000000000000ull; // +inf
C1_lo = 0x0ull;
} else { // RM and res > 0, RP and res < 0, or RZ
C1_hi = 0x5fffed09bead87c0ull;
C1_lo = 0x378d8e63ffffffffull;
}
y_exp = 0; // x_sign is preserved
// set the inexact flag (in case the exact addition was exact)
*pfpsf |= INEXACT_EXCEPTION;
// set the overflow flag
*pfpsf |= OVERFLOW_EXCEPTION;
}
}
} // else if (C1 < 10^34) then C1 is the coeff.; the result is exact
// assemble the result
res.w[1] = x_sign | y_exp | C1_hi;
res.w[0] = C1_lo;
} else { // if x_sign != y_sign the result is exact
C1_lo = C2_lo - C1_lo;
C1_hi = C2_hi - C1_hi;
if (C1_lo > C2_lo)
C1_hi--;
if (C1_hi >= 0x8000000000000000ull) { // negative coefficient!
C1_lo = ~C1_lo;
C1_lo++;
C1_hi = ~C1_hi;
if (C1_lo == 0x0)
C1_hi++;
x_sign = y_sign; // the result will have the sign of y
}
// the result can be zero, but it cannot overflow
if (C1_lo == 0 && C1_hi == 0) {
// assemble the result
if (x_exp < y_exp)
res.w[1] = x_exp;
else
res.w[1] = y_exp;
res.w[0] = 0;
if (rnd_mode == ROUNDING_DOWN) {
res.w[1] |= 0x8000000000000000ull;
}
BID_SWAP128 (res);
BID_RETURN (res);
}
// assemble the result
res.w[1] = y_sign | y_exp | C1_hi;
res.w[0] = C1_lo;
}
}
}
BID_SWAP128 (res);
BID_RETURN (res);
}
}
// bid128_sub stands for bid128qq_sub
/*****************************************************************************
* BID128 sub
****************************************************************************/
#if DECIMAL_CALL_BY_REFERENCE
void
bid128_sub (UINT128 * pres, UINT128 * px, UINT128 * py
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
UINT128 x = *px, y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT128
bid128_sub (UINT128 x, UINT128 y
_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
_EXC_INFO_PARAM) {
#endif
UINT128 res;
UINT64 y_sign;
if ((y.w[HIGH_128W] & MASK_NAN) != MASK_NAN) { // y is not NAN
// change its sign
y_sign = y.w[HIGH_128W] & MASK_SIGN; // 0 for positive, MASK_SIGN for negative
if (y_sign)
y.w[HIGH_128W] = y.w[HIGH_128W] & 0x7fffffffffffffffull;
else
y.w[HIGH_128W] = y.w[HIGH_128W] | 0x8000000000000000ull;
}
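// (If y is a NaN its encoding is passed through unchanged so that
// bid128_add can propagate the NaN operand as-is; otherwise flipping
// the sign bit turns x - y into x + (-y).)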
#if DECIMAL_CALL_BY_REFERENCE
bid128_add (&res, &x, &y
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#else
res = bid128_add (x, y
_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
_EXC_INFO_ARG);
#endif
BID_RETURN (res);
}
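// Illustrative sketch, not part of the library: in a build with
// DECIMAL_CALL_BY_REFERENCE = 0 and global rounding/status flags
// enabled, the trailing _RND_MODE_ARG/_EXC_FLAGS_ARG arguments
// vanish and a caller could exercise these entry points roughly as
// follows:
//
//   UINT128 a, b, sum, diff;
//   /* ... fill a and b with valid BID128 encodings ... */
//   sum  = bid128_add (a, b);
//   diff = bid128_sub (sum, b);
//   /* when both operations are exact, diff is numerically equal to
//      a, though possibly a different member of a's cohort */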