lib/nanohub/rsa.c - device/google/contexthub - Git at Google

 /*
  * Copyright (C) 2016 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include <stdint.h>
 #include <stdbool.h>
 #include <string.h>
 #include <nanohub/rsa.h>


 static bool biModIterative(uint32_t *num, const uint32_t *denum, uint32_t *tmp, uint32_t *state1, uint32_t *state2, uint32_t step)
 //num %= denum where num is RSA_LEN * 2 and denum is RSA_LEN and tmp is RSA_LEN + limb_sz
 //will need to be called till it returns true (up to RSA_LEN * 2 + 2 times)
 {
     uint32_t bitsh = *state1, limbsh = *state2;
     bool ret = false;
     int64_t t;
     int32_t i;

     //first step is init
     if (!step) {
         //initially set it up left shifted as far as possible
         memcpy(tmp + 1, denum, RSA_BYTES);
         tmp[0] = 0;
         bitsh = 32;
         limbsh = RSA_LIMBS - 1;
         goto out;
     }

     //second is shifting denum
     if (step == 1) {
         while (!(tmp[RSA_LIMBS] & 0x80000000)) {
             for (i = RSA_LIMBS; i > 0; i--) {
                 tmp[i] <<= 1;
                 if (tmp[i - 1] & 0x80000000)
                     tmp[i]++;
             }
             //no need to adjust tmp[0] as it is still zero
             bitsh++;
         }
         goto out;
     }

     //all future steps do the division

     //check if we should subtract (uses less space than subtracting and unroling it later)
     for (i = RSA_LIMBS; i >= 0; i--) {
         if (num[limbsh + i] < tmp[i])
             goto dont_subtract;
         if (num[limbsh + i] > tmp[i])
             break;
     }

     //subtract
     t = 0;
     for (i = 0; i <= RSA_LIMBS; i++) {
         t += (uint64_t)num[limbsh + i];
         t -= (uint64_t)tmp[i];
         num[limbsh + i] = t;
         t >>= 32;
     }

     //carry the subtraction's carry to the end
     for (i = RSA_LIMBS + limbsh + 1; i < RSA_LIMBS * 2; i++) {
         t += (uint64_t)num[i];
         num[i] = t;
         t >>= 32;
     }

 dont_subtract:
     //handle bitshifts/refills
     if (!bitsh) {                          // tmp = denum << 32
         if (!limbsh) {
             ret = true;
             goto out;
         }

         memcpy(tmp + 1, denum, RSA_BYTES);
         tmp[0] = 0;
         bitsh = 32;
         limbsh--;
     }
     else {                                 // tmp >>= 1
         for (i = 0; i < RSA_LIMBS; i++) {
             tmp[i] >>= 1;
             if (tmp[i + 1] & 1)
                 tmp[i] += 0x80000000;
         }
         tmp[i] >>= 1;
         bitsh--;
     }


 out:
     *state1 = bitsh;
     *state2 = limbsh;
     return ret;
 }

 static void biMulIterative(uint32_t *ret, const uint32_t *a, const uint32_t *b, uint32_t step) //ret = a * b, call with step = [0..RSA_LIMBS)
 {
     uint32_t j, c;
     uint64_t r;

     //zero the result on first call
     if (!step)
         memset(ret, 0, RSA_BYTES * 2);

     //produce a partial sum & add it in
     c = 0;
     for (j = 0; j < RSA_LIMBS; j++) {
         r = (uint64_t)a[step] * b[j] + c + ret[step + j];
         ret[step + j] = r;
         c = r >> 32;
     }

     //carry the carry to the end
     for (j = step + RSA_LIMBS; j < RSA_LIMBS * 2; j++) {
         r = (uint64_t)ret[j] + c;
         ret[j] = r;
         c = r >> 32;
     }
 }

 /*
  * Piecewise RSA:
  * normal RSA public op with 65537 exponent does 34 operations. 17 muls and 17 mods, as follows:
  * 16x {mul, mod} to calculate a ^ 65536 mod c
  * 1x {mul, mod} to calculate a ^ 65537 mod c
  * we break up each mul and mod itself into more steps. mul needs RSA_LIMBS steps, and mod needs up to RSA_LEN * 2 + 2 steps
  * so if we allocate RSA_LEN * 3 step values to mod, each mul-mod pair will use <= RSA_LEN * 4 step values
  * and the whole opetaion will need <= RSA_LEN * 4 * 34 step values, which fits into a uint32. cool. In fact
  * some values will be skipped, but this makes life easier, really. Call this func with *stepP = 0, and keep calling till
  * output stepP is zero. We'll call each of the RSA_LEN * 4 pieces a gigastep, and have 17 of them as seen above. Each
  * will be logically separated into 4 megasteps. First will contain the MUL, last 3 the MOD and maybe the memcpy.
  * In the first 16 gigasteps, the very last step of the gigastep will be used for the memcpy call.
  *
  * The initial non-iterative RSA logic looks as follows, shown here for clarity:
  *
  *   memcpy(state->tmpB, a, RSA_BYTES);
  *   for (i = 0; i < 16; i++) {
  *       biMul(state->tmpA, state->tmpB, state->tmpB);
  *       biMod(state->tmpA, c, state->tmpB);
  *       memcpy(state->tmpB, state->tmpA, RSA_BYTES);
  *   }
  *
  *   //calculate a ^ 65537 mod c into state->tmpA [ at this point this means do state->tmpA = (state->tmpB * a) % c ]
  *   biMul(state->tmpA, state->tmpB, a);
  *   biMod(state->tmpA, c, state->tmpB);
  *
  *   //return result
  *   return state->tmpA;
  *
  */

 const uint32_t* rsaPubOpIterative(struct RsaState* state, const uint32_t *a, const uint32_t *c, uint32_t *state1, uint32_t *state2, uint32_t *stepP)
 {
     uint32_t step = *stepP, gigastep, gigastepBase, gigastepSubstep, megaSubstep;

     //step 0: copy a -> tmpB
     if (!step) {
         memcpy(state->tmpB, a, RSA_BYTES);
         step = 1;
     }
     else { //subsequent steps: do real work


         gigastep = (step - 1) / (RSA_LEN * 4);
         gigastepSubstep = (step - 1) % (RSA_LEN * 4);
         gigastepBase = gigastep * (RSA_LEN * 4);
         megaSubstep = gigastepSubstep / RSA_LEN;

         if (!megaSubstep) { // first megastep of the gigastep - MUL
             biMulIterative(state->tmpA, state->tmpB, gigastep == 16 ? a : state->tmpB, gigastepSubstep);
             if (gigastepSubstep == RSA_LIMBS - 1) //MUL is done - do mod next
                 step = gigastepBase + RSA_LEN + 1;
             else                                  //More of MUL is left to do
                 step++;
         }
         else if (gigastepSubstep != RSA_LEN * 4 - 1){   // second part of gigastep - MOD
             if (biModIterative(state->tmpA, c, state->tmpB, state1, state2, gigastepSubstep - RSA_LEN)) { //MOD is done
                 if (gigastep == 16) // we're done
                     step = 0;
                 else              // last part of the gigastep is a copy
                     step = gigastepBase + RSA_LEN * 4 - 1 + 1;
             }
             else
                 step++;
         }
         else {   //last part - memcpy
             memcpy(state->tmpB, state->tmpA, RSA_BYTES);
             step++;
         }
     }

     *stepP = step;
     return state->tmpA;
 }

 #if defined(RSA_SUPPORT_PRIV_OP_LOWRAM) || defined (RSA_SUPPORT_PRIV_OP_BIGRAM)
 #include <stdio.h>
 const uint32_t* rsaPubOp(struct RsaState* state, const uint32_t *a, const uint32_t *c)
 {
     const uint32_t *ret;
     uint32_t state1 = 0, state2 = 0, step = 0, ns = 0;

     do {
         ret = rsaPubOpIterative(state, a, c, &state1, &state2, &step);
         ns++;
     } while(step);

 fprintf(stderr, "steps: %u\n", ns);

     return ret;
 }

 static void biMod(uint32_t *num, const uint32_t *denum, uint32_t *tmp)
 {
     uint32_t state1 = 0, state2 = 0, step;

     for (step = 0; !biModIterative(num, denum, tmp, &state1, &state2, step); step++);
 }

 static void biMul(uint32_t *ret, const uint32_t *a, const uint32_t *b)
 {
     uint32_t step;

     for (step = 0; step < RSA_LIMBS; step++)
         biMulIterative(ret, a, b, step);
 }

 const uint32_t* rsaPrivOp(struct RsaState* state, const uint32_t *a, const uint32_t *b, const uint32_t *c)
 {
     uint32_t i;

     memcpy(state->tmpC, a, RSA_BYTES);  //tC will hold our powers of a

     memset(state->tmpA, 0, RSA_BYTES * 2); //tA will hold result
     state->tmpA[0] = 1;

     for (i = 0; i < RSA_LEN; i++) {
         //if the bit is set, multiply the current power of A into result
         if (b[i / 32] & (1 << (i % 32))) {
             memcpy(state->tmpB, state->tmpA, RSA_BYTES);
             biMul(state->tmpA, state->tmpB, state->tmpC);
             biMod(state->tmpA, c, state->tmpB);
         }

         //calculate the next power of a and modulus it
 #if defined(RSA_SUPPORT_PRIV_OP_LOWRAM)
         memcpy(state->tmpB, state->tmpA, RSA_BYTES); //save tA
         biMul(state->tmpA, state->tmpC, state->tmpC);
         biMod(state->tmpA, c, state->tmpC);
         memcpy(state->tmpC, state->tmpA, RSA_BYTES);
         memcpy(state->tmpA, state->tmpB, RSA_BYTES); //restore tA
 #elif defined (RSA_SUPPORT_PRIV_OP_BIGRAM)
         memcpy(state->tmpB, state->tmpC, RSA_BYTES);
         biMul(state->tmpC, state->tmpB, state->tmpB);
         biMod(state->tmpC, c, state->tmpB);
 #endif
     }

     return state->tmpA;
 }
 #endif
	/*
	* Copyright (C) 2016 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include <stdint.h>
	#include <stdbool.h>
	#include <string.h>
	#include <nanohub/rsa.h>


	static bool biModIterative(uint32_t num, const uint32_t denum, uint32_t tmp, uint32_t state1, uint32_t *state2, uint32_t step)
	//num %= denum where num is RSA_LEN * 2 and denum is RSA_LEN and tmp is RSA_LEN + limb_sz
	//will need to be called till it returns true (up to RSA_LEN * 2 + 2 times)
	{
	uint32_t bitsh = state1, limbsh = state2;
	bool ret = false;
	int64_t t;
	int32_t i;

	//first step is init
	if (!step) {
	//initially set it up left shifted as far as possible
	memcpy(tmp + 1, denum, RSA_BYTES);
	tmp[0] = 0;
	bitsh = 32;
	limbsh = RSA_LIMBS - 1;
	goto out;
	}

	//second is shifting denum
	if (step == 1) {
	while (!(tmp[RSA_LIMBS] & 0x80000000)) {
	for (i = RSA_LIMBS; i > 0; i--) {
	tmp[i] <<= 1;
	if (tmp[i - 1] & 0x80000000)
	tmp[i]++;
	}
	//no need to adjust tmp[0] as it is still zero
	bitsh++;
	}
	goto out;
	}

	//all future steps do the division

	//check if we should subtract (uses less space than subtracting and unroling it later)
	for (i = RSA_LIMBS; i >= 0; i--) {
	if (num[limbsh + i] < tmp[i])
	goto dont_subtract;
	if (num[limbsh + i] > tmp[i])
	break;
	}

	//subtract
	t = 0;
	for (i = 0; i <= RSA_LIMBS; i++) {
	t += (uint64_t)num[limbsh + i];
	t -= (uint64_t)tmp[i];
	num[limbsh + i] = t;
	t >>= 32;
	}

	//carry the subtraction's carry to the end
	for (i = RSA_LIMBS + limbsh + 1; i < RSA_LIMBS * 2; i++) {
	t += (uint64_t)num[i];
	num[i] = t;
	t >>= 32;
	}

	dont_subtract:
	//handle bitshifts/refills
	if (!bitsh) { // tmp = denum << 32
	if (!limbsh) {
	ret = true;
	goto out;
	}

	memcpy(tmp + 1, denum, RSA_BYTES);
	tmp[0] = 0;
	bitsh = 32;
	limbsh--;
	}
	else { // tmp >>= 1
	for (i = 0; i < RSA_LIMBS; i++) {
	tmp[i] >>= 1;
	if (tmp[i + 1] & 1)
	tmp[i] += 0x80000000;
	}
	tmp[i] >>= 1;
	bitsh--;
	}


	out:
	*state1 = bitsh;
	*state2 = limbsh;
	return ret;
	}

	static void biMulIterative(uint32_t ret, const uint32_t a, const uint32_t b, uint32_t step) //ret = a b, call with step = [0..RSA_LIMBS)
	{
	uint32_t j, c;
	uint64_t r;

	//zero the result on first call
	if (!step)
	memset(ret, 0, RSA_BYTES * 2);

	//produce a partial sum & add it in
	c = 0;
	for (j = 0; j < RSA_LIMBS; j++) {
	r = (uint64_t)a[step] * b[j] + c + ret[step + j];
	ret[step + j] = r;
	c = r >> 32;
	}

	//carry the carry to the end
	for (j = step + RSA_LIMBS; j < RSA_LIMBS * 2; j++) {
	r = (uint64_t)ret[j] + c;
	ret[j] = r;
	c = r >> 32;
	}
	}

	/*
	* Piecewise RSA:
	* normal RSA public op with 65537 exponent does 34 operations. 17 muls and 17 mods, as follows:
	* 16x {mul, mod} to calculate a ^ 65536 mod c
	* 1x {mul, mod} to calculate a ^ 65537 mod c
	* we break up each mul and mod itself into more steps. mul needs RSA_LIMBS steps, and mod needs up to RSA_LEN * 2 + 2 steps
	* so if we allocate RSA_LEN * 3 step values to mod, each mul-mod pair will use <= RSA_LEN * 4 step values
	* and the whole opetaion will need <= RSA_LEN * 4 * 34 step values, which fits into a uint32. cool. In fact
	* some values will be skipped, but this makes life easier, really. Call this func with *stepP = 0, and keep calling till
	* output stepP is zero. We'll call each of the RSA_LEN * 4 pieces a gigastep, and have 17 of them as seen above. Each
	* will be logically separated into 4 megasteps. First will contain the MUL, last 3 the MOD and maybe the memcpy.
	* In the first 16 gigasteps, the very last step of the gigastep will be used for the memcpy call.
	*
	* The initial non-iterative RSA logic looks as follows, shown here for clarity:
	*
	* memcpy(state->tmpB, a, RSA_BYTES);
	* for (i = 0; i < 16; i++) {
	* biMul(state->tmpA, state->tmpB, state->tmpB);
	* biMod(state->tmpA, c, state->tmpB);
	* memcpy(state->tmpB, state->tmpA, RSA_BYTES);
	* }
	*
	* //calculate a ^ 65537 mod c into state->tmpA [ at this point this means do state->tmpA = (state->tmpB * a) % c ]
	* biMul(state->tmpA, state->tmpB, a);
	* biMod(state->tmpA, c, state->tmpB);
	*
	* //return result
	* return state->tmpA;
	*
	*/

	const uint32_t* rsaPubOpIterative(struct RsaState* state, const uint32_t a, const uint32_t c, uint32_t state1, uint32_t state2, uint32_t *stepP)
	{
	uint32_t step = *stepP, gigastep, gigastepBase, gigastepSubstep, megaSubstep;

	//step 0: copy a -> tmpB
	if (!step) {
	memcpy(state->tmpB, a, RSA_BYTES);
	step = 1;
	}
	else { //subsequent steps: do real work


	gigastep = (step - 1) / (RSA_LEN * 4);
	gigastepSubstep = (step - 1) % (RSA_LEN * 4);
	gigastepBase = gigastep * (RSA_LEN * 4);
	megaSubstep = gigastepSubstep / RSA_LEN;

	if (!megaSubstep) { // first megastep of the gigastep - MUL
	biMulIterative(state->tmpA, state->tmpB, gigastep == 16 ? a : state->tmpB, gigastepSubstep);
	if (gigastepSubstep == RSA_LIMBS - 1) //MUL is done - do mod next
	step = gigastepBase + RSA_LEN + 1;
	else //More of MUL is left to do
	step++;
	}
	else if (gigastepSubstep != RSA_LEN * 4 - 1){ // second part of gigastep - MOD
	if (biModIterative(state->tmpA, c, state->tmpB, state1, state2, gigastepSubstep - RSA_LEN)) { //MOD is done
	if (gigastep == 16) // we're done
	step = 0;
	else // last part of the gigastep is a copy
	step = gigastepBase + RSA_LEN * 4 - 1 + 1;
	}
	else
	step++;
	}
	else { //last part - memcpy
	memcpy(state->tmpB, state->tmpA, RSA_BYTES);
	step++;
	}
	}

	*stepP = step;
	return state->tmpA;
	}

	#if defined(RSA_SUPPORT_PRIV_OP_LOWRAM) \|\| defined (RSA_SUPPORT_PRIV_OP_BIGRAM)
	#include <stdio.h>
	const uint32_t* rsaPubOp(struct RsaState* state, const uint32_t a, const uint32_t c)
	{
	const uint32_t *ret;
	uint32_t state1 = 0, state2 = 0, step = 0, ns = 0;

	do {
	ret = rsaPubOpIterative(state, a, c, &state1, &state2, &step);
	ns++;
	} while(step);

	fprintf(stderr, "steps: %u\n", ns);

	return ret;
	}

	static void biMod(uint32_t num, const uint32_t denum, uint32_t *tmp)
	{
	uint32_t state1 = 0, state2 = 0, step;

	for (step = 0; !biModIterative(num, denum, tmp, &state1, &state2, step); step++);
	}

	static void biMul(uint32_t ret, const uint32_t a, const uint32_t *b)
	{
	uint32_t step;

	for (step = 0; step < RSA_LIMBS; step++)
	biMulIterative(ret, a, b, step);
	}

	const uint32_t* rsaPrivOp(struct RsaState* state, const uint32_t a, const uint32_t b, const uint32_t *c)
	{
	uint32_t i;

	memcpy(state->tmpC, a, RSA_BYTES); //tC will hold our powers of a

	memset(state->tmpA, 0, RSA_BYTES * 2); //tA will hold result
	state->tmpA[0] = 1;

	for (i = 0; i < RSA_LEN; i++) {
	//if the bit is set, multiply the current power of A into result
	if (b[i / 32] & (1 << (i % 32))) {
	memcpy(state->tmpB, state->tmpA, RSA_BYTES);
	biMul(state->tmpA, state->tmpB, state->tmpC);
	biMod(state->tmpA, c, state->tmpB);
	}

	//calculate the next power of a and modulus it
	#if defined(RSA_SUPPORT_PRIV_OP_LOWRAM)
	memcpy(state->tmpB, state->tmpA, RSA_BYTES); //save tA
	biMul(state->tmpA, state->tmpC, state->tmpC);
	biMod(state->tmpA, c, state->tmpC);
	memcpy(state->tmpC, state->tmpA, RSA_BYTES);
	memcpy(state->tmpA, state->tmpB, RSA_BYTES); //restore tA
	#elif defined (RSA_SUPPORT_PRIV_OP_BIGRAM)
	memcpy(state->tmpB, state->tmpC, RSA_BYTES);
	biMul(state->tmpC, state->tmpB, state->tmpB);
	biMod(state->tmpC, c, state->tmpB);
	#endif
	}

	return state->tmpA;
	}
	#endif