/* sp.c
 *
 * Copyright (C) 2006-2024 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */

/* Implementation by Sean Parkinson. */

#ifdef HAVE_CONFIG_H
    #include <config.h>
#endif

#include <wolfssl/wolfcrypt/settings.h>

#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) || \
    defined(WOLFSSL_HAVE_SP_ECC)

#include <wolfssl/wolfcrypt/error-crypt.h>
#include <wolfssl/wolfcrypt/cpuid.h>
#ifdef NO_INLINE
    #include <wolfssl/wolfcrypt/misc.h>
#else
    #define WOLFSSL_MISC_INCLUDED
    #include <wolfcrypt/src/misc.c>
#endif

/* A low-memory RSA build also selects the small-code SP implementation. */
#ifdef RSA_LOW_MEM
#ifndef WOLFSSL_SP_SMALL
#define WOLFSSL_SP_SMALL
#endif
#endif

/* Small-stack builds enable the SP small-stack option, unless SP has been
 * told not to use malloc (WOLFSSL_SP_NO_MALLOC). The #undef avoids a
 * redefinition if the macro was already set.
 */
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
#undef WOLFSSL_SP_SMALL_STACK
#define WOLFSSL_SP_SMALL_STACK
#endif

#include <wolfssl/wolfcrypt/sp.h>

/* IAR: map the GCC-style __asm__/__volatile__ spellings used throughout this
 * file onto asm/volatile. WOLFSSL_NO_VAR_ASSIGN_REG makes the functions below
 * take r/a/b directly as parameters instead of declaring
 * `register ... __asm__("rN")` locals.
 */
#ifdef __IAR_SYSTEMS_ICC__
#define __asm__        asm
#define __volatile__   volatile
#define WOLFSSL_NO_VAR_ASSIGN_REG
#endif /* __IAR_SYSTEMS_ICC__ */
/* Keil: map the GCC-style __asm__/__volatile__ spellings onto
 * __asm/volatile.
 */
#ifdef __KEIL__
#define __asm__        __asm
#define __volatile__   volatile
#endif

#ifdef WOLFSSL_SP_ARM_CORTEX_M_ASM
/* Debug helper: print a multi-word number to stderr as "<name>=0x<hex>\n".
 *
 * Words are printed from index ((bits + 31) / 32) - 1 down to 0, each with
 * SP_PRINT_FMT (defined outside this section).
 *
 * var    Array of words to print.
 * name   String literal label (concatenated with "=0x").
 * total  Not used by the expansion.
 * words  Not used by the expansion.
 * bits   Number of bits to print; rounded up to whole 32-bit words.
 */
#define SP_PRINT_NUM(var, name, total, words, bits)         \
    do {                                                    \
        int ii;                                             \
        fprintf(stderr, name "=0x");                        \
        for (ii = (((bits) + 31) / 32) - 1; ii >= 0; ii--)  \
            fprintf(stderr, SP_PRINT_FMT, (var)[ii]);       \
        fprintf(stderr, "\n");                              \
    } while (0)

/* Debug helper: print a single word to stderr as "<name>=0x<hex>\n" using
 * SP_PRINT_FMT. name must be a string literal.
 */
#define SP_PRINT_VAL(var, name)                             \
    fprintf(stderr, name "=0x" SP_PRINT_FMT "\n", var)

/* Debug helper: print an int to stderr as "<name>=<decimal>\n". name must be
 * a string literal.
 */
#define SP_PRINT_INT(var, name)                             \
    fprintf(stderr, name "=%d\n", var)

#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH)
#ifndef WOLFSSL_SP_NO_2048
/* Read big endian unsigned byte array into r.
 *
 * a[n - 1] is the least significant byte. Bytes are packed four to a word
 * with the least significant word in r[0]. Any words of r beyond the input
 * are set to zero.
 *
 * r     A single precision integer (array of at least size words).
 * size  Number of words available in r. Input that does not fit is dropped
 *       (most significant bytes first) rather than written past r[size - 1].
 * a     Byte array.
 * n     Number of bytes in array to read.
 */
static void sp_2048_from_bin(sp_digit* r, int size, const byte* a, int n)
{
    int i;
    int j = 0;
    int k;

    /* Whole 32-bit words, starting at the least significant end of a. */
    for (i = n - 1; (i >= 3) && (j < size); i -= 4) {
        r[j]  = ((sp_digit)a[i - 0] <<  0) |
                ((sp_digit)a[i - 1] <<  8) |
                ((sp_digit)a[i - 2] << 16) |
                ((sp_digit)a[i - 3] << 24);
        j++;
    }

    /* One to three most significant bytes remain in a[0..i], with a[0] the
     * most significant. Build the word with shifts so that the result does
     * not depend on the byte order of the target.
     */
    if ((i >= 0) && (j < size)) {
        sp_digit top = 0;

        for (k = 0; k <= i; k++) {
            top = (top << 8) | (sp_digit)a[k];
        }
        r[j++] = top;
    }

    /* Zero the unused high words. */
    for (; j < size; j++) {
        r[j] = 0;
    }
}

/* Convert an mp_int to an array of sp_digit.
 *
 * Repacks the DIGIT_BIT-sized digits of a into 32-bit words of r, least
 * significant first, and zeroes the remaining words up to size. One of three
 * implementations is selected at compile time from DIGIT_BIT.
 *
 * r  A single precision integer.
 * size  Number of sp_digit words to fill in r (every loop below bounds its
 *       index into r by this value).
 * a  A multi-precision integer.
 */
static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a)
{
#if DIGIT_BIT == 32
    /* Digits and words are the same width: copy with a mask instead of
     * branching on a->used. The loop always runs size times.
     * NOTE(review): presumably done so the work performed does not depend on
     * a->used - confirm.
     */
    int i;
    /* j is -a->used in unsigned arithmetic, so its top bit stays set for the
     * first a->used iterations and is clear afterwards. */
    sp_digit j = (sp_digit)0 - (sp_digit)a->used;
    int o = 0;

    for (i = 0; i < size; i++) {
        /* All ones while i < a->used, zero afterwards. */
        sp_digit mask = (sp_digit)0 - (j >> 31);
        r[i] = a->dp[o] & mask;
        j++;
        /* Advance the read index only while j still has its top bit set, so
         * o stops at the last used digit; later reads are masked to zero. */
        o += (int)(j >> 31);
    }
#elif DIGIT_BIT > 32
    /* Digits are wider than a word: each digit is spread over several words.
     * s alternates between the number of bits already filled in r[j] (the
     * left shift for the next digit) and the number of bits of the current
     * digit already consumed (the right shift for the next word).
     */
    unsigned int i;
    int j = 0;
    word32 s = 0;

    r[0] = 0;
    for (i = 0; i < (unsigned int)a->used && j < size; i++) {
        /* Top up the current word with the low bits of this digit. */
        r[j] |= ((sp_digit)a->dp[i] << s);
        r[j] &= 0xffffffff;
        s = 32U - s;
        if (j + 1 >= size) {
            break;
        }
        /* lint allow cast of mismatch word32 and mp_digit */
        r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
        /* Emit further words while the digit still has 32 or more bits
         * beyond what has been consumed. */
        while ((s + 32U) <= (word32)DIGIT_BIT) {
            s += 32U;
            r[j] &= 0xffffffff;
            if (j + 1 >= size) {
                break;
            }
            if (s < (word32)DIGIT_BIT) {
                /* lint allow cast of mismatch word32 and mp_digit */
                r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
            }
            else {
                /* Digit ended exactly on a word boundary: start a new
                 * empty word. */
                r[++j] = (sp_digit)0;
            }
        }
        /* Bits of this digit sitting in the (partial) current word. */
        s = (word32)DIGIT_BIT - s;
    }

    /* Zero the words above the last one written. */
    for (j++; j < size; j++) {
        r[j] = 0;
    }
#else
    /* Digits are narrower than a word: several digits are packed into each
     * word. s is the bit position in r[j] at which the next digit goes.
     */
    unsigned int i;
    int j = 0;
    int s = 0;

    r[0] = 0;
    for (i = 0; i < (unsigned int)a->used && j < size; i++) {
        r[j] |= ((sp_digit)a->dp[i]) << s;
        if (s + DIGIT_BIT >= 32) {
            /* This digit reaches or crosses the end of the current word. */
            r[j] &= 0xffffffff;
            if (j + 1 >= size) {
                break;
            }
            /* Number of bits of the digit that fitted in the old word. */
            s = 32 - s;
            if (s == DIGIT_BIT) {
                /* Digit fitted exactly: next word starts empty. */
                r[++j] = 0;
                s = 0;
            }
            else {
                /* Carry the remaining high bits into the next word. */
                r[++j] = a->dp[i] >> s;
                s = DIGIT_BIT - s;
            }
        }
        else {
            s += DIGIT_BIT;
        }
    }

    /* Zero the words above the last one written. */
    for (j++; j < size; j++) {
        r[j] = 0;
    }
#endif
}

/* Serialize r to a byte array in big endian order.
 * Always writes exactly 256 bytes (64 words of 4 bytes each).
 *
 * r  A single precision integer (64 words, least significant first).
 * a  Byte array receiving the output; most significant byte at a[0].
 */
static void sp_2048_to_bin_64(sp_digit* r, byte* a)
{
    const sp_digit* word = r + 64;
    byte* out = a;
    int shift;

    /* Walk from the most significant word down to r[0]. */
    while (word != r) {
        --word;
        /* Emit the four bytes of this word, highest byte first. */
        for (shift = 24; shift >= 0; shift -= 8) {
            *out = *word >> shift;
            out++;
        }
    }
}

#if (defined(WOLFSSL_HAVE_SP_RSA) && (!defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(WOLFSSL_SP_SMALL))) || defined(WOLFSSL_HAVE_SP_DH)
/* Normalize the values in each word to 32 bits.
 *
 * In this file the macro expands to nothing, so a call is a no-op.
 *
 * a  Array of sp_digit to normalize.
 */
#define sp_2048_norm_64(a)

#endif /* (WOLFSSL_HAVE_SP_RSA && (!WOLFSSL_RSA_PUBLIC_ONLY || !WOLFSSL_SP_SMALL)) || WOLFSSL_HAVE_SP_DH */
/* Normalize the values in each word to 32 bits.
 *
 * Unconditional definition, identical to the conditional one above, so the
 * macro exists whichever way the #if evaluates. It expands to nothing.
 *
 * a  Array of sp_digit to normalize.
 */
#define sp_2048_norm_64(a)

#ifndef WOLFSSL_SP_SMALL
#ifdef WOLFSSL_ARM_ARCH_7M
/* Multiply a and b into r. (r = a * b)
 *
 * 8 x 8 word multiplication producing 16 words, using UMULL/UMLAL.
 * For each word A[i] the eight products A[i] * B[0..7] are accumulated into
 * a window of eight registers drawn from r3-r10; the window's lowest word is
 * finished after the A[i] * B[0] step and is stored to the stack.
 *
 * Stack use (0x24 bytes, allocated and released inside the asm):
 *   [sp, #0..#28]  low eight words of the result
 *   [sp, #32]      saved copy of the r pointer
 * After r is saved, the %[r] register is set to zero and used as the zero
 * operand of the ADC instructions. The result is only written to r at the
 * very end, after all loads from a and b.
 *
 * r  A single precision integer (16 words written).
 * a  A single precision integer (8 words read).
 * b  A single precision integer (8 words read).
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
SP_NOINLINE static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the operands to r0-r2. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Reserve stack, save r, then reuse %[r] as constant zero. */
        "SUB	sp, sp, #0x24\n\t"
        "STR	%[r], [sp, #32]\n\t"
        "MOV	%[r], #0x0\n\t"
        /* r12 holds the current A[i]; lr holds the current B[j]. */
        "LDR	r12, [%[a]]\n\t"
        /* A[0] * B[0] */
        "LDR	lr, [%[b]]\n\t"
        "UMULL	r3, r4, r12, lr\n\t"
        /* A[0] * B[2] */
        "LDR	lr, [%[b], #8]\n\t"
        "UMULL	r5, r6, r12, lr\n\t"
        /* A[0] * B[4] */
        "LDR	lr, [%[b], #16]\n\t"
        "UMULL	r7, r8, r12, lr\n\t"
        /* A[0] * B[6] */
        "LDR	lr, [%[b], #24]\n\t"
        "UMULL	r9, r10, r12, lr\n\t"
        /* Result word 0 is complete. */
        "STR	r3, [sp]\n\t"
        /* A[0] * B[1] */
        "LDR	lr, [%[b], #4]\n\t"
        "MOV	r11, %[r]\n\t"
        "UMLAL	r4, r11, r12, lr\n\t"
        "ADDS	r5, r5, r11\n\t"
        /* A[0] * B[3] */
        "LDR	lr, [%[b], #12]\n\t"
        "ADCS	r6, r6, #0x0\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r6, r11, r12, lr\n\t"
        "ADDS	r7, r7, r11\n\t"
        /* A[0] * B[5] */
        "LDR	lr, [%[b], #20]\n\t"
        "ADCS	r8, r8, #0x0\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r8, r11, r12, lr\n\t"
        "ADDS	r9, r9, r11\n\t"
        /* A[0] * B[7] */
        "LDR	lr, [%[b], #28]\n\t"
        "ADCS	r10, r10, #0x0\n\t"
        "ADC	r3, %[r], #0x0\n\t"
        "UMLAL	r10, r3, r12, lr\n\t"
        /* A[1] * B[0] */
        "LDR	r12, [%[a], #4]\n\t"
        "LDR	lr, [%[b]]\n\t"
        "MOV	r11, #0x0\n\t"
        "UMLAL	r4, r11, r12, lr\n\t"
        "STR	r4, [sp, #4]\n\t"
        "ADDS	r5, r5, r11\n\t"
        /* A[1] * B[1] */
        "LDR	lr, [%[b], #4]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r5, r11, r12, lr\n\t"
        "ADDS	r6, r6, r11\n\t"
        /* A[1] * B[2] */
        "LDR	lr, [%[b], #8]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r6, r11, r12, lr\n\t"
        "ADDS	r7, r7, r11\n\t"
        /* A[1] * B[3] */
        "LDR	lr, [%[b], #12]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r7, r11, r12, lr\n\t"
        "ADDS	r8, r8, r11\n\t"
        /* A[1] * B[4] */
        "LDR	lr, [%[b], #16]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r8, r11, r12, lr\n\t"
        "ADDS	r9, r9, r11\n\t"
        /* A[1] * B[5] */
        "LDR	lr, [%[b], #20]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r9, r11, r12, lr\n\t"
        "ADDS	r10, r10, r11\n\t"
        /* A[1] * B[6] */
        "LDR	lr, [%[b], #24]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r10, r11, r12, lr\n\t"
        "ADDS	r3, r3, r11\n\t"
        /* A[1] * B[7] */
        "LDR	lr, [%[b], #28]\n\t"
        "ADC	r4, %[r], #0x0\n\t"
        "UMLAL	r3, r4, r12, lr\n\t"
        /* A[2] * B[0] */
        "LDR	r12, [%[a], #8]\n\t"
        "LDR	lr, [%[b]]\n\t"
        "MOV	r11, #0x0\n\t"
        "UMLAL	r5, r11, r12, lr\n\t"
        "STR	r5, [sp, #8]\n\t"
        "ADDS	r6, r6, r11\n\t"
        /* A[2] * B[1] */
        "LDR	lr, [%[b], #4]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r6, r11, r12, lr\n\t"
        "ADDS	r7, r7, r11\n\t"
        /* A[2] * B[2] */
        "LDR	lr, [%[b], #8]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r7, r11, r12, lr\n\t"
        "ADDS	r8, r8, r11\n\t"
        /* A[2] * B[3] */
        "LDR	lr, [%[b], #12]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r8, r11, r12, lr\n\t"
        "ADDS	r9, r9, r11\n\t"
        /* A[2] * B[4] */
        "LDR	lr, [%[b], #16]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r9, r11, r12, lr\n\t"
        "ADDS	r10, r10, r11\n\t"
        /* A[2] * B[5] */
        "LDR	lr, [%[b], #20]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r10, r11, r12, lr\n\t"
        "ADDS	r3, r3, r11\n\t"
        /* A[2] * B[6] */
        "LDR	lr, [%[b], #24]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r3, r11, r12, lr\n\t"
        "ADDS	r4, r4, r11\n\t"
        /* A[2] * B[7] */
        "LDR	lr, [%[b], #28]\n\t"
        "ADC	r5, %[r], #0x0\n\t"
        "UMLAL	r4, r5, r12, lr\n\t"
        /* A[3] * B[0] */
        "LDR	r12, [%[a], #12]\n\t"
        "LDR	lr, [%[b]]\n\t"
        "MOV	r11, #0x0\n\t"
        "UMLAL	r6, r11, r12, lr\n\t"
        "STR	r6, [sp, #12]\n\t"
        "ADDS	r7, r7, r11\n\t"
        /* A[3] * B[1] */
        "LDR	lr, [%[b], #4]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r7, r11, r12, lr\n\t"
        "ADDS	r8, r8, r11\n\t"
        /* A[3] * B[2] */
        "LDR	lr, [%[b], #8]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r8, r11, r12, lr\n\t"
        "ADDS	r9, r9, r11\n\t"
        /* A[3] * B[3] */
        "LDR	lr, [%[b], #12]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r9, r11, r12, lr\n\t"
        "ADDS	r10, r10, r11\n\t"
        /* A[3] * B[4] */
        "LDR	lr, [%[b], #16]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r10, r11, r12, lr\n\t"
        "ADDS	r3, r3, r11\n\t"
        /* A[3] * B[5] */
        "LDR	lr, [%[b], #20]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r3, r11, r12, lr\n\t"
        "ADDS	r4, r4, r11\n\t"
        /* A[3] * B[6] */
        "LDR	lr, [%[b], #24]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r4, r11, r12, lr\n\t"
        "ADDS	r5, r5, r11\n\t"
        /* A[3] * B[7] */
        "LDR	lr, [%[b], #28]\n\t"
        "ADC	r6, %[r], #0x0\n\t"
        "UMLAL	r5, r6, r12, lr\n\t"
        /* A[4] * B[0] */
        "LDR	r12, [%[a], #16]\n\t"
        "LDR	lr, [%[b]]\n\t"
        "MOV	r11, #0x0\n\t"
        "UMLAL	r7, r11, r12, lr\n\t"
        "STR	r7, [sp, #16]\n\t"
        "ADDS	r8, r8, r11\n\t"
        /* A[4] * B[1] */
        "LDR	lr, [%[b], #4]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r8, r11, r12, lr\n\t"
        "ADDS	r9, r9, r11\n\t"
        /* A[4] * B[2] */
        "LDR	lr, [%[b], #8]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r9, r11, r12, lr\n\t"
        "ADDS	r10, r10, r11\n\t"
        /* A[4] * B[3] */
        "LDR	lr, [%[b], #12]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r10, r11, r12, lr\n\t"
        "ADDS	r3, r3, r11\n\t"
        /* A[4] * B[4] */
        "LDR	lr, [%[b], #16]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r3, r11, r12, lr\n\t"
        "ADDS	r4, r4, r11\n\t"
        /* A[4] * B[5] */
        "LDR	lr, [%[b], #20]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r4, r11, r12, lr\n\t"
        "ADDS	r5, r5, r11\n\t"
        /* A[4] * B[6] */
        "LDR	lr, [%[b], #24]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r5, r11, r12, lr\n\t"
        "ADDS	r6, r6, r11\n\t"
        /* A[4] * B[7] */
        "LDR	lr, [%[b], #28]\n\t"
        "ADC	r7, %[r], #0x0\n\t"
        "UMLAL	r6, r7, r12, lr\n\t"
        /* A[5] * B[0] */
        "LDR	r12, [%[a], #20]\n\t"
        "LDR	lr, [%[b]]\n\t"
        "MOV	r11, #0x0\n\t"
        "UMLAL	r8, r11, r12, lr\n\t"
        "STR	r8, [sp, #20]\n\t"
        "ADDS	r9, r9, r11\n\t"
        /* A[5] * B[1] */
        "LDR	lr, [%[b], #4]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r9, r11, r12, lr\n\t"
        "ADDS	r10, r10, r11\n\t"
        /* A[5] * B[2] */
        "LDR	lr, [%[b], #8]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r10, r11, r12, lr\n\t"
        "ADDS	r3, r3, r11\n\t"
        /* A[5] * B[3] */
        "LDR	lr, [%[b], #12]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r3, r11, r12, lr\n\t"
        "ADDS	r4, r4, r11\n\t"
        /* A[5] * B[4] */
        "LDR	lr, [%[b], #16]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r4, r11, r12, lr\n\t"
        "ADDS	r5, r5, r11\n\t"
        /* A[5] * B[5] */
        "LDR	lr, [%[b], #20]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r5, r11, r12, lr\n\t"
        "ADDS	r6, r6, r11\n\t"
        /* A[5] * B[6] */
        "LDR	lr, [%[b], #24]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r6, r11, r12, lr\n\t"
        "ADDS	r7, r7, r11\n\t"
        /* A[5] * B[7] */
        "LDR	lr, [%[b], #28]\n\t"
        "ADC	r8, %[r], #0x0\n\t"
        "UMLAL	r7, r8, r12, lr\n\t"
        /* A[6] * B[0] */
        "LDR	r12, [%[a], #24]\n\t"
        "LDR	lr, [%[b]]\n\t"
        "MOV	r11, #0x0\n\t"
        "UMLAL	r9, r11, r12, lr\n\t"
        "STR	r9, [sp, #24]\n\t"
        "ADDS	r10, r10, r11\n\t"
        /* A[6] * B[1] */
        "LDR	lr, [%[b], #4]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r10, r11, r12, lr\n\t"
        "ADDS	r3, r3, r11\n\t"
        /* A[6] * B[2] */
        "LDR	lr, [%[b], #8]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r3, r11, r12, lr\n\t"
        "ADDS	r4, r4, r11\n\t"
        /* A[6] * B[3] */
        "LDR	lr, [%[b], #12]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r4, r11, r12, lr\n\t"
        "ADDS	r5, r5, r11\n\t"
        /* A[6] * B[4] */
        "LDR	lr, [%[b], #16]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r5, r11, r12, lr\n\t"
        "ADDS	r6, r6, r11\n\t"
        /* A[6] * B[5] */
        "LDR	lr, [%[b], #20]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r6, r11, r12, lr\n\t"
        "ADDS	r7, r7, r11\n\t"
        /* A[6] * B[6] */
        "LDR	lr, [%[b], #24]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r7, r11, r12, lr\n\t"
        "ADDS	r8, r8, r11\n\t"
        /* A[6] * B[7] */
        "LDR	lr, [%[b], #28]\n\t"
        "ADC	r9, %[r], #0x0\n\t"
        "UMLAL	r8, r9, r12, lr\n\t"
        /* A[7] * B[0] */
        "LDR	r12, [%[a], #28]\n\t"
        "LDR	lr, [%[b]]\n\t"
        "MOV	r11, #0x0\n\t"
        "UMLAL	r10, r11, r12, lr\n\t"
        "STR	r10, [sp, #28]\n\t"
        "ADDS	r3, r3, r11\n\t"
        /* A[7] * B[1] */
        "LDR	lr, [%[b], #4]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r3, r11, r12, lr\n\t"
        "ADDS	r4, r4, r11\n\t"
        /* A[7] * B[2] */
        "LDR	lr, [%[b], #8]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r4, r11, r12, lr\n\t"
        "ADDS	r5, r5, r11\n\t"
        /* A[7] * B[3] */
        "LDR	lr, [%[b], #12]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r5, r11, r12, lr\n\t"
        "ADDS	r6, r6, r11\n\t"
        /* A[7] * B[4] */
        "LDR	lr, [%[b], #16]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r6, r11, r12, lr\n\t"
        "ADDS	r7, r7, r11\n\t"
        /* A[7] * B[5] */
        "LDR	lr, [%[b], #20]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r7, r11, r12, lr\n\t"
        "ADDS	r8, r8, r11\n\t"
        /* A[7] * B[6] */
        "LDR	lr, [%[b], #24]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r8, r11, r12, lr\n\t"
        "ADDS	r9, r9, r11\n\t"
        /* A[7] * B[7] */
        "LDR	lr, [%[b], #28]\n\t"
        "ADC	r10, %[r], #0x0\n\t"
        "UMLAL	r9, r10, r12, lr\n\t"
        /* Reload r; store the high eight words (r3-r10) at r + 32 bytes. */
        "LDR	%[r], [sp, #32]\n\t"
        "ADD	%[r], %[r], #0x20\n\t"
        "STM	%[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
        /* Copy the low eight words from the stack to the start of r. */
        "LDM	sp, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
        "SUB	%[r], %[r], #0x20\n\t"
        "STM	%[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
        "ADD	sp, sp, #0x24\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
    );
}

#else
/* Multiply a and b into r. (r = a * b)
 *
 * 8 x 8 word multiplication producing 16 words, built mainly from UMAAL
 * (RdLo:RdHi = Rn * Rm + RdLo + RdHi). A[0..3] are held in r0-r3 and
 * multiplied by all of B, then A[4..7] are loaded into r0-r3 and multiplied
 * by all of B.
 *
 * Stack use (0x2c bytes, allocated and released inside the asm):
 *   [sp, #0..#28]  low eight words of the result
 *   [sp, #32]      spilled partial word (saved from r6, reloaded later)
 *   [sp, #36]      saved copy of the r pointer
 *   [sp, #40]      saved copy of the a pointer
 * The result is only written to r at the very end, after all loads from a
 * and b.
 *
 * NOTE(review): the body overwrites r0-r2 by name although they are not in
 * the clobber list. With the register bindings below they are the r/a/b
 * operands; under WOLFSSL_NO_VAR_ASSIGN_REG this appears to rely on the
 * operands arriving in r0-r2 - confirm for that toolchain.
 *
 * r  A single precision integer (16 words written).
 * a  A single precision integer (8 words read).
 * b  A single precision integer (8 words read).
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
SP_NOINLINE static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the operands to r0-r2. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Reserve stack and save the r and a pointers. */
        "SUB	sp, sp, #0x2c\n\t"
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
        "STRD	%[r], %[a], [sp, #36]\n\t"
#else
        "STR	%[r], [sp, #36]\n\t"
        "STR	%[a], [sp, #40]\n\t"
#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
        /* lr walks b; r0-r3 = A[0..3]; r4-r6 = B[0..2]. */
        "MOV	lr, %[b]\n\t"
        "LDM	%[a], {r0, r1, r2, r3}\n\t"
        "LDM	lr!, {r4, r5, r6}\n\t"
        "UMULL	r10, r11, r0, r4\n\t"
        "UMULL	r12, r7, r1, r4\n\t"
        "UMAAL	r11, r12, r0, r5\n\t"
        "UMULL	r8, r9, r2, r4\n\t"
        "UMAAL	r12, r8, r1, r5\n\t"
        "UMAAL	r12, r7, r0, r6\n\t"
        "UMAAL	r8, r9, r3, r4\n\t"
        /* Result words 0-2 are complete. */
        "STM	sp, {r10, r11, r12}\n\t"
        "UMAAL	r7, r8, r2, r5\n\t"
        /* r4 = B[3] */
        "LDM	lr!, {r4}\n\t"
        "UMULL	r10, r11, r1, r6\n\t"
        "UMAAL	r8, r9, r2, r6\n\t"
        "UMAAL	r7, r10, r0, r4\n\t"
        "UMAAL	r8, r11, r3, r5\n\t"
        /* Result word 3 is complete. */
        "STR	r7, [sp, #12]\n\t"
        "UMAAL	r8, r10, r1, r4\n\t"
        "UMAAL	r9, r11, r3, r6\n\t"
        "UMAAL	r9, r10, r2, r4\n\t"
        "UMAAL	r10, r11, r3, r4\n\t"
        /* r4-r7 = B[4..7]; multiply each by A[0..3]. */
        "LDM	lr, {r4, r5, r6, r7}\n\t"
        "MOV	r12, #0x0\n\t"
        "UMLAL	r8, r12, r0, r4\n\t"
        "UMAAL	r9, r12, r1, r4\n\t"
        "UMAAL	r10, r12, r2, r4\n\t"
        "UMAAL	r11, r12, r3, r4\n\t"
        "MOV	r4, #0x0\n\t"
        "UMLAL	r9, r4, r0, r5\n\t"
        "UMAAL	r10, r4, r1, r5\n\t"
        "UMAAL	r11, r4, r2, r5\n\t"
        "UMAAL	r12, r4, r3, r5\n\t"
        "MOV	r5, #0x0\n\t"
        "UMLAL	r10, r5, r0, r6\n\t"
        "UMAAL	r11, r5, r1, r6\n\t"
        "UMAAL	r12, r5, r2, r6\n\t"
        "UMAAL	r4, r5, r3, r6\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r11, r6, r0, r7\n\t"
        /* Interleaved with the last A[0..3] products: reload the saved a
         * pointer and advance it to A[4], and rewind lr to B[0]. */
        "LDR	r0, [sp, #40]\n\t"
        "UMAAL	r12, r6, r1, r7\n\t"
        "ADD	r0, r0, #0x10\n\t"
        "UMAAL	r4, r6, r2, r7\n\t"
        "SUB	lr, lr, #0x10\n\t"
        "UMAAL	r5, r6, r3, r7\n\t"
        /* r0-r3 = A[4..7]; spill r6 to free it for the B[j] loads. */
        "LDM	r0, {r0, r1, r2, r3}\n\t"
        "STR	r6, [sp, #32]\n\t"
        /* r6 = B[0] */
        "LDM	lr!, {r6}\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r8, r7, r0, r6\n\t"
        "UMAAL	r9, r7, r1, r6\n\t"
        /* Result word 4 is complete. */
        "STR	r8, [sp, #16]\n\t"
        "UMAAL	r10, r7, r2, r6\n\t"
        "UMAAL	r11, r7, r3, r6\n\t"
        /* r6 = B[1] */
        "LDM	lr!, {r6}\n\t"
        "MOV	r8, #0x0\n\t"
        "UMLAL	r9, r8, r0, r6\n\t"
        "UMAAL	r10, r8, r1, r6\n\t"
        /* Result word 5 is complete. */
        "STR	r9, [sp, #20]\n\t"
        "UMAAL	r11, r8, r2, r6\n\t"
        "UMAAL	r12, r8, r3, r6\n\t"
        /* r6 = B[2] */
        "LDM	lr!, {r6}\n\t"
        "MOV	r9, #0x0\n\t"
        "UMLAL	r10, r9, r0, r6\n\t"
        "UMAAL	r11, r9, r1, r6\n\t"
        /* Result word 6 is complete. */
        "STR	r10, [sp, #24]\n\t"
        "UMAAL	r12, r9, r2, r6\n\t"
        "UMAAL	r4, r9, r3, r6\n\t"
        /* r6 = B[3] */
        "LDM	lr!, {r6}\n\t"
        "MOV	r10, #0x0\n\t"
        "UMLAL	r11, r10, r0, r6\n\t"
        "UMAAL	r12, r10, r1, r6\n\t"
        /* Result word 7 is complete. */
        "STR	r11, [sp, #28]\n\t"
        "UMAAL	r4, r10, r2, r6\n\t"
        "UMAAL	r5, r10, r3, r6\n\t"
        /* r11 = B[4] */
        "LDM	lr!, {r11}\n\t"
        "UMAAL	r12, r7, r0, r11\n\t"
        "UMAAL	r4, r7, r1, r11\n\t"
        /* Reload the partial word spilled earlier. */
        "LDR	r6, [sp, #32]\n\t"
        "UMAAL	r5, r7, r2, r11\n\t"
        "UMAAL	r6, r7, r3, r11\n\t"
        /* r11 = B[5] */
        "LDM	lr!, {r11}\n\t"
        "UMAAL	r4, r8, r0, r11\n\t"
        "UMAAL	r5, r8, r1, r11\n\t"
        "UMAAL	r6, r8, r2, r11\n\t"
        "UMAAL	r7, r8, r3, r11\n\t"
        /* r11 = B[6], lr = B[7]; the two rows are interleaved. */
        "LDM	lr, {r11, lr}\n\t"
        "UMAAL	r5, r9, r0, r11\n\t"
        "UMAAL	r6, r10, r0, lr\n\t"
        "UMAAL	r6, r9, r1, r11\n\t"
        "UMAAL	r7, r10, r1, lr\n\t"
        "UMAAL	r7, r9, r2, r11\n\t"
        "UMAAL	r8, r10, r2, lr\n\t"
        "UMAAL	r8, r9, r3, r11\n\t"
        "UMAAL	r9, r10, r3, lr\n\t"
        /* Result word 8 is in r12; move it so r3-r10 hold words 8-15. */
        "MOV	r3, r12\n\t"
        /* Reload r; store the high eight words at r + 32 bytes. */
        "LDR	lr, [sp, #36]\n\t"
        "ADD	lr, lr, #0x20\n\t"
        "STM	lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
        "SUB	lr, lr, #0x20\n\t"
        /* Copy the low eight words from the stack to the start of r. */
        "LDM	sp, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
        "STM	lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
        "ADD	sp, sp, #0x2c\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r10", "r11", "r12", "r7", "r8", "r9", "lr", "cc"
    );
}

#endif /* WOLFSSL_ARM_ARCH_7M */
/* Add b to a into r. (r = a + b)
 *
 * Adds 8 words, four at a time, chaining the carry through the carry flag
 * (ADDS for the first word, ADCS afterwards). The LDM/STM instructions use
 * write-back, so the r, a and b registers are advanced inside the asm; all
 * three are therefore declared as read-write operands.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 *
 * returns the carry out of the most significant word (0 or 1).
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_2048_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_2048_add_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Words 0-3 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADDS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 4-7 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Turn the final carry flag into 0 or 1 in the r register. */
        "MOV	%[r], #0x0\n\t"
        "ADC	%[r], %[r], #0x0\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    /* The r register now carries the result value, not a pointer. */
    return (word32)(size_t)r;
}

/* Sub b from a into a. (a -= b)
 *
 * Subtracts 16 words, four at a time, chaining the borrow through the carry
 * flag (SUBS for the first word, SBCS afterwards). a is loaded without
 * write-back and then advanced by the STM of the result.
 *
 * a  A single precision integer and result.
 * b  A single precision integer.
 *
 * returns 0 when there is no borrow out of the most significant word and
 * all ones when there is (the final SBC computes r9 - r9 - borrow).
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_2048_sub_in_place_16(sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_2048_sub_in_place_16(sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Words 0-3 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SUBS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 4-7 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 8-11 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 12-15 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Turn the final borrow into 0 or all ones in the a register. */
        "SBC	%[a], r9, r9\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
    );
    /* The a register now carries the result value, not a pointer. */
    return (word32)(size_t)a;
}

/* Add b to a into r. (r = a + b)
 *
 * Adds 16 words, four at a time, chaining the carry through the carry flag
 * (ADDS for the first word, ADCS afterwards). The LDM/STM instructions use
 * write-back, so the r, a and b registers are advanced inside the asm; all
 * three are therefore declared as read-write operands.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 *
 * returns the carry out of the most significant word (0 or 1).
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_2048_add_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_2048_add_16(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Words 0-3 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADDS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 4-7 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 8-11 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 12-15 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Turn the final carry flag into 0 or 1 in the r register. */
        "MOV	%[r], #0x0\n\t"
        "ADC	%[r], %[r], #0x0\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    /* The r register now carries the result value, not a pointer. */
    return (word32)(size_t)r;
}

/* Apply the mask m to each of the 8 words of a, writing the result to r.
 *
 * r and a may be the same array; each word is read before it is written.
 *
 * r  A single precision integer (8 words written).
 * a  A single precision integer (8 words read).
 * m  Mask to AND against each digit.
 */
static void sp_2048_mask_8(sp_digit* r, const sp_digit* a, sp_digit m)
{
#ifdef WOLFSSL_SP_SMALL
    int idx = 0;

    /* Compact form: one word per iteration. */
    while (idx < 8) {
        r[idx] = m & a[idx];
        idx++;
    }
#else
    /* Unrolled form: words 0 to 7 in ascending order. */
    r[0] = m & a[0];
    r[1] = m & a[1];
    r[2] = m & a[2];
    r[3] = m & a[3];
    r[4] = m & a[4];
    r[5] = m & a[5];
    r[6] = m & a[6];
    r[7] = m & a[7];
#endif
}

/* Multiply a and b into r. (r = a * b)
 *
 * 16 x 16 word multiplication built from three 8 x 8 word multiplications:
 * low halves, high halves, and the sums of the halves. The middle term is
 * obtained by subtracting the low and high products from the product of the
 * sums, then added into r at an offset of 8 words.
 *
 * r  A single precision integer (32 words written).
 * a  A single precision integer (16 words read).
 * b  A single precision integer (16 words read).
 */
SP_NOINLINE static void sp_2048_mul_16(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit  mid[16];
    sp_digit  sum_a[8];
    sp_digit  sum_b[8];
    sp_digit* lo = r;
    sp_digit* hi = r + 16;
    sp_digit  carry_a;
    sp_digit  carry_b;
    sp_digit  top;

    /* Sums of the halves; each add reports its carry out. */
    carry_a = sp_2048_add_8(sum_a, a, a + 8);
    carry_b = sp_2048_add_8(sum_b, b, b + 8);
    top = carry_a & carry_b;

    /* High product, low product, then product of the (truncated) sums. */
    sp_2048_mul_8(hi, a + 8, b + 8);
    sp_2048_mul_8(lo, a, b);
    sp_2048_mul_8(mid, sum_a, sum_b);

    /* mid -= lo + hi; the return values of the subtractions are folded
     * into top. */
    top += sp_2048_sub_in_place_16(mid, lo);
    top += sp_2048_sub_in_place_16(mid, hi);

    /* Account for the carries dropped from the sums: add sum_a when b's sum
     * carried and sum_b when a's sum carried, into the upper half of mid. */
    sp_2048_mask_8(sum_a, sum_a, 0 - carry_b);
    top += sp_2048_add_8(mid + 8, mid + 8, sum_a);
    sp_2048_mask_8(sum_b, sum_b, 0 - carry_a);
    top += sp_2048_add_8(mid + 8, mid + 8, sum_b);

    /* Fold the middle term into the result, 8 words up. */
    top += sp_2048_add_16(r + 8, r + 8, mid);

    /* Reuse sum_a as an 8-word value equal to top and add it into the most
     * significant 8 words of r. */
    XMEMSET(&sum_a[1], 0, sizeof(sum_a) - sizeof(sum_a[0]));
    sum_a[0] = top;
    (void)sp_2048_add_8(r + 24, r + 24, sum_a);
}

/* Sub b from a into a. (a -= b)
 *
 * Subtracts 32 words, four at a time, chaining the borrow through the carry
 * flag (SUBS for the first word, SBCS afterwards). a is loaded without
 * write-back and then advanced by the STM of the result.
 *
 * a  A single precision integer and result.
 * b  A single precision integer.
 *
 * returns 0 when there is no borrow out of the most significant word and
 * all ones when there is (the final SBC computes r9 - r9 - borrow).
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_2048_sub_in_place_32(sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_2048_sub_in_place_32(sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Words 0-3 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SUBS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 4-7 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 8-11 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 12-15 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 16-19 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 20-23 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 24-27 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 28-31 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Turn the final borrow into 0 or all ones in the a register. */
        "SBC	%[a], r9, r9\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
    );
    /* The a register now carries the result value, not a pointer. */
    return (word32)(size_t)a;
}

/* Add b to a into r. (r = a + b)
 *
 * Adds 32 digits, four at a time, with the carry flag chained through all
 * eight groups. Each group is loaded before the matching store, and
 * sp_2048_mul_64 calls this with r equal to a.
 *
 * r  A single precision integer. Receives the 32-digit sum.
 * a  A single precision integer. 32 digits are read.
 * b  A single precision integer. 32 digits are read.
 *
 * returns the carry out of the most significant digit: 0 or 1.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_2048_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Digits 0..3: ADDS starts the carry chain. The pointers are
         * post-incremented by the LDM/STM writeback. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADDS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 4..31: every addition is ADCS so the carry keeps flowing;
         * LDM and STM leave the flags untouched. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Turn the final carry flag into 0 or 1 in the result register; MOV
         * without the S suffix does not disturb the flag. */
        "MOV	%[r], #0x0\n\t"
        "ADC	%[r], %[r], #0x0\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    /* The register that held r now holds the carry, not a pointer. */
    return (word32)(size_t)r;
}

/* Mask every digit of a with m and write the result to r. (r[i] = a[i] & m)
 *
 * r  A single precision integer. Receives 16 digits; sp_2048_mul_32 passes
 *    the same buffer for r and a.
 * a  A single precision integer. 16 digits are read.
 * m  Mask to AND against each digit.
 */
static void sp_2048_mask_16(sp_digit* r, const sp_digit* a, sp_digit m)
{
    int idx = 0;

#ifdef WOLFSSL_SP_SMALL
    /* Compact form: one digit per pass. */
    while (idx < 16) {
        r[idx] = a[idx] & m;
        idx++;
    }
#else
    /* Unrolled form: eight digits per pass. */
    while (idx < 16) {
        sp_digit* out = r + idx;
        const sp_digit* in = a + idx;

        out[0] = in[0] & m;
        out[1] = in[1] & m;
        out[2] = in[2] & m;
        out[3] = in[3] & m;
        out[4] = in[4] & m;
        out[5] = in[5] & m;
        out[6] = in[6] & m;
        out[7] = in[7] & m;
        idx += 8;
    }
#endif
}

/* Multiply a and b into r. (r = a * b)
 *
 * Splits each operand into two 16-digit halves and combines three half-size
 * products (low*low, high*high and the product of the half sums) in the
 * manner of Karatsuba multiplication.
 *
 * r  A single precision integer. Expected to hold 64 digits; the last step
 *    updates r + 48 onward.
 * a  A single precision integer. Read as a[0..15] and a[16..31].
 * b  A single precision integer. Read as b[0..15] and b[16..31].
 */
SP_NOINLINE static void sp_2048_mul_32(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit mid[32];
    sp_digit sum_a[16];
    sp_digit sum_b[16];
    sp_digit* lo = r;
    sp_digit* hi = &r[32];
    sp_digit carry_a;
    sp_digit carry_b;
    sp_digit top;

    /* Half sums of each operand; the carries are folded back in below. */
    carry_a = sp_2048_add_16(sum_a, &a[0], &a[16]);
    carry_b = sp_2048_add_16(sum_b, &b[0], &b[16]);
    top = carry_a & carry_b;

    /* The three half-size products. */
    sp_2048_mul_16(hi, &a[16], &b[16]);
    sp_2048_mul_16(lo, a, b);
    sp_2048_mul_16(mid, sum_a, sum_b);

    /* Remove the low and high products from the middle term. A borrow comes
     * back with all bits set, so adding it lowers top by one. */
    top += sp_2048_sub_in_place_32(mid, lo);
    top += sp_2048_sub_in_place_32(mid, hi);

    /* Account for the carries dropped from the half sums: add each sum into
     * the upper half of the middle term only when the other sum carried. */
    sp_2048_mask_16(sum_a, sum_a, 0 - carry_b);
    top += sp_2048_add_16(&mid[16], &mid[16], sum_a);
    sp_2048_mask_16(sum_b, sum_b, 0 - carry_a);
    top += sp_2048_add_16(&mid[16], &mid[16], sum_b);

    /* Fold the middle term into the result, offset by half an operand. */
    top += sp_2048_add_32(&r[16], &r[16], mid);

    /* Propagate what is left in top into the uppermost 16 digits. */
    sum_a[0] = top;
    XMEMSET(&sum_a[1], 0, sizeof(sum_a) - sizeof(sum_a[0]));
    (void)sp_2048_add_16(&r[48], &r[48], sum_a);
}

/* Sub b from a into a. (a -= b)
 *
 * Subtracts 64 digits, four at a time, with the borrow chained through all
 * sixteen groups via the carry flag.
 *
 * a  A single precision integer and result. 64 digits are read and rewritten.
 * b  A single precision integer. 64 digits are read.
 *
 * returns 0 when there is no borrow out of the most significant digit, or a
 * value with all 32 bits set when there is.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_2048_sub_in_place_64(sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_2048_sub_in_place_64(sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Digits 0..3: SUBS starts the borrow chain. a is loaded without
         * writeback and advanced by the STM that stores the result. */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SUBS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 4..63: every subtraction is SBCS so the borrow keeps
         * flowing; LDM and STM leave the flags untouched. */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* x - x - borrow: 0 when no borrow is pending, all ones otherwise. */
        "SBC	%[a], r9, r9\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
    );
    /* The register that held a now holds the borrow mask, not a pointer. */
    return (word32)(size_t)a;
}

/* Add b to a into r. (r = a + b)
 *
 * Adds 64 digits, four at a time, with the carry flag chained through all
 * sixteen groups. Each group is loaded before the matching store, and
 * sp_2048_mul_64 calls this with r equal to a.
 *
 * r  A single precision integer. Receives the 64-digit sum.
 * a  A single precision integer. 64 digits are read.
 * b  A single precision integer. 64 digits are read.
 *
 * returns the carry out of the most significant digit: 0 or 1.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_2048_add_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_2048_add_64(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Digits 0..3: ADDS starts the carry chain. The pointers are
         * post-incremented by the LDM/STM writeback. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADDS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 4..63: every addition is ADCS so the carry keeps flowing;
         * LDM and STM leave the flags untouched. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Turn the final carry flag into 0 or 1 in the result register; MOV
         * without the S suffix does not disturb the flag. */
        "MOV	%[r], #0x0\n\t"
        "ADC	%[r], %[r], #0x0\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    /* The register that held r now holds the carry, not a pointer. */
    return (word32)(size_t)r;
}

/* Mask every digit of a with m and write the result to r. (r[i] = a[i] & m)
 *
 * r  A single precision integer. Receives 32 digits; sp_2048_mul_64 passes
 *    the same buffer for r and a.
 * a  A single precision integer. 32 digits are read.
 * m  Mask to AND against each digit.
 */
static void sp_2048_mask_32(sp_digit* r, const sp_digit* a, sp_digit m)
{
    int idx = 0;

#ifdef WOLFSSL_SP_SMALL
    /* Compact form: one digit per pass. */
    while (idx < 32) {
        r[idx] = a[idx] & m;
        idx++;
    }
#else
    /* Unrolled form: eight digits per pass. */
    while (idx < 32) {
        sp_digit* out = r + idx;
        const sp_digit* in = a + idx;

        out[0] = in[0] & m;
        out[1] = in[1] & m;
        out[2] = in[2] & m;
        out[3] = in[3] & m;
        out[4] = in[4] & m;
        out[5] = in[5] & m;
        out[6] = in[6] & m;
        out[7] = in[7] & m;
        idx += 8;
    }
#endif
}

/* Multiply a and b into r. (r = a * b)
 *
 * Splits each operand into two 32-digit halves and combines three half-size
 * products (low*low, high*high and the product of the half sums) in the
 * manner of Karatsuba multiplication.
 *
 * r  A single precision integer. Holds 128 digits; the last step updates
 *    r[96..127].
 * a  A single precision integer. Read as a[0..31] and a[32..63].
 * b  A single precision integer. Read as b[0..31] and b[32..63].
 */
SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit mid[64];
    sp_digit sum_a[32];
    sp_digit sum_b[32];
    sp_digit* lo = r;
    sp_digit* hi = &r[64];
    sp_digit carry_a;
    sp_digit carry_b;
    sp_digit top;

    /* Half sums of each operand; the carries are folded back in below. */
    carry_a = sp_2048_add_32(sum_a, &a[0], &a[32]);
    carry_b = sp_2048_add_32(sum_b, &b[0], &b[32]);
    top = carry_a & carry_b;

    /* The three half-size products. */
    sp_2048_mul_32(hi, &a[32], &b[32]);
    sp_2048_mul_32(lo, a, b);
    sp_2048_mul_32(mid, sum_a, sum_b);

    /* Remove the low and high products from the middle term. A borrow comes
     * back with all bits set, so adding it lowers top by one. */
    top += sp_2048_sub_in_place_64(mid, lo);
    top += sp_2048_sub_in_place_64(mid, hi);

    /* Account for the carries dropped from the half sums: add each sum into
     * the upper half of the middle term only when the other sum carried. */
    sp_2048_mask_32(sum_a, sum_a, 0 - carry_b);
    top += sp_2048_add_32(&mid[32], &mid[32], sum_a);
    sp_2048_mask_32(sum_b, sum_b, 0 - carry_a);
    top += sp_2048_add_32(&mid[32], &mid[32], sum_b);

    /* Fold the middle term into the result, offset by half an operand. */
    top += sp_2048_add_64(&r[32], &r[32], mid);

    /* Propagate what is left in top into the uppermost 32 digits. */
    sum_a[0] = top;
    XMEMSET(&sum_a[1], 0, sizeof(sum_a) - sizeof(sum_a[0]));
    (void)sp_2048_add_32(&r[96], &r[96], sum_a);
}

#ifdef WOLFSSL_ARM_ARCH_7M
/* Square a and put result in r. (r = a * a)
 *
 * Variant built when WOLFSSL_ARM_ARCH_7M is defined; uses UMULL and UMLAL.
 * The products A[i] * A[j] with i < j are accumulated first, the sum is
 * doubled, and the squares A[i] * A[i] are then added in. The result is built
 * in a stack scratch area and in registers, and is only written to r at the
 * very end, after the last read of a.
 *
 * r  A single precision integer. Receives 16 digits.
 * a  A single precision integer. 8 digits are read.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p)
#else
SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Reserve 68 bytes of stack: 64 bytes of scratch plus a slot at
         * [sp, #64] that keeps the result pointer while its register is used
         * as a constant zero. */
        "SUB	sp, sp, #0x44\n\t"
        "STR	%[r], [sp, #64]\n\t"
        "MOV	%[r], #0x0\n\t"
        "LDR	r12, [%[a]]\n\t"
        /* A[0] * A[1] */
        "LDR	lr, [%[a], #4]\n\t"
        "UMULL	r4, r5, r12, lr\n\t"
        /* A[0] * A[3] */
        "LDR	lr, [%[a], #12]\n\t"
        "UMULL	r6, r7, r12, lr\n\t"
        /* A[0] * A[5] */
        "LDR	lr, [%[a], #20]\n\t"
        "UMULL	r8, r9, r12, lr\n\t"
        /* A[0] * A[7] */
        "LDR	lr, [%[a], #28]\n\t"
        "UMULL	r10, r3, r12, lr\n\t"
        /* A[0] * A[2] */
        "LDR	lr, [%[a], #8]\n\t"
        "MOV	r11, #0x0\n\t"
        "UMLAL	r5, r11, r12, lr\n\t"
        "ADDS	r6, r6, r11\n\t"
        /* A[0] * A[4] */
        "LDR	lr, [%[a], #16]\n\t"
        "ADCS	r7, r7, #0x0\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r7, r11, r12, lr\n\t"
        "ADDS	r8, r8, r11\n\t"
        /* A[0] * A[6] */
        "LDR	lr, [%[a], #24]\n\t"
        "ADCS	r9, r9, #0x0\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r9, r11, r12, lr\n\t"
        "ADDS	r10, r10, r11\n\t"
        "ADCS	r3, r3, #0x0\n\t"
        "STR	r4, [sp, #4]\n\t"
        "STR	r5, [sp, #8]\n\t"
        /* A[1] * A[2] */
        "LDR	r12, [%[a], #4]\n\t"
        "LDR	lr, [%[a], #8]\n\t"
        "MOV	r11, #0x0\n\t"
        "UMLAL	r6, r11, r12, lr\n\t"
        "STR	r6, [sp, #12]\n\t"
        "ADDS	r7, r7, r11\n\t"
        /* A[1] * A[3] */
        "LDR	lr, [%[a], #12]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r7, r11, r12, lr\n\t"
        "STR	r7, [sp, #16]\n\t"
        "ADDS	r8, r8, r11\n\t"
        /* A[1] * A[4] */
        "LDR	lr, [%[a], #16]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r8, r11, r12, lr\n\t"
        "ADDS	r9, r9, r11\n\t"
        /* A[1] * A[5] */
        "LDR	lr, [%[a], #20]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r9, r11, r12, lr\n\t"
        "ADDS	r10, r10, r11\n\t"
        /* A[1] * A[6] */
        "LDR	lr, [%[a], #24]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r10, r11, r12, lr\n\t"
        "ADDS	r3, r3, r11\n\t"
        /* A[1] * A[7] */
        "LDR	lr, [%[a], #28]\n\t"
        "ADC	r4, %[r], #0x0\n\t"
        "UMLAL	r3, r4, r12, lr\n\t"
        /* A[2] * A[3] */
        "LDR	r12, [%[a], #8]\n\t"
        "LDR	lr, [%[a], #12]\n\t"
        "MOV	r11, #0x0\n\t"
        "UMLAL	r8, r11, r12, lr\n\t"
        "STR	r8, [sp, #20]\n\t"
        "ADDS	r9, r9, r11\n\t"
        /* A[2] * A[4] */
        "LDR	lr, [%[a], #16]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r9, r11, r12, lr\n\t"
        "STR	r9, [sp, #24]\n\t"
        "ADDS	r10, r10, r11\n\t"
        /* A[2] * A[5] */
        "LDR	lr, [%[a], #20]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r10, r11, r12, lr\n\t"
        "ADDS	r3, r3, r11\n\t"
        /* A[2] * A[6] */
        "LDR	lr, [%[a], #24]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r3, r11, r12, lr\n\t"
        "ADDS	r4, r4, r11\n\t"
        /* A[2] * A[7] */
        "LDR	lr, [%[a], #28]\n\t"
        "ADC	r5, %[r], #0x0\n\t"
        "UMLAL	r4, r5, r12, lr\n\t"
        /* A[3] * A[4] */
        "LDR	r12, [%[a], #12]\n\t"
        "LDR	lr, [%[a], #16]\n\t"
        "MOV	r11, #0x0\n\t"
        "UMLAL	r10, r11, r12, lr\n\t"
        "STR	r10, [sp, #28]\n\t"
        "ADDS	r3, r3, r11\n\t"
        /* A[3] * A[5] */
        "LDR	lr, [%[a], #20]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r3, r11, r12, lr\n\t"
        "ADDS	r4, r4, r11\n\t"
        /* A[3] * A[6] */
        "LDR	lr, [%[a], #24]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r4, r11, r12, lr\n\t"
        "ADDS	r5, r5, r11\n\t"
        /* A[3] * A[7] */
        "LDR	lr, [%[a], #28]\n\t"
        "ADC	r6, %[r], #0x0\n\t"
        "UMLAL	r5, r6, r12, lr\n\t"
        /* A[4] * A[5] */
        "LDR	r12, [%[a], #16]\n\t"
        "LDR	lr, [%[a], #20]\n\t"
        "MOV	r11, #0x0\n\t"
        "UMLAL	r4, r11, r12, lr\n\t"
        "ADDS	r5, r5, r11\n\t"
        /* A[4] * A[6] */
        "LDR	lr, [%[a], #24]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r5, r11, r12, lr\n\t"
        "ADDS	r6, r6, r11\n\t"
        /* A[4] * A[7] */
        "LDR	lr, [%[a], #28]\n\t"
        "ADC	r7, %[r], #0x0\n\t"
        "UMLAL	r6, r7, r12, lr\n\t"
        /* A[5] * A[6] */
        "LDR	r12, [%[a], #20]\n\t"
        "LDR	lr, [%[a], #24]\n\t"
        "MOV	r11, #0x0\n\t"
        "UMLAL	r6, r11, r12, lr\n\t"
        "ADDS	r7, r7, r11\n\t"
        /* A[5] * A[7] */
        "LDR	lr, [%[a], #28]\n\t"
        "ADC	r8, %[r], #0x0\n\t"
        "UMLAL	r7, r8, r12, lr\n\t"
        /* A[6] * A[7] */
        "LDR	r12, [%[a], #24]\n\t"
        "LDR	lr, [%[a], #28]\n\t"
        "MOV	r9, #0x0\n\t"
        "UMLAL	r8, r9, r12, lr\n\t"
        /* Spill the remaining cross-product words to the scratch area. */
        "ADD	lr, sp, #0x20\n\t"
        "STM	lr, {r3, r4, r5, r6, r7, r8, r9}\n\t"
        /* Double the cross-product sum in place on the stack; the carry out
         * of the doubling becomes the top word. */
        "ADD	lr, sp, #0x4\n\t"
        "LDM	lr, {r4, r5, r6, r7, r8, r9, r10}\n\t"
        "ADDS	r4, r4, r4\n\t"
        "ADCS	r5, r5, r5\n\t"
        "ADCS	r6, r6, r6\n\t"
        "ADCS	r7, r7, r7\n\t"
        "ADCS	r8, r8, r8\n\t"
        "ADCS	r9, r9, r9\n\t"
        "ADCS	r10, r10, r10\n\t"
        "STM	lr!, {r4, r5, r6, r7, r8, r9, r10}\n\t"
        "LDM	lr, {r3, r4, r5, r6, r7, r8, r9}\n\t"
        "ADCS	r3, r3, r3\n\t"
        "ADCS	r4, r4, r4\n\t"
        "ADCS	r5, r5, r5\n\t"
        "ADCS	r6, r6, r6\n\t"
        "ADCS	r7, r7, r7\n\t"
        "ADCS	r8, r8, r8\n\t"
        "ADCS	r9, r9, r9\n\t"
        "ADC	r10, %[r], #0x0\n\t"
        "STM	lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
        /* Add the squares A[i] * A[i] into the doubled sum, lower half
         * first. */
        "ADD	lr, sp, #0x4\n\t"
        "LDM	lr, {r4, r5, r6, r7, r8, r9, r10}\n\t"
        "MOV	lr, sp\n\t"
        /* A[0] * A[0] */
        "LDR	r12, [%[a]]\n\t"
        "UMULL	r3, r11, r12, r12\n\t"
        "ADDS	r4, r4, r11\n\t"
        /* A[1] * A[1] */
        "LDR	r12, [%[a], #4]\n\t"
        "ADCS	r5, r5, #0x0\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r5, r11, r12, r12\n\t"
        "ADDS	r6, r6, r11\n\t"
        /* A[2] * A[2] */
        "LDR	r12, [%[a], #8]\n\t"
        "ADCS	r7, r7, #0x0\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r7, r11, r12, r12\n\t"
        "ADDS	r8, r8, r11\n\t"
        /* A[3] * A[3] */
        "LDR	r12, [%[a], #12]\n\t"
        "ADCS	r9, r9, #0x0\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r9, r11, r12, r12\n\t"
        "ADDS	r10, r10, r11\n\t"
        "STM	lr!, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
        "LDM	lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
        /* A[4] * A[4] */
        "LDR	r12, [%[a], #16]\n\t"
        "ADCS	r3, r3, #0x0\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r3, r11, r12, r12\n\t"
        "ADDS	r4, r4, r11\n\t"
        /* A[5] * A[5] */
        "LDR	r12, [%[a], #20]\n\t"
        "ADCS	r5, r5, #0x0\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r5, r11, r12, r12\n\t"
        "ADDS	r6, r6, r11\n\t"
        /* A[6] * A[6] */
        "LDR	r12, [%[a], #24]\n\t"
        "ADCS	r7, r7, #0x0\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r7, r11, r12, r12\n\t"
        "ADDS	r8, r8, r11\n\t"
        /* A[7] * A[7] */
        "LDR	r12, [%[a], #28]\n\t"
        "ADCS	r9, r9, #0x0\n\t"
        "ADC	r10, r10, #0x0\n\t"
        "UMLAL	r9, r10, r12, r12\n\t"
        /* Recover the result pointer, store the upper eight words from
         * registers, then copy the lower eight words from the stack. */
        "LDR	%[r], [sp, #64]\n\t"
        "ADD	%[r], %[r], #0x20\n\t"
        "STM	%[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
        "LDM	sp, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
        "SUB	%[r], %[r], #0x20\n\t"
        "STM	%[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
        "ADD	sp, sp, #0x44\n\t"
        : [r] "+r" (r), [a] "+r" (a)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
    );
}

#else
/* Square a and put result in r. (r = a * a)
 *
 * Variant built when WOLFSSL_ARM_ARCH_7M is not defined; uses UMAAL. All
 * eight digits of a are loaded into r0-r7 up front, so the result pointer is
 * parked on the stack until the end. Result words R[0..6] are staged on the
 * stack and R[7..15] stay in registers until the final stores.
 *
 * NOTE(review): the asm names r0 and r1 explicitly, but they are not in the
 * clobber list; this appears to rely on r and a being bound to r0 and r1,
 * which only happens when WOLFSSL_NO_VAR_ASSIGN_REG is not defined - confirm
 * for builds that define it.
 *
 * r  A single precision integer. Receives 16 digits.
 * a  A single precision integer. 8 digits are read.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p)
#else
SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Reserve 32 bytes of stack: result words R[0..6] go to
         * [sp, #0] .. [sp, #24] and the result pointer to [sp, #28]. */
        "SUB	sp, sp, #0x20\n\t"
        "STR	%[r], [sp, #28]\n\t"
        /* A[0..7] -> r0..r7; lr is kept at zero from here on. */
        "LDM	%[a], {r0, r1, r2, r3, r4, r5, r6, r7}\n\t"
        "UMULL	r9, r10, r0, r0\n\t"
        "UMULL	r11, r12, r0, r1\n\t"
        "ADDS	r11, r11, r11\n\t"
        "MOV	lr, #0x0\n\t"
        "UMAAL	r10, r11, lr, lr\n\t"
        "STM	sp, {r9, r10}\n\t"
        "MOV	r8, lr\n\t"
        "UMAAL	r8, r12, r0, r2\n\t"
        "ADCS	r8, r8, r8\n\t"
        "UMAAL	r8, r11, r1, r1\n\t"
        "UMULL	r9, r10, r0, r3\n\t"
        "UMAAL	r9, r12, r1, r2\n\t"
        "ADCS	r9, r9, r9\n\t"
        "UMAAL	r9, r11, lr, lr\n\t"
        "STRD	r8, r9, [sp, #8]\n\t"
        "MOV	r9, lr\n\t"
        "UMAAL	r9, r10, r0, r4\n\t"
        "UMAAL	r9, r12, r1, r3\n\t"
        "ADCS	r9, r9, r9\n\t"
        "UMAAL	r9, r11, r2, r2\n\t"
        "STR	r9, [sp, #16]\n\t"
        "UMULL	r9, r8, r0, r5\n\t"
        "UMAAL	r9, r12, r1, r4\n\t"
        "UMAAL	r9, r10, r2, r3\n\t"
        "ADCS	r9, r9, r9\n\t"
        "UMAAL	r9, r11, lr, lr\n\t"
        "STR	r9, [sp, #20]\n\t"
        "MOV	r9, lr\n\t"
        "UMAAL	r9, r8, r0, r6\n\t"
        "UMAAL	r9, r12, r1, r5\n\t"
        "UMAAL	r9, r10, r2, r4\n\t"
        "ADCS	r9, r9, r9\n\t"
        "UMAAL	r9, r11, r3, r3\n\t"
        "STR	r9, [sp, #24]\n\t"
        "UMULL	r0, r9, r0, r7\n\t"
        "UMAAL	r0, r8, r1, r6\n\t"
        "UMAAL	r0, r12, r2, r5\n\t"
        "UMAAL	r0, r10, r3, r4\n\t"
        "ADCS	r0, r0, r0\n\t"
        "UMAAL	r0, r11, lr, lr\n\t"
        /* R[7] = r0 */
        "UMAAL	r9, r8, r1, r7\n\t"
        "UMAAL	r9, r10, r2, r6\n\t"
        "UMAAL	r12, r9, r3, r5\n\t"
        "ADCS	r12, r12, r12\n\t"
        "UMAAL	r12, r11, r4, r4\n\t"
        /* R[8] = r12 */
        "UMAAL	r9, r8, r2, r7\n\t"
        "UMAAL	r10, r9, r3, r6\n\t"
        "MOV	r2, lr\n\t"
        "UMAAL	r10, r2, r4, r5\n\t"
        "ADCS	r10, r10, r10\n\t"
        "UMAAL	r11, r10, lr, lr\n\t"
        /* R[9] = r11 */
        "UMAAL	r2, r8, r3, r7\n\t"
        "UMAAL	r2, r9, r4, r6\n\t"
        "ADCS	r3, r2, r2\n\t"
        "UMAAL	r10, r3, r5, r5\n\t"
        /* R[10] = r10 */
        "MOV	r1, lr\n\t"
        "UMAAL	r1, r8, r4, r7\n\t"
        "UMAAL	r1, r9, r5, r6\n\t"
        "ADCS	r4, r1, r1\n\t"
        "UMAAL	r3, r4, lr, lr\n\t"
        /* R[11] = r3 */
        "UMAAL	r8, r9, r5, r7\n\t"
        "ADCS	r8, r8, r8\n\t"
        "UMAAL	r4, r8, r6, r6\n\t"
        /* R[12] = r4 */
        "MOV	r5, lr\n\t"
        "UMAAL	r5, r9, r6, r7\n\t"
        "ADCS	r5, r5, r5\n\t"
        "UMAAL	r8, r5, lr, lr\n\t"
        /* R[13] = r8 */
        "ADCS	r9, r9, r9\n\t"
        "UMAAL	r9, r5, r7, r7\n\t"
        "ADCS	r7, r5, lr\n\t"
        /* R[14] = r9 */
        /* R[15] = r7 */
        /* Reload the result pointer, store R[7..15] from registers starting
         * at byte offset 0x1c, then step back to the start of r and copy
         * R[0..6] from the stack. */
        "LDR	lr, [sp, #28]\n\t"
        "ADD	lr, lr, #0x1c\n\t"
        "STM	lr!, {r0, r12}\n\t"
        "STM	lr!, {r11}\n\t"
        "STM	lr!, {r10}\n\t"
        "STM	lr!, {r3, r4, r8, r9}\n\t"
        "STM	lr!, {r7}\n\t"
        "SUB	lr, lr, #0x40\n\t"
        "LDM	sp, {r0, r1, r2, r3, r4, r5, r6}\n\t"
        "STM	lr, {r0, r1, r2, r3, r4, r5, r6}\n\t"
        "ADD	sp, sp, #0x20\n\t"
        : [r] "+r" (r), [a] "+r" (a)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
    );
}

#endif /* WOLFSSL_ARM_ARCH_7M */
/* Sub b from a into r. (r = a - b)
 *
 * Subtracts 8 digits in two groups of four, with the borrow chained through
 * the carry flag. Each group is loaded before the matching store, and
 * sp_2048_sqr_16 calls this with r equal to one of the inputs.
 *
 * r  A single precision integer. Receives the 8-digit difference.
 * a  A single precision integer. 8 digits are read.
 * b  A single precision integer. 8 digits are read.
 *
 * returns 0 when there is no borrow out of the most significant digit, or a
 * value with all 32 bits set when there is.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_2048_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_2048_sub_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Digits 0..3: SUBS starts the borrow chain. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SUBS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 4..7: SBCS continues the chain. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* x - x - borrow: 0 when no borrow is pending, all ones otherwise. */
        "SBC	%[r], r6, r6\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    /* The register that held r now holds the borrow mask, not a pointer. */
    return (word32)(size_t)r;
}

/* Square a and put result in r. (r = a * a)
 *
 * Built from three 8-digit squarings: the low half, the high half and the
 * absolute difference of the two halves.
 *
 * r  A single precision integer. Receives 32 digits.
 * a  A single precision integer. Read as a[0..7] and a[8..15].
 */
SP_NOINLINE static void sp_2048_sqr_16(sp_digit* r, const sp_digit* a)
{
    sp_digit mid[16];
    sp_digit pad[8];
    sp_digit* lo = r;
    sp_digit* hi = &r[16];
    sp_digit* minuend;
    sp_digit* subtrahend;
    sp_digit neg;
    sp_digit carry = 0;

    XMEMSET(pad, 0, sizeof(pad));

    /* mid[0..7] = |a_lo - a_hi|. The first subtraction reports a borrow as an
     * all-ones mask; the mask then picks, without branching, whether the
     * second pass computes (difference - 0) or (0 - difference). */
    neg = sp_2048_sub_8(mid, a, &a[8]);
    minuend    = (sp_digit*)(((sp_digit)pad &   neg ) | ((sp_digit)mid & (~neg)));
    subtrahend = (sp_digit*)(((sp_digit)pad & (~neg)) | ((sp_digit)mid &   neg ));
    (void)sp_2048_sub_8(mid, minuend, subtrahend);

    /* High and low squares go straight into the result; the square of the
     * difference replaces the difference in mid. */
    sp_2048_sqr_8(hi, &a[8]);
    sp_2048_sqr_8(lo, a);
    sp_2048_sqr_8(mid, mid);

    /* Combine mid with both squares, then take it off the middle of the
     * result. Borrows are returned with all bits set, so -= raises carry
     * and += lowers it. */
    carry -= sp_2048_sub_in_place_16(mid, hi);
    carry -= sp_2048_sub_in_place_16(mid, lo);
    carry += sp_2048_sub_in_place_16(&r[8], mid);

    /* Apply the accumulated adjustment to the top eight digits. */
    pad[0] = carry;
    (void)sp_2048_add_8(&r[24], &r[24], pad);
}

/* Sub b from a into r. (r = a - b)
 *
 * Subtracts 16 digits in four groups of four, with the borrow chained through
 * the carry flag. Each group is loaded before the matching store.
 *
 * r  A single precision integer. Receives the 16-digit difference.
 * a  A single precision integer. 16 digits are read.
 * b  A single precision integer. 16 digits are read.
 *
 * returns 0 when there is no borrow out of the most significant digit, or a
 * value with all 32 bits set when there is.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_2048_sub_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_2048_sub_16(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Digits 0..3: SUBS starts the borrow chain. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SUBS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 4..15: SBCS continues the chain. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* x - x - borrow: 0 when no borrow is pending, all ones otherwise. */
        "SBC	%[r], r6, r6\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    /* The register that held r now holds the borrow mask, not a pointer. */
    return (word32)(size_t)r;
}

/* Square a and put result in r. (r = a * a)
 *
 * The 32-digit input is split into a low and a high half of 16 digits each.
 * Three 16-digit squarings are combined: low^2, high^2 and (low - high)^2.
 * The doubled cross product is recovered as
 *   2 * low * high = low^2 + high^2 - (low - high)^2.
 *
 * r  A single precision integer. Receives the 64-digit square.
 * a  A single precision integer. 32 digits are read.
 */
SP_NOINLINE static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a)
{
    sp_digit  mid[32];
    sp_digit  zeros[16];
    sp_digit* lo_sq = r;
    sp_digit* hi_sq = r + 32;
    sp_digit* diff = mid;
    sp_digit* minuend;
    sp_digit* subtrahend;
    sp_digit  neg;
    sp_digit  carry = 0;

    XMEMSET(zeros, 0, sizeof(zeros));

    /* diff = low - high. sp_2048_sub_16() returns all ones when it borrowed
     * and zero otherwise. That mask picks the operands of the second
     * subtraction, (diff - 0) or (0 - diff), by masking the pointers so that
     * no branch depends on the data. The result is |low - high|. */
    neg = sp_2048_sub_16(diff, a, a + 16);
    minuend    = (sp_digit*)(((sp_digit)diff & (~neg)) | ((sp_digit)zeros &   neg ));
    subtrahend = (sp_digit*)(((sp_digit)diff &   neg ) | ((sp_digit)zeros & (~neg)));
    (void)sp_2048_sub_16(diff, minuend, subtrahend);

    /* high^2 into the top half of r, low^2 into the bottom half and the
     * square of the difference into the local buffer. */
    sp_2048_sqr_16(hi_sq, a + 16);
    sp_2048_sqr_16(lo_sq, a);
    sp_2048_sqr_16(mid, diff);

    /* mid = diff^2 - high^2 - low^2, then r[16..] -= mid. The values returned
     * by the three in-place subtractions are accumulated (two subtracted, one
     * added) and the total is added into r from digit 48 upwards. */
    carry -= sp_2048_sub_in_place_32(mid, hi_sq);
    carry -= sp_2048_sub_in_place_32(mid, lo_sq);
    carry += sp_2048_sub_in_place_32(r + 16, mid);
    zeros[0] = carry;
    (void)sp_2048_add_16(r + 48, r + 48, zeros);
}

/* Sub b from a into r. (r = a - b)
 *
 * Fully unrolled: eight groups of four digits are loaded from each operand,
 * subtracted with the borrow carried from group to group in the carry flag,
 * and stored to r (32 digits in total).
 *
 * r  A single precision integer. Receives the 32-digit difference.
 * a  A single precision integer.
 * b  A single precision integer.
 *
 * returns 0 when the subtraction did not borrow out of the top digit and a
 * value with all 32 bits set when it did.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_2048_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_2048_sub_32(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0, r1 and r2. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Digits 0-3: SUBS starts the borrow chain. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SUBS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 4-7: LDM/STM leave the flags alone, so SBCS continues the
         * chain. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 8-11. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 12-15. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 16-19. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 20-23. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 24-27. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 28-31. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* r6 - r6 - borrow: turns the final borrow into 0 or all ones. The
         * result overwrites the r pointer, which is what gets returned. */
        "SBC	%[r], r6, r6\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    return (word32)(size_t)r;
}

/* Square a and put result in r. (r = a * a)
 *
 * The 64-digit input is split into a low and a high half of 32 digits each.
 * Three 32-digit squarings are combined: low^2, high^2 and (low - high)^2.
 * The doubled cross product is recovered as
 *   2 * low * high = low^2 + high^2 - (low - high)^2.
 *
 * r  A single precision integer. Receives the 128-digit square.
 * a  A single precision integer. 64 digits are read.
 */
SP_NOINLINE static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a)
{
    sp_digit  mid[64];
    sp_digit  zeros[32];
    sp_digit* lo_sq = r;
    sp_digit* hi_sq = r + 64;
    sp_digit* diff = mid;
    sp_digit* minuend;
    sp_digit* subtrahend;
    sp_digit  neg;
    sp_digit  carry = 0;

    XMEMSET(zeros, 0, sizeof(zeros));

    /* diff = low - high. sp_2048_sub_32() returns all ones when it borrowed
     * and zero otherwise. That mask picks the operands of the second
     * subtraction, (diff - 0) or (0 - diff), by masking the pointers so that
     * no branch depends on the data. The result is |low - high|. */
    neg = sp_2048_sub_32(diff, a, a + 32);
    minuend    = (sp_digit*)(((sp_digit)diff & (~neg)) | ((sp_digit)zeros &   neg ));
    subtrahend = (sp_digit*)(((sp_digit)diff &   neg ) | ((sp_digit)zeros & (~neg)));
    (void)sp_2048_sub_32(diff, minuend, subtrahend);

    /* high^2 into the top half of r, low^2 into the bottom half and the
     * square of the difference into the local buffer. */
    sp_2048_sqr_32(hi_sq, a + 32);
    sp_2048_sqr_32(lo_sq, a);
    sp_2048_sqr_32(mid, diff);

    /* mid = diff^2 - high^2 - low^2, then r[32..] -= mid. The values returned
     * by the three in-place subtractions are accumulated (two subtracted, one
     * added) and the total is added into r from digit 96 upwards. */
    carry -= sp_2048_sub_in_place_64(mid, hi_sq);
    carry -= sp_2048_sub_in_place_64(mid, lo_sq);
    carry += sp_2048_sub_in_place_64(r + 32, mid);
    zeros[0] = carry;
    (void)sp_2048_add_32(r + 96, r + 96, zeros);
}

#endif /* !WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Add b to a into r. (r = a + b)
 *
 * Looped implementation: each pass adds four digits, and the carry is kept
 * in a register between passes because the loop compare overwrites the
 * flags. 0x100 bytes (64 digits) of each operand are processed.
 *
 * r  A single precision integer. Receives the 64-digit sum.
 * a  A single precision integer.
 * b  A single precision integer.
 *
 * returns the carry out of the top digit (0 or 1).
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_2048_add_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_2048_add_64(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0, r1 and r2. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r3 = saved carry (starts at 0), r12 = address just past the end of
         * a, used as the loop bound. */
        "MOV	r3, #0x0\n\t"
        "ADD	r12, %[a], #0x100\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_add_64_word:\n\t"
#else
    "L_sp_2048_add_64_word_%=:\n\t"
#endif
        /* Move the saved carry back into the carry flag: r3 + 0xffffffff
         * carries out exactly when r3 is 1. */
        "ADDS	r3, r3, #0xffffffff\n\t"
        "LDM	%[a]!, {r4, r5, r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9, r10, r11}\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "ADCS	r7, r7, r11\n\t"
        "STM	%[r]!, {r4, r5, r6, r7}\n\t"
        /* Save the carry flag into r3 (0 + 0 + carry) before CMP destroys
         * it. */
        "MOV	r4, #0x0\n\t"
        "ADC	r3, r4, #0x0\n\t"
        "CMP	%[a], r12\n\t"
#if defined(__GNUC__)
        "BNE	L_sp_2048_add_64_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BNE.N	L_sp_2048_add_64_word\n\t"
#else
        "BNE.N	L_sp_2048_add_64_word_%=\n\t"
#endif
        /* The final carry replaces the r pointer, which is what gets
         * returned. */
        "MOV	%[r], r3\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "cc"
    );
    return (word32)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Sub b from a into a. (a -= b)
 *
 * Looped implementation: each pass subtracts four digits, and the borrow is
 * kept in a register between passes because the loop compare overwrites the
 * flags. 0x100 bytes (64 digits) are processed.
 *
 * a  A single precision integer. Updated in place.
 * b  A single precision integer.
 *
 * returns 0 when the subtraction did not borrow out of the top digit and a
 * value with all 32 bits set when it did.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_2048_sub_in_place_64(sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_2048_sub_in_place_64(sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0 and r1. */
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r10 = saved borrow (0 or all ones, starts at 0), r11 = address just
         * past the end of a, used as the loop bound. */
        "MOV	r10, #0x0\n\t"
        "ADD	r11, %[a], #0x100\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_sub_in_pkace_64_word:\n\t"
#else
    "L_sp_2048_sub_in_pkace_64_word_%=:\n\t"
#endif
        /* Move the saved borrow back into the carry flag: 0 - r10 borrows
         * exactly when r10 is non-zero. */
        "RSBS	r10, r10, #0x0\n\t"
        /* a is read without write-back; the STM below advances it. */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Save the borrow as 0 or all ones before CMP destroys the flags. */
        "SBC	r10, r10, r10\n\t"
        "CMP	%[a], r11\n\t"
#if defined(__GNUC__)
        "BNE	L_sp_2048_sub_in_pkace_64_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BNE.N	L_sp_2048_sub_in_pkace_64_word\n\t"
#else
        "BNE.N	L_sp_2048_sub_in_pkace_64_word_%=\n\t"
#endif
        /* The final borrow mask replaces the a pointer, which is what gets
         * returned. */
        "MOV	%[a], r10\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
    );
    return (word32)(size_t)a;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * Looped implementation: the result digits are produced one at a time, each
 * by summing every partial product a[i] * b[j] whose offsets add up to the
 * offset of that result digit. The product is built in a 0x200-byte
 * (128-digit) scratch area taken from the stack and copied to r at the end.
 *
 * Register use inside the asm:
 *   r5          byte offset of the result digit being computed
 *   r3, r4      byte offsets into the operands (r3 + r4 == r5)
 *   r6, r7, r8  three-word accumulator (low, middle, high)
 *   r9, r10     low and high word of the current partial product
 *   lr, r11     operand digits
 *
 * r  A single precision integer. Receives the 128-digit product.
 * a  A single precision integer. 64 digits are read.
 * b  A single precision integer. 64 digits are read.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_2048_mul_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static void sp_2048_mul_64(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0, r1 and r2. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Reserve the scratch area; the store loop at the end pops it. */
        "SUB	sp, sp, #0x200\n\t"
        /* Result digit 0 = low word of a[0] * b[0]; the high word seeds the
         * accumulator for digit 1. */
        "LDR	lr, [%[a]]\n\t"
        "LDR	r11, [%[b]]\n\t"
        "UMULL	r8, r6, lr, r11\n\t"
        "STR	r8, [sp]\n\t"
        "MOV	r7, #0x0\n\t"
        "MOV	r8, #0x0\n\t"
        "MOV	r5, #0x4\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_mul_64_outer:\n\t"
#else
    "L_sp_2048_mul_64_outer_%=:\n\t"
#endif
        /* r3 = max(r5 - 0xfc, 0): lowest operand offset that still pairs with
         * an in-range partner (0xfc is the offset of digit 63). r4 is the
         * matching partner offset. */
        "SUBS	r3, r5, #0xfc\n\t"
        "IT	cc\n\t"
        "MOVCC	r3, #0x0\n\t"
        "SUB	r4, r5, r3\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_mul_64_inner:\n\t"
#else
    "L_sp_2048_mul_64_inner_%=:\n\t"
#endif
        /* Accumulate a[r3] * b[r4] ... */
        "LDR	lr, [%[a], r3]\n\t"
        "LDR	r11, [%[b], r4]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        /* ... and the mirrored product a[r4] * b[r3]. */
        "LDR	lr, [%[a], r4]\n\t"
        "LDR	r11, [%[b], r3]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        /* Step the two offsets towards each other. */
        "ADD	r3, r3, #0x4\n\t"
        "SUB	r4, r4, #0x4\n\t"
        "CMP	r3, r4\n\t"
        /* Offsets crossed: every pair for this digit has been added. */
#if defined(__GNUC__)
        "BGT	L_sp_2048_mul_64_inner_done_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_2048_mul_64_inner_done\n\t"
#else
        "BGT.N	L_sp_2048_mul_64_inner_done_%=\n\t"
#endif
        /* Offsets not yet met: more pairs remain. */
#if defined(__GNUC__)
        "BLT	L_sp_2048_mul_64_inner_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_2048_mul_64_inner\n\t"
#else
        "BLT.N	L_sp_2048_mul_64_inner_%=\n\t"
#endif
        /* Offsets equal: the middle product a[r3] * b[r3] is added once. */
        "LDR	lr, [%[a], r3]\n\t"
        "LDR	r11, [%[b], r3]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_mul_64_inner_done:\n\t"
#else
    "L_sp_2048_mul_64_inner_done_%=:\n\t"
#endif
        /* Store the finished digit and shift the accumulator down one
         * word. */
        "STR	r6, [sp, r5]\n\t"
        "MOV	r6, r7\n\t"
        "MOV	r7, r8\n\t"
        "MOV	r8, #0x0\n\t"
        "ADD	r5, r5, #0x4\n\t"
        /* The loop covers result offsets up to and including 0x1f4. */
        "CMP	r5, #0x1f4\n\t"
#if defined(__GNUC__)
        "BLE	L_sp_2048_mul_64_outer_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLE.N	L_sp_2048_mul_64_outer\n\t"
#else
        "BLE.N	L_sp_2048_mul_64_outer_%=\n\t"
#endif
        /* Last partial product a[63] * b[63] yields the top two digits. */
        "LDR	lr, [%[a], #252]\n\t"
        "LDR	r11, [%[b], #252]\n\t"
        "UMLAL	r6, r7, lr, r11\n\t"
        "STR	r6, [sp, r5]\n\t"
        "ADD	r5, r5, #0x4\n\t"
        "STR	r7, [sp, r5]\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_mul_64_store:\n\t"
#else
    "L_sp_2048_mul_64_store_%=:\n\t"
#endif
        /* Copy the scratch area to r eight digits at a time. The write-back
         * on sp releases the stack space as it goes; r5 (0x1fc on entry)
         * counts the loop down. */
        "LDM	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "STM	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "SUBS	r5, r5, #0x20\n\t"
#if defined(__GNUC__)
        "BGT	L_sp_2048_mul_64_store_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_2048_mul_64_store\n\t"
#else
        "BGT.N	L_sp_2048_mul_64_store_%=\n\t"
#endif
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
    );
}

/* Square a and put result in r. (r = a * a)
 *
 * Looped implementation: the result digits are produced one at a time. For
 * each digit, every product a[i] * a[j] with i < j whose offsets add up to the
 * digit's offset is added twice, and the square a[i] * a[i] (when the offsets
 * meet) is added once. The result is built in a 0x200-byte (128-digit)
 * scratch area taken from the stack and copied to r at the end.
 *
 * Register use inside the asm:
 *   r5          byte offset of the result digit being computed
 *   r3, r4      byte offsets into a (r3 + r4 == r5)
 *   r6, r7, r8  three-word accumulator (low, middle, high)
 *   r9, r10     low and high word of the current partial product
 *   lr, r11     operand digits
 *
 * r  A single precision integer. Receives the 128-digit square.
 * a  A single precision integer. 64 digits are read.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_2048_sqr_64(sp_digit* r_p, const sp_digit* a_p)
#else
static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0 and r1. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Reserve the scratch area; the store loop at the end pops it. */
        "SUB	sp, sp, #0x200\n\t"
        /* Result digit 0 = low word of a[0]^2; the high word seeds the
         * accumulator for digit 1. */
        "LDR	lr, [%[a]]\n\t"
        "UMULL	r8, r6, lr, lr\n\t"
        "STR	r8, [sp]\n\t"
        "MOV	r7, #0x0\n\t"
        "MOV	r8, #0x0\n\t"
        "MOV	r5, #0x4\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_sqr_64_outer:\n\t"
#else
    "L_sp_2048_sqr_64_outer_%=:\n\t"
#endif
        /* r3 = max(r5 - 0xfc, 0): lowest offset that still pairs with an
         * in-range partner (0xfc is the offset of digit 63). r4 is the
         * matching partner offset. */
        "SUBS	r3, r5, #0xfc\n\t"
        "IT	cc\n\t"
        "MOVCC	r3, #0x0\n\t"
        "SUB	r4, r5, r3\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_sqr_64_inner:\n\t"
#else
    "L_sp_2048_sqr_64_inner_%=:\n\t"
#endif
        /* Cross product a[r3] * a[r4], added to the accumulator twice. */
        "LDR	lr, [%[a], r3]\n\t"
        "LDR	r11, [%[a], r4]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        /* Step the two offsets towards each other. */
        "ADD	r3, r3, #0x4\n\t"
        "SUB	r4, r4, #0x4\n\t"
        "CMP	r3, r4\n\t"
        /* Offsets crossed: every pair for this digit has been added. */
#if defined(__GNUC__)
        "BGT	L_sp_2048_sqr_64_inner_done_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_2048_sqr_64_inner_done\n\t"
#else
        "BGT.N	L_sp_2048_sqr_64_inner_done_%=\n\t"
#endif
        /* Offsets not yet met: more pairs remain. */
#if defined(__GNUC__)
        "BLT	L_sp_2048_sqr_64_inner_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_2048_sqr_64_inner\n\t"
#else
        "BLT.N	L_sp_2048_sqr_64_inner_%=\n\t"
#endif
        /* Offsets equal: the square a[r3]^2 is added once. */
        "LDR	lr, [%[a], r3]\n\t"
        "UMULL	r9, r10, lr, lr\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_sqr_64_inner_done:\n\t"
#else
    "L_sp_2048_sqr_64_inner_done_%=:\n\t"
#endif
        /* Store the finished digit and shift the accumulator down one
         * word. */
        "STR	r6, [sp, r5]\n\t"
        "MOV	r6, r7\n\t"
        "MOV	r7, r8\n\t"
        "MOV	r8, #0x0\n\t"
        "ADD	r5, r5, #0x4\n\t"
        /* The loop covers result offsets up to and including 0x1f4. */
        "CMP	r5, #0x1f4\n\t"
#if defined(__GNUC__)
        "BLE	L_sp_2048_sqr_64_outer_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLE.N	L_sp_2048_sqr_64_outer\n\t"
#else
        "BLE.N	L_sp_2048_sqr_64_outer_%=\n\t"
#endif
        /* Last term a[63]^2 yields the top two digits. */
        "LDR	lr, [%[a], #252]\n\t"
        "UMLAL	r6, r7, lr, lr\n\t"
        "STR	r6, [sp, r5]\n\t"
        "ADD	r5, r5, #0x4\n\t"
        "STR	r7, [sp, r5]\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_sqr_64_store:\n\t"
#else
    "L_sp_2048_sqr_64_store_%=:\n\t"
#endif
        /* Copy the scratch area to r eight digits at a time. The write-back
         * on sp releases the stack space as it goes; r5 (0x1fc on entry)
         * counts the loop down. */
        "LDM	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "STM	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "SUBS	r5, r5, #0x20\n\t"
#if defined(__GNUC__)
        "BGT	L_sp_2048_sqr_64_store_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_2048_sqr_64_store\n\t"
#else
        "BGT.N	L_sp_2048_sqr_64_store_%=\n\t"
#endif
        : [r] "+r" (r), [a] "+r" (a)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
    );
}

#endif /* WOLFSSL_SP_SMALL */
#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
#ifdef WOLFSSL_SP_SMALL
/* AND m into each word of a and store in r.
 *
 * Exactly 32 digits are processed, from index 0 upwards.
 *
 * r  A single precision integer. Receives a[i] & m for each digit.
 * a  A single precision integer.
 * m  Mask to AND against each digit.
 */
static void sp_2048_mask_32(sp_digit* r, const sp_digit* a, sp_digit m)
{
    int idx = 0;

    while (idx < 32) {
        r[idx] = a[idx] & m;
        idx++;
    }
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Add b to a into r. (r = a + b)
 *
 * Looped implementation: each pass adds four digits, and the carry is kept
 * in a register between passes because the loop compare overwrites the
 * flags. 0x80 bytes (32 digits) of each operand are processed.
 *
 * r  A single precision integer. Receives the 32-digit sum.
 * a  A single precision integer.
 * b  A single precision integer.
 *
 * returns the carry out of the top digit (0 or 1).
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_2048_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0, r1 and r2. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r3 = saved carry (starts at 0), r12 = address just past the end of
         * a, used as the loop bound. */
        "MOV	r3, #0x0\n\t"
        "ADD	r12, %[a], #0x80\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_add_32_word:\n\t"
#else
    "L_sp_2048_add_32_word_%=:\n\t"
#endif
        /* Move the saved carry back into the carry flag: r3 + 0xffffffff
         * carries out exactly when r3 is 1. */
        "ADDS	r3, r3, #0xffffffff\n\t"
        "LDM	%[a]!, {r4, r5, r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9, r10, r11}\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "ADCS	r7, r7, r11\n\t"
        "STM	%[r]!, {r4, r5, r6, r7}\n\t"
        /* Save the carry flag into r3 (0 + 0 + carry) before CMP destroys
         * it. */
        "MOV	r4, #0x0\n\t"
        "ADC	r3, r4, #0x0\n\t"
        "CMP	%[a], r12\n\t"
#if defined(__GNUC__)
        "BNE	L_sp_2048_add_32_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BNE.N	L_sp_2048_add_32_word\n\t"
#else
        "BNE.N	L_sp_2048_add_32_word_%=\n\t"
#endif
        /* The final carry replaces the r pointer, which is what gets
         * returned. */
        "MOV	%[r], r3\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "cc"
    );
    return (word32)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Sub b from a into a. (a -= b)
 *
 * Looped implementation: each pass subtracts four digits, and the borrow is
 * kept in a register between passes because the loop compare overwrites the
 * flags. 0x80 bytes (32 digits) are processed.
 *
 * a  A single precision integer. Updated in place.
 * b  A single precision integer.
 *
 * returns 0 when the subtraction did not borrow out of the top digit and a
 * value with all 32 bits set when it did.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_2048_sub_in_place_32(sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_2048_sub_in_place_32(sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0 and r1. */
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r10 = saved borrow (0 or all ones, starts at 0), r11 = address just
         * past the end of a, used as the loop bound. */
        "MOV	r10, #0x0\n\t"
        "ADD	r11, %[a], #0x80\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_sub_in_pkace_32_word:\n\t"
#else
    "L_sp_2048_sub_in_pkace_32_word_%=:\n\t"
#endif
        /* Move the saved borrow back into the carry flag: 0 - r10 borrows
         * exactly when r10 is non-zero. */
        "RSBS	r10, r10, #0x0\n\t"
        /* a is read without write-back; the STM below advances it. */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Save the borrow as 0 or all ones before CMP destroys the flags. */
        "SBC	r10, r10, r10\n\t"
        "CMP	%[a], r11\n\t"
#if defined(__GNUC__)
        "BNE	L_sp_2048_sub_in_pkace_32_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BNE.N	L_sp_2048_sub_in_pkace_32_word\n\t"
#else
        "BNE.N	L_sp_2048_sub_in_pkace_32_word_%=\n\t"
#endif
        /* The final borrow mask replaces the a pointer, which is what gets
         * returned. */
        "MOV	%[a], r10\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
    );
    return (word32)(size_t)a;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * Looped implementation: the result digits are produced one at a time, each
 * by summing every partial product a[i] * b[j] whose offsets add up to the
 * offset of that result digit. The product is built in a 0x100-byte
 * (64-digit) scratch area taken from the stack and copied to r at the end.
 *
 * Register use inside the asm:
 *   r5          byte offset of the result digit being computed
 *   r3, r4      byte offsets into the operands (r3 + r4 == r5)
 *   r6, r7, r8  three-word accumulator (low, middle, high)
 *   r9, r10     low and high word of the current partial product
 *   lr, r11     operand digits
 *
 * r  A single precision integer. Receives the 64-digit product.
 * a  A single precision integer. 32 digits are read.
 * b  A single precision integer. 32 digits are read.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_2048_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0, r1 and r2. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Reserve the scratch area; the store loop at the end pops it. */
        "SUB	sp, sp, #0x100\n\t"
        /* Result digit 0 = low word of a[0] * b[0]; the high word seeds the
         * accumulator for digit 1. */
        "LDR	lr, [%[a]]\n\t"
        "LDR	r11, [%[b]]\n\t"
        "UMULL	r8, r6, lr, r11\n\t"
        "STR	r8, [sp]\n\t"
        "MOV	r7, #0x0\n\t"
        "MOV	r8, #0x0\n\t"
        "MOV	r5, #0x4\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_mul_32_outer:\n\t"
#else
    "L_sp_2048_mul_32_outer_%=:\n\t"
#endif
        /* r3 = max(r5 - 0x7c, 0): lowest operand offset that still pairs with
         * an in-range partner (0x7c is the offset of digit 31). r4 is the
         * matching partner offset. */
        "SUBS	r3, r5, #0x7c\n\t"
        "IT	cc\n\t"
        "MOVCC	r3, #0x0\n\t"
        "SUB	r4, r5, r3\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_mul_32_inner:\n\t"
#else
    "L_sp_2048_mul_32_inner_%=:\n\t"
#endif
        /* Accumulate a[r3] * b[r4] ... */
        "LDR	lr, [%[a], r3]\n\t"
        "LDR	r11, [%[b], r4]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        /* ... and the mirrored product a[r4] * b[r3]. */
        "LDR	lr, [%[a], r4]\n\t"
        "LDR	r11, [%[b], r3]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        /* Step the two offsets towards each other. */
        "ADD	r3, r3, #0x4\n\t"
        "SUB	r4, r4, #0x4\n\t"
        "CMP	r3, r4\n\t"
        /* Offsets crossed: every pair for this digit has been added. */
#if defined(__GNUC__)
        "BGT	L_sp_2048_mul_32_inner_done_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_2048_mul_32_inner_done\n\t"
#else
        "BGT.N	L_sp_2048_mul_32_inner_done_%=\n\t"
#endif
        /* Offsets not yet met: more pairs remain. */
#if defined(__GNUC__)
        "BLT	L_sp_2048_mul_32_inner_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_2048_mul_32_inner\n\t"
#else
        "BLT.N	L_sp_2048_mul_32_inner_%=\n\t"
#endif
        /* Offsets equal: the middle product a[r3] * b[r3] is added once. */
        "LDR	lr, [%[a], r3]\n\t"
        "LDR	r11, [%[b], r3]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_mul_32_inner_done:\n\t"
#else
    "L_sp_2048_mul_32_inner_done_%=:\n\t"
#endif
        /* Store the finished digit and shift the accumulator down one
         * word. */
        "STR	r6, [sp, r5]\n\t"
        "MOV	r6, r7\n\t"
        "MOV	r7, r8\n\t"
        "MOV	r8, #0x0\n\t"
        "ADD	r5, r5, #0x4\n\t"
        /* The loop covers result offsets up to and including 0xf4. */
        "CMP	r5, #0xf4\n\t"
#if defined(__GNUC__)
        "BLE	L_sp_2048_mul_32_outer_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLE.N	L_sp_2048_mul_32_outer\n\t"
#else
        "BLE.N	L_sp_2048_mul_32_outer_%=\n\t"
#endif
        /* Last partial product a[31] * b[31] yields the top two digits. */
        "LDR	lr, [%[a], #124]\n\t"
        "LDR	r11, [%[b], #124]\n\t"
        "UMLAL	r6, r7, lr, r11\n\t"
        "STR	r6, [sp, r5]\n\t"
        "ADD	r5, r5, #0x4\n\t"
        "STR	r7, [sp, r5]\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_mul_32_store:\n\t"
#else
    "L_sp_2048_mul_32_store_%=:\n\t"
#endif
        /* Copy the scratch area to r eight digits at a time. The write-back
         * on sp releases the stack space as it goes; r5 (0xfc on entry)
         * counts the loop down. */
        "LDM	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "STM	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "SUBS	r5, r5, #0x20\n\t"
#if defined(__GNUC__)
        "BGT	L_sp_2048_mul_32_store_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_2048_mul_32_store\n\t"
#else
        "BGT.N	L_sp_2048_mul_32_store_%=\n\t"
#endif
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
    );
}

/* Square a and put result in r. (r = a * a)
 *
 * Looped implementation: the result digits are produced one at a time. For
 * each digit, every product a[i] * a[j] with i < j whose offsets add up to the
 * digit's offset is added twice, and the square a[i] * a[i] (when the offsets
 * meet) is added once. The result is built in a 0x100-byte (64-digit) scratch
 * area taken from the stack and copied to r at the end.
 *
 * Register use inside the asm:
 *   r5          byte offset of the result digit being computed
 *   r3, r4      byte offsets into a (r3 + r4 == r5)
 *   r6, r7, r8  three-word accumulator (low, middle, high)
 *   r9, r10     low and high word of the current partial product
 *   lr, r11     operand digits
 *
 * r  A single precision integer. Receives the 64-digit square.
 * a  A single precision integer. 32 digits are read.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_2048_sqr_32(sp_digit* r_p, const sp_digit* a_p)
#else
static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0 and r1. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Reserve the scratch area; the store loop at the end pops it. */
        "SUB	sp, sp, #0x100\n\t"
        /* Result digit 0 = low word of a[0]^2; the high word seeds the
         * accumulator for digit 1. */
        "LDR	lr, [%[a]]\n\t"
        "UMULL	r8, r6, lr, lr\n\t"
        "STR	r8, [sp]\n\t"
        "MOV	r7, #0x0\n\t"
        "MOV	r8, #0x0\n\t"
        "MOV	r5, #0x4\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_sqr_32_outer:\n\t"
#else
    "L_sp_2048_sqr_32_outer_%=:\n\t"
#endif
        /* r3 = max(r5 - 0x7c, 0): lowest offset that still pairs with an
         * in-range partner (0x7c is the offset of digit 31). r4 is the
         * matching partner offset. */
        "SUBS	r3, r5, #0x7c\n\t"
        "IT	cc\n\t"
        "MOVCC	r3, #0x0\n\t"
        "SUB	r4, r5, r3\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_sqr_32_inner:\n\t"
#else
    "L_sp_2048_sqr_32_inner_%=:\n\t"
#endif
        /* Cross product a[r3] * a[r4], added to the accumulator twice. */
        "LDR	lr, [%[a], r3]\n\t"
        "LDR	r11, [%[a], r4]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        /* Step the two offsets towards each other. */
        "ADD	r3, r3, #0x4\n\t"
        "SUB	r4, r4, #0x4\n\t"
        "CMP	r3, r4\n\t"
        /* Offsets crossed: every pair for this digit has been added. */
#if defined(__GNUC__)
        "BGT	L_sp_2048_sqr_32_inner_done_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_2048_sqr_32_inner_done\n\t"
#else
        "BGT.N	L_sp_2048_sqr_32_inner_done_%=\n\t"
#endif
        /* Offsets not yet met: more pairs remain. */
#if defined(__GNUC__)
        "BLT	L_sp_2048_sqr_32_inner_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_2048_sqr_32_inner\n\t"
#else
        "BLT.N	L_sp_2048_sqr_32_inner_%=\n\t"
#endif
        /* Offsets equal: the square a[r3]^2 is added once. */
        "LDR	lr, [%[a], r3]\n\t"
        "UMULL	r9, r10, lr, lr\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_sqr_32_inner_done:\n\t"
#else
    "L_sp_2048_sqr_32_inner_done_%=:\n\t"
#endif
        /* Store the finished digit and shift the accumulator down one
         * word. */
        "STR	r6, [sp, r5]\n\t"
        "MOV	r6, r7\n\t"
        "MOV	r7, r8\n\t"
        "MOV	r8, #0x0\n\t"
        "ADD	r5, r5, #0x4\n\t"
        /* The loop covers result offsets up to and including 0xf4. */
        "CMP	r5, #0xf4\n\t"
#if defined(__GNUC__)
        "BLE	L_sp_2048_sqr_32_outer_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLE.N	L_sp_2048_sqr_32_outer\n\t"
#else
        "BLE.N	L_sp_2048_sqr_32_outer_%=\n\t"
#endif
        /* Last term a[31]^2 yields the top two digits. */
        "LDR	lr, [%[a], #124]\n\t"
        "UMLAL	r6, r7, lr, lr\n\t"
        "STR	r6, [sp, r5]\n\t"
        "ADD	r5, r5, #0x4\n\t"
        "STR	r7, [sp, r5]\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_sqr_32_store:\n\t"
#else
    "L_sp_2048_sqr_32_store_%=:\n\t"
#endif
        /* Copy the scratch area to r eight digits at a time. The write-back
         * on sp releases the stack space as it goes; r5 (0xfc on entry)
         * counts the loop down. */
        "LDM	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "STM	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "SUBS	r5, r5, #0x20\n\t"
#if defined(__GNUC__)
        "BGT	L_sp_2048_sqr_32_store_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_2048_sqr_32_store\n\t"
#else
        "BGT.N	L_sp_2048_sqr_32_store_%=\n\t"
#endif
        : [r] "+r" (r), [a] "+r" (a)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
    );
}

#endif /* WOLFSSL_SP_SMALL */
#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */

/* Calculate the bottom digit of -1/a mod 2^n.
 *
 * Only the least significant digit of a is used. An inverse that is correct
 * to 4 bits is formed directly and then refined with three Newton steps,
 * each of which doubles the number of correct low bits (8, 16, 32).
 *
 * a    A single precision number.
 * rho  Bottom word of inverse. Receives the negated inverse.
 */
static void sp_2048_mont_setup(const sp_digit* a, sp_digit* rho)
{
    const sp_digit m0 = a[0];
    sp_digit inv;
    int step;

    /* inv * m0 == 1 mod 2**4 */
    inv = (((m0 + 2) & 4) << 1) + m0;

    /* After the steps: inv * m0 == 1 mod 2**8, 2**16, 2**32. */
    for (step = 0; step < 3; step++) {
        inv *= 2 - m0 * inv;
    }

    /* rho = -1/a[0] modulo the digit size */
    *rho = (sp_digit)0 - inv;
}

#ifdef WOLFSSL_SP_SMALL
/* Mul a by digit b into r. (r = a * b)
 *
 * Looped implementation: one digit of a is multiplied by b per pass and
 * added into a three-word running accumulator (r3 low, r4 middle, r5 high).
 * The low word is stored as the next result digit and the accumulator is
 * shifted down one word.
 *
 * r  A single precision integer. Receives 65 digits: offsets 0 to 0xfc are
 *    written by the first store and the loop, offset 256 by the final store.
 * a  A single precision integer. 64 digits are read.
 * b  A single precision digit.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_2048_mul_d_64(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
#else
static void sp_2048_mul_d_64(sp_digit* r, const sp_digit* a, sp_digit b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0, r1 and r2. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* A[0] * B */
        /* Low word goes straight to r[0]; the high word (r3) starts the
         * accumulator. r9 is the byte offset of the next digit. */
        "LDR	r8, [%[a]]\n\t"
        "UMULL	r5, r3, %[b], r8\n\t"
        "MOV	r4, #0x0\n\t"
        "STR	r5, [%[r]]\n\t"
        "MOV	r5, #0x0\n\t"
        "MOV	r9, #0x4\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_mul_d_64_word:\n\t"
#else
    "L_sp_2048_mul_d_64_word_%=:\n\t"
#endif
        /* A[i] * B */
        "LDR	r8, [%[a], r9]\n\t"
        "UMULL	r6, r7, %[b], r8\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        "STR	r3, [%[r], r9]\n\t"
        /* Shift the accumulator down one word. */
        "MOV	r3, r4\n\t"
        "MOV	r4, r5\n\t"
        "MOV	r5, #0x0\n\t"
        "ADD	r9, r9, #0x4\n\t"
        /* Continue while the offset is below 0x100 (64 digits). */
        "CMP	r9, #0x100\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_2048_mul_d_64_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_2048_mul_d_64_word\n\t"
#else
        "BLT.N	L_sp_2048_mul_d_64_word_%=\n\t"
#endif
        /* The remaining accumulator word is the top (65th) digit. */
        "STR	r3, [%[r], #256]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
    );
}

#else
/* Mul a by digit b into r. (r = a * b)
 *
 * Fully unrolled variant: this definition sits in the #else branch of the
 * WOLFSSL_SP_SMALL conditional. 64 words of a are read and 65 words of r are
 * written (64 through post-incrementing stores plus one final carry word).
 *
 * Registers r3, r4 and r5 rotate through three roles: the word about to be
 * stored, the carry word for the next step, and a freshly zeroed register
 * that becomes the next carry word. Each UMLAL adds the 64-bit product
 * b * a[i] to the pair (carry word, zeroed register), so the carry from the
 * previous step is absorbed without a separate add.
 *
 * When WOLFSSL_NO_VAR_ASSIGN_REG is not defined the parameters are re-bound
 * to local variables pinned to r0-r2 for use as asm operands.
 *
 * r  A single precision integer. Receives 65 words.
 * a  A single precision integer. 64 words are read.
 * b  A single precision digit.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_2048_mul_d_64(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
#else
static void sp_2048_mul_d_64(sp_digit* r, const sp_digit* a, sp_digit b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    /* LDM/STM with writeback advance a and r by one word per step, which is
     * why both pointers are read-write ("+r") operands. */
    __asm__ __volatile__ (
        /* A[0] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMULL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[1] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[2] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[3] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[4] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[5] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[6] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[7] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[8] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[9] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[10] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[11] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[12] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[13] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[14] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[15] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[16] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[17] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[18] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[19] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[20] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[21] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[22] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[23] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[24] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[25] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[26] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[27] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[28] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[29] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[30] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[31] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[32] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[33] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[34] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[35] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[36] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[37] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[38] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[39] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[40] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[41] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[42] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[43] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[44] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[45] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[46] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[47] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[48] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[49] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[50] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[51] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[52] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[53] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[54] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[55] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[56] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[57] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[58] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[59] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[60] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[61] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[62] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[63] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        /* Final carry word becomes the 65th result word. */
        "STR	r4, [%[r]]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        /* r6 and r7 are declared clobbered although the instructions above
         * never reference them. */
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "cc"
    );
}

#endif /* WOLFSSL_SP_SMALL */
#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
/* Calculate the Montgomery normalizer. (r = 2^n mod m)
 *
 * n is the number of bits being reduced by. The result is produced by
 * clearing all 32 digits of r and then subtracting m from it in place
 * (this relies on sp_2048_sub_in_place_32 wrapping over the 32 digits, and
 * on m occupying the full width so that a single subtraction is enough).
 *
 * r  A single precision number. All 32 digits are overwritten.
 * m  A single precision number: the modulus.
 */
static void sp_2048_mont_norm_32(sp_digit* r, const sp_digit* m)
{
    /* Start from zero across the whole 32-digit result. */
    XMEMSET(r, 0, 32 * sizeof(*r));

    /* 0 - m: leaves the normalizer in r. */
    sp_2048_sub_in_place_32(r, m);
}

#ifdef WOLFSSL_SP_SMALL
/* Conditionally subtract b from a using the mask m. (r = a - (b & m))
 *
 * Loop variant, built when WOLFSSL_SP_SMALL is defined. Every word of b is
 * ANDed with m before it is subtracted, so a mask of all ones (-1) subtracts
 * b and a mask of 0 subtracts zero, which copies a into r. The same
 * instruction sequence runs for either mask value.
 *
 * When WOLFSSL_NO_VAR_ASSIGN_REG is not defined the parameters are re-bound
 * to local variables pinned to r0-r3 for use as asm operands.
 *
 * r  A single precision number representing condition subtract result.
 *    32 words are written.
 * a  A single precision number to subtract from. 32 words are read.
 * b  A single precision number to subtract. 32 words are read.
 * m  Mask value to apply: -1 to subtract b, 0 to subtract nothing.
 *
 * returns 0 when the subtraction did not borrow and all ones (-1) when it
 * did.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_2048_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
#else
static sp_digit sp_2048_cond_sub_32(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r8 = constant zero, r4 = saved borrow (0 or -1), r5 = byte offset */
        "MOV	r8, #0x0\n\t"
        "MOV	r4, #0x0\n\t"
        "MOV	r5, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_cond_sub_32_words:\n\t"
#else
    "L_sp_2048_cond_sub_32_words_%=:\n\t"
#endif
        /* Rebuild the carry flag from the saved borrow: 0 - r4 leaves carry
         * set when r4 is 0 and clear when r4 is -1. This is needed because
         * the CMP at the bottom of the loop overwrites the flags. */
        "SUBS	r4, r8, r4\n\t"
        "LDR	r6, [%[a], r5]\n\t"
        "LDR	r7, [%[b], r5]\n\t"
        "AND	r7, r7, %[m]\n\t"
        "SBCS	r6, r6, r7\n\t"
        /* Save the borrow out of this word as 0 or -1. */
        "SBC	r4, r8, r8\n\t"
        "STR	r6, [%[r], r5]\n\t"
        "ADD	r5, r5, #0x4\n\t"
        /* 0x80 bytes = 32 words of 4 bytes */
        "CMP	r5, #0x80\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_2048_cond_sub_32_words_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_2048_cond_sub_32_words\n\t"
#else
        "BLT.N	L_sp_2048_cond_sub_32_words_%=\n\t"
#endif
        /* Hand the final borrow back through the r operand. */
        "MOV	%[r], r4\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        : "memory", "r4", "r5", "r6", "r7", "r8", "cc"
    );
    /* r no longer holds a pointer here; it carries the borrow value. */
    return (word32)(size_t)r;
}

#else
/* Conditionally subtract b from a using the mask m. (r = a - (b & m))
 *
 * Unrolled variant, built when WOLFSSL_SP_SMALL is not defined. 32 words are
 * processed as 16 pairs; every word of b is ANDed with m before it is
 * subtracted, so a mask of all ones (-1) subtracts b and a mask of 0
 * subtracts zero, which copies a into r. The borrow is carried from pair to
 * pair in the carry flag (SUBS for the first word, SBCS afterwards; the
 * LDM/STM/AND instructions in between do not update the flags).
 *
 * When WOLFSSL_NO_VAR_ASSIGN_REG is not defined the parameters are re-bound
 * to local variables pinned to r0-r3 for use as asm operands.
 *
 * r  A single precision number representing condition subtract result.
 *    32 words are written.
 * a  A single precision number to subtract from. 32 words are read.
 * b  A single precision number to subtract. 32 words are read.
 * m  Mask value to apply: -1 to subtract b, 0 to subtract nothing.
 *
 * returns 0 when the subtraction did not borrow and all ones (-1) when it
 * did.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_2048_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
#else
static sp_digit sp_2048_cond_sub_32(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r5 = constant zero, used only by the final SBC */
        "MOV	r5, #0x0\n\t"
        /* Words 0-1: SUBS starts the borrow chain */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SUBS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 2-31: fifteen more pairs continuing the borrow chain */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* 0 - 0 - borrow: the r operand becomes 0 or -1 */
        "SBC	%[r], r5, r5\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        /* r4 is declared clobbered although the instructions above never
         * reference it. */
        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
    );
    /* r no longer holds a pointer here; it carries the borrow value. */
    return (word32)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_NO_UMAAL
#ifndef WOLFSSL_SP_SMALL
/* Reduce the number back to 32 words using Montgomery reduction.
 *
 * Unrolled variant without the UMAAL instruction: built when
 * WOLFSSL_SP_NO_UMAAL is defined and WOLFSSL_SP_SMALL is not.
 *
 * The outer loop runs 32 times. Each pass computes mu = a[i] * mp, adds
 * mu * m (32 words) onto a[i..i+31] and folds the resulting carry into
 * a[i+32]. The a pointer itself is advanced by one word per pass, so the
 * offsets in the asm are relative to the current i.
 *
 * Register use inside the asm:
 *   lr      m[0]
 *   r4, r5  current a[i+0] and a[i+1], kept in registers across passes
 *   r6, r7  alternating carry words between columns
 *   r10     mu
 *   r11     loop counter in bytes (i * 4)
 *   r3      carry out of a[i+32], carried into the next pass
 *
 * After the loop a points 32 words past where it started and mp has been
 * overwritten with the final carry. sp_2048_cond_sub_32 then writes the
 * result to a - 32, taking the upper 32 words and subtracting m masked by
 * (0 - mp).
 *
 * a   A single precision number to reduce in place. Words a[0..63] are
 *     accessed; the result is left in a[0..31].
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
#else
SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        "LDR	lr, [%[m]]\n\t"
        /* i = 0 */
        "MOV	r11, #0x0\n\t"
        /* r3 = carry between passes, initially 0 */
        "MOV	r3, #0x0\n\t"
        "LDR	r4, [%[a]]\n\t"
        "LDR	r5, [%[a], #4]\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_mont_reduce_32_word:\n\t"
#else
    "L_sp_2048_mont_reduce_32_word_%=:\n\t"
#endif
        /* mu = a[i] * mp */
        "MUL	r10, %[mp], r4\n\t"
        /* a[i+0] += m[0] * mu */
        /* Only the carry word r7 is used afterwards; the low word left in
         * r4 is overwritten by the MOV below. */
        "MOV	r7, #0x0\n\t"
        "UMLAL	r4, r7, r10, lr\n\t"
        /* a[i+1] += m[1] * mu */
        "LDR	r9, [%[m], #4]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r5, r6, r10, r9\n\t"
        /* The updated a[i+1] moves into r4: it is a[i+0] of the next pass. */
        "MOV	r4, r5\n\t"
        "ADDS	r4, r4, r7\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+2] += m[2] * mu */
        /* The updated a[i+2] stays in r5: it is a[i+1] of the next pass. */
        "LDR	r9, [%[m], #8]\n\t"
        "LDR	r5, [%[a], #8]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r5, r7, r10, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+3] += m[3] * mu */
        "LDR	r9, [%[m], #12]\n\t"
        "LDR	r12, [%[a], #12]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #12]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+4] += m[4] * mu */
        "LDR	r9, [%[m], #16]\n\t"
        "LDR	r12, [%[a], #16]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #16]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+5] += m[5] * mu */
        "LDR	r9, [%[m], #20]\n\t"
        "LDR	r12, [%[a], #20]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #20]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+6] += m[6] * mu */
        "LDR	r9, [%[m], #24]\n\t"
        "LDR	r12, [%[a], #24]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #24]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+7] += m[7] * mu */
        "LDR	r9, [%[m], #28]\n\t"
        "LDR	r12, [%[a], #28]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #28]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+8] += m[8] * mu */
        "LDR	r9, [%[m], #32]\n\t"
        "LDR	r12, [%[a], #32]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #32]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+9] += m[9] * mu */
        "LDR	r9, [%[m], #36]\n\t"
        "LDR	r12, [%[a], #36]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #36]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+10] += m[10] * mu */
        "LDR	r9, [%[m], #40]\n\t"
        "LDR	r12, [%[a], #40]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #40]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+11] += m[11] * mu */
        "LDR	r9, [%[m], #44]\n\t"
        "LDR	r12, [%[a], #44]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #44]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+12] += m[12] * mu */
        "LDR	r9, [%[m], #48]\n\t"
        "LDR	r12, [%[a], #48]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #48]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+13] += m[13] * mu */
        "LDR	r9, [%[m], #52]\n\t"
        "LDR	r12, [%[a], #52]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #52]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+14] += m[14] * mu */
        "LDR	r9, [%[m], #56]\n\t"
        "LDR	r12, [%[a], #56]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #56]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+15] += m[15] * mu */
        "LDR	r9, [%[m], #60]\n\t"
        "LDR	r12, [%[a], #60]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #60]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+16] += m[16] * mu */
        "LDR	r9, [%[m], #64]\n\t"
        "LDR	r12, [%[a], #64]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #64]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+17] += m[17] * mu */
        "LDR	r9, [%[m], #68]\n\t"
        "LDR	r12, [%[a], #68]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #68]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+18] += m[18] * mu */
        "LDR	r9, [%[m], #72]\n\t"
        "LDR	r12, [%[a], #72]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #72]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+19] += m[19] * mu */
        "LDR	r9, [%[m], #76]\n\t"
        "LDR	r12, [%[a], #76]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #76]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+20] += m[20] * mu */
        "LDR	r9, [%[m], #80]\n\t"
        "LDR	r12, [%[a], #80]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #80]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+21] += m[21] * mu */
        "LDR	r9, [%[m], #84]\n\t"
        "LDR	r12, [%[a], #84]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #84]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+22] += m[22] * mu */
        "LDR	r9, [%[m], #88]\n\t"
        "LDR	r12, [%[a], #88]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #88]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+23] += m[23] * mu */
        "LDR	r9, [%[m], #92]\n\t"
        "LDR	r12, [%[a], #92]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #92]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+24] += m[24] * mu */
        "LDR	r9, [%[m], #96]\n\t"
        "LDR	r12, [%[a], #96]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #96]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+25] += m[25] * mu */
        "LDR	r9, [%[m], #100]\n\t"
        "LDR	r12, [%[a], #100]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #100]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+26] += m[26] * mu */
        "LDR	r9, [%[m], #104]\n\t"
        "LDR	r12, [%[a], #104]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #104]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+27] += m[27] * mu */
        "LDR	r9, [%[m], #108]\n\t"
        "LDR	r12, [%[a], #108]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #108]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+28] += m[28] * mu */
        "LDR	r9, [%[m], #112]\n\t"
        "LDR	r12, [%[a], #112]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #112]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+29] += m[29] * mu */
        "LDR	r9, [%[m], #116]\n\t"
        "LDR	r12, [%[a], #116]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #116]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+30] += m[30] * mu */
        "LDR	r9, [%[m], #120]\n\t"
        "LDR	r12, [%[a], #120]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #120]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+31] += m[31] * mu */
        /* Top column: the product is formed with UMULL, the column carry r7
         * is added to its low word, and the carry from the previous pass
         * (r3) is added to its high word. r3 is then rebuilt from the
         * carries out of the high word and out of a[i+32]. */
        "LDR	r9, [%[m], #124]\n\t"
        "LDR	r12, [%[a], #124]\n\t"
        "UMULL	r8, r9, r10, r9\n\t"
        "ADDS	r7, r7, r8\n\t"
        "ADCS	r6, r9, r3\n\t"
        "MOV	r3, #0x0\n\t"
        "ADC	r3, r3, r3\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #124]\n\t"
        "LDR	r12, [%[a], #128]\n\t"
        "ADCS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #128]\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* i += 1 */
        "ADD	r11, r11, #0x4\n\t"
        "ADD	%[a], %[a], #0x4\n\t"
        /* 0x80 bytes = 32 words of 4 bytes */
        "CMP	r11, #0x80\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_2048_mont_reduce_32_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.W	L_sp_2048_mont_reduce_32_word\n\t"
#else
        "BLT.W	L_sp_2048_mont_reduce_32_word_%=\n\t"
#endif
        /* Loop Done */
        /* Write back the two words that were only held in registers. */
        "STR	r4, [%[a]]\n\t"
        "STR	r5, [%[a], #4]\n\t"
        /* Return the final carry through the mp operand. */
        "MOV	%[mp], r3\n\t"
        : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
    );
    /* a was advanced by 32 words inside the asm, so a - 32 is the start of
     * the number and a is its upper half; mp now holds the final carry. */
    sp_2048_cond_sub_32(a - 32, a, m, (sp_digit)0 - mp);
}

#else
/* Reduce the number back to 32 words using Montgomery reduction.
 *
 * Loop variant without the UMAAL instruction: built when both
 * WOLFSSL_SP_NO_UMAAL and WOLFSSL_SP_SMALL are defined.
 *
 * The outer loop runs 32 times. Each pass computes mu = a[i] * mp, adds
 * mu * m (32 words, four per inner-loop iteration) onto a[i..i+31] and
 * folds the resulting carry into a[i+32]. The a pointer itself is advanced
 * by one word per pass, so the offsets in the asm are relative to the
 * current i.
 *
 * Register use inside the asm:
 *   r8   mu
 *   r9   outer loop counter in bytes (i * 4)
 *   r12  inner loop byte offset (j * 4)
 *   r4   carry word between columns
 *   r3   carry (0 or 1) out of a[i+32], carried into the next pass
 *
 * After the loop a points 32 words past where it started and mp has been
 * overwritten with the final carry. sp_2048_cond_sub_32 then writes the
 * result to a - 32, taking the upper 32 words and subtracting m masked by
 * (0 - mp).
 *
 * a   A single precision number to reduce in place. Words a[0..63] are
 *     accessed; the result is left in a[0..31].
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
#else
SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* m[0] is loaded into r11 here but the loops below always re-read
         * m through the r12 offset. */
        "LDR	r11, [%[m]]\n\t"
        /* i = 0 */
        "MOV	r9, #0x0\n\t"
        /* ca = 0 */
        "MOV	r3, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_mont_reduce_32_word:\n\t"
#else
    "L_sp_2048_mont_reduce_32_word_%=:\n\t"
#endif
        /* mu = a[i] * mp */
        "LDR	r10, [%[a]]\n\t"
        "MUL	r8, %[mp], r10\n\t"
        /* j = 0 */
        "MOV	r12, #0x0\n\t"
        "MOV	r4, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_mont_reduce_32_mul:\n\t"
#else
    "L_sp_2048_mont_reduce_32_mul_%=:\n\t"
#endif
        /* a[i+j+0] += m[j+0] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "MOV	r5, #0x0\n\t"
        "UMLAL	r10, r5, r8, r7\n\t"
        "ADDS	r10, r10, r4\n\t"
        "STR	r10, [%[a], r12]\n\t"
        "ADC	r4, r5, #0x0\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+1] += m[j+1] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "MOV	r5, #0x0\n\t"
        "UMLAL	r10, r5, r8, r7\n\t"
        "ADDS	r10, r10, r4\n\t"
        "STR	r10, [%[a], r12]\n\t"
        "ADC	r4, r5, #0x0\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+2] += m[j+2] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "MOV	r5, #0x0\n\t"
        "UMLAL	r10, r5, r8, r7\n\t"
        "ADDS	r10, r10, r4\n\t"
        "STR	r10, [%[a], r12]\n\t"
        "ADC	r4, r5, #0x0\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+3] += m[j+3] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "MOV	r5, #0x0\n\t"
        "UMLAL	r10, r5, r8, r7\n\t"
        "ADDS	r10, r10, r4\n\t"
        "STR	r10, [%[a], r12]\n\t"
        "ADC	r4, r5, #0x0\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* 0x80 bytes = 32 words of 4 bytes */
        "CMP	r12, #0x80\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_2048_mont_reduce_32_mul_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_2048_mont_reduce_32_mul\n\t"
#else
        "BLT.N	L_sp_2048_mont_reduce_32_mul_%=\n\t"
#endif
        /* a[i+32] += column carry (r4) + carry from the previous pass (r3).
         * Two additions can each carry out, so the new r3 is the sum of the
         * two carry bits. At most one of them can be set: if r4 + r3 wraps,
         * r4 becomes 0 and the second addition cannot carry. */
        "LDR	r10, [%[a], #128]\n\t"
        "ADDS	r4, r4, r3\n\t"
        "MOV	r3, #0x0\n\t"
        /* r3 = carry out of r4 + ca */
        "ADC	r3, r3, #0x0\n\t"
        "ADDS	r10, r10, r4\n\t"
        /* r3 += carry out of a[i+32] + r4. Adding #0 (not r3 itself) keeps
         * the first carry from being doubled, matching the carry handling
         * of the unrolled variant of this function. */
        "ADC	r3, r3, #0x0\n\t"
        "STR	r10, [%[a], #128]\n\t"
        /* i += 1 */
        "ADD	r9, r9, #0x4\n\t"
        "ADD	%[a], %[a], #0x4\n\t"
        "CMP	r9, #0x80\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_2048_mont_reduce_32_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_2048_mont_reduce_32_word\n\t"
#else
        "BLT.N	L_sp_2048_mont_reduce_32_word_%=\n\t"
#endif
        /* Loop Done */
        /* Return the final carry through the mp operand. */
        "MOV	%[mp], r3\n\t"
        : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
    );
    /* a was advanced by 32 words inside the asm, so a - 32 is the start of
     * the number and a is its upper half; mp now holds the final carry. */
    sp_2048_cond_sub_32(a - 32, a, m, (sp_digit)0 - mp);
}

#endif /* !WOLFSSL_SP_SMALL */
#else
#ifndef WOLFSSL_SP_SMALL
/* Reduce the number back to 2048 bits using Montgomery reduction.
 *
 * Unrolled variant. Each pass of the outer loop computes mu = a[i] * mp,
 * adds mu * m[0..31] into a[i..i+31] with UMAAL and folds the resulting carry
 * into a[i+32]. The pointer a is advanced by one word per pass, so after the
 * 32 passes it points at the upper half; sp_2048_cond_sub_32() then writes
 * the result to the original location (a - 32) using a mask derived from the
 * final carry.
 *
 * Words a[0..63] of the input are accessed.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
#else
SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0-r2 so the asm operands use fixed registers. */
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* i = 0 */
        /* r4 = loop counter in bytes, r5 = carry passed between passes */
        "MOV	r4, #0x0\n\t"
        "MOV	r5, #0x0\n\t"
        /* Cache a[0..4] in r6-r10; they stay in registers across passes */
        "LDR	r6, [%[a]]\n\t"
        "LDR	r7, [%[a], #4]\n\t"
        "LDR	r8, [%[a], #8]\n\t"
        "LDR	r9, [%[a], #12]\n\t"
        "LDR	r10, [%[a], #16]\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_mont_reduce_32_word:\n\t"
#else
    "L_sp_2048_mont_reduce_32_word_%=:\n\t"
#endif
        /* mu = a[i] * mp */
        "MUL	lr, %[mp], r6\n\t"
        /* a[i+0] += m[0] * mu */
        /* r3 carries the high word from one UMAAL to the next */
        "LDR	r12, [%[m]]\n\t"
        "MOV	r3, #0x0\n\t"
        "UMAAL	r6, r3, lr, r12\n\t"
        /* a[i+1] += m[1] * mu */
        /* The cached words shift down one register (r7 -> r6, ...) because
         * a advances by one word at the end of the pass. */
        "LDR	r12, [%[m], #4]\n\t"
        "MOV	r6, r7\n\t"
        "UMAAL	r6, r3, lr, r12\n\t"
        /* a[i+2] += m[2] * mu */
        "LDR	r12, [%[m], #8]\n\t"
        "MOV	r7, r8\n\t"
        "UMAAL	r7, r3, lr, r12\n\t"
        /* a[i+3] += m[3] * mu */
        "LDR	r12, [%[m], #12]\n\t"
        "MOV	r8, r9\n\t"
        "UMAAL	r8, r3, lr, r12\n\t"
        /* a[i+4] += m[4] * mu */
        "LDR	r12, [%[m], #16]\n\t"
        "MOV	r9, r10\n\t"
        "UMAAL	r9, r3, lr, r12\n\t"
        /* a[i+5] += m[5] * mu */
        /* Result stays in r10 (not stored): it is next pass's cached word */
        "LDR	r12, [%[m], #20]\n\t"
        "LDR	r10, [%[a], #20]\n\t"
        "UMAAL	r10, r3, lr, r12\n\t"
        /* a[i+6] += m[6] * mu */
        /* From here on words go through r11 and are written back to memory */
        "LDR	r12, [%[m], #24]\n\t"
        "LDR	r11, [%[a], #24]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #24]\n\t"
        /* a[i+7] += m[7] * mu */
        "LDR	r12, [%[m], #28]\n\t"
        "LDR	r11, [%[a], #28]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #28]\n\t"
        /* a[i+8] += m[8] * mu */
        "LDR	r12, [%[m], #32]\n\t"
        "LDR	r11, [%[a], #32]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #32]\n\t"
        /* a[i+9] += m[9] * mu */
        "LDR	r12, [%[m], #36]\n\t"
        "LDR	r11, [%[a], #36]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #36]\n\t"
        /* a[i+10] += m[10] * mu */
        "LDR	r12, [%[m], #40]\n\t"
        "LDR	r11, [%[a], #40]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #40]\n\t"
        /* a[i+11] += m[11] * mu */
        "LDR	r12, [%[m], #44]\n\t"
        "LDR	r11, [%[a], #44]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #44]\n\t"
        /* a[i+12] += m[12] * mu */
        "LDR	r12, [%[m], #48]\n\t"
        "LDR	r11, [%[a], #48]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #48]\n\t"
        /* a[i+13] += m[13] * mu */
        "LDR	r12, [%[m], #52]\n\t"
        "LDR	r11, [%[a], #52]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #52]\n\t"
        /* a[i+14] += m[14] * mu */
        "LDR	r12, [%[m], #56]\n\t"
        "LDR	r11, [%[a], #56]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #56]\n\t"
        /* a[i+15] += m[15] * mu */
        "LDR	r12, [%[m], #60]\n\t"
        "LDR	r11, [%[a], #60]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #60]\n\t"
        /* a[i+16] += m[16] * mu */
        "LDR	r12, [%[m], #64]\n\t"
        "LDR	r11, [%[a], #64]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #64]\n\t"
        /* a[i+17] += m[17] * mu */
        "LDR	r12, [%[m], #68]\n\t"
        "LDR	r11, [%[a], #68]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #68]\n\t"
        /* a[i+18] += m[18] * mu */
        "LDR	r12, [%[m], #72]\n\t"
        "LDR	r11, [%[a], #72]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #72]\n\t"
        /* a[i+19] += m[19] * mu */
        "LDR	r12, [%[m], #76]\n\t"
        "LDR	r11, [%[a], #76]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #76]\n\t"
        /* a[i+20] += m[20] * mu */
        "LDR	r12, [%[m], #80]\n\t"
        "LDR	r11, [%[a], #80]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #80]\n\t"
        /* a[i+21] += m[21] * mu */
        "LDR	r12, [%[m], #84]\n\t"
        "LDR	r11, [%[a], #84]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #84]\n\t"
        /* a[i+22] += m[22] * mu */
        "LDR	r12, [%[m], #88]\n\t"
        "LDR	r11, [%[a], #88]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #88]\n\t"
        /* a[i+23] += m[23] * mu */
        "LDR	r12, [%[m], #92]\n\t"
        "LDR	r11, [%[a], #92]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #92]\n\t"
        /* a[i+24] += m[24] * mu */
        "LDR	r12, [%[m], #96]\n\t"
        "LDR	r11, [%[a], #96]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #96]\n\t"
        /* a[i+25] += m[25] * mu */
        "LDR	r12, [%[m], #100]\n\t"
        "LDR	r11, [%[a], #100]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #100]\n\t"
        /* a[i+26] += m[26] * mu */
        "LDR	r12, [%[m], #104]\n\t"
        "LDR	r11, [%[a], #104]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #104]\n\t"
        /* a[i+27] += m[27] * mu */
        "LDR	r12, [%[m], #108]\n\t"
        "LDR	r11, [%[a], #108]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #108]\n\t"
        /* a[i+28] += m[28] * mu */
        "LDR	r12, [%[m], #112]\n\t"
        "LDR	r11, [%[a], #112]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #112]\n\t"
        /* a[i+29] += m[29] * mu */
        "LDR	r12, [%[m], #116]\n\t"
        "LDR	r11, [%[a], #116]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #116]\n\t"
        /* a[i+30] += m[30] * mu */
        "LDR	r12, [%[m], #120]\n\t"
        "LDR	r11, [%[a], #120]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #120]\n\t"
        /* a[i+31] += m[31] * mu */
        "LDR	r12, [%[m], #124]\n\t"
        "LDR	r11, [%[a], #124]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        /* a[i+32] += carry: with both multiplicands zero the UMAAL is a
         * plain add, lr:r3 = r3 + a[i+32]. mu (lr) is no longer needed. */
        "LDR	lr, [%[a], #128]\n\t"
        "MOV	r12, #0x0\n\t"
        "UMAAL	r3, lr, r12, r12\n\t"
        "STR	r11, [%[a], #124]\n\t"
        /* Add the carry left by the previous pass; the new carry goes to r5 */
        "ADDS	r3, r3, r5\n\t"
        "ADC	r5, lr, #0x0\n\t"
        "STR	r3, [%[a], #128]\n\t"
        /* i += 1 */
        "ADD	r4, r4, #0x4\n\t"
        "ADD	%[a], %[a], #0x4\n\t"
        "CMP	r4, #0x80\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_2048_mont_reduce_32_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.W	L_sp_2048_mont_reduce_32_word\n\t"
#else
        "BLT.W	L_sp_2048_mont_reduce_32_word_%=\n\t"
#endif
        /* Loop Done */
        /* Flush the cached words; a now points 32 words past its start */
        "STR	r6, [%[a]]\n\t"
        "STR	r7, [%[a], #4]\n\t"
        "STR	r8, [%[a], #8]\n\t"
        "STR	r9, [%[a], #12]\n\t"
        "STR	r10, [%[a], #16]\n\t"
        /* Hand the final carry back to C through the mp operand */
        "MOV	%[mp], r5\n\t"
        : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
    );
    /* mp now holds the carry, so the mask is (sp_digit)0 - carry; a was
     * advanced by 32 words inside the asm, hence the a - 32 destination. */
    sp_2048_cond_sub_32(a - 32, a, m, (sp_digit)0 - mp);
}

#else
/* Reduce the number back to 2048 bits using Montgomery reduction.
 *
 * Looped variant (the WOLFSSL_SP_SMALL build). The outer loop runs 32 times,
 * once per word of a; the inner loop handles four words of m per iteration
 * and runs until the byte offset j reaches 0x80. The pointer a is advanced by
 * one word per outer pass, so sp_2048_cond_sub_32() writes the result to the
 * original location (a - 32) using a mask derived from the final carry.
 *
 * Words a[0..63] of the input are accessed.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
#else
SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0-r2 so the asm operands use fixed registers. */
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* NOTE(review): r11 is loaded with m[0] here but is not read again
         * anywhere in this asm statement. */
        "LDR	r11, [%[m]]\n\t"
        /* i = 0 */
        "MOV	r9, #0x0\n\t"
        /* ca = 0 */
        "MOV	r3, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_mont_reduce_32_word:\n\t"
#else
    "L_sp_2048_mont_reduce_32_word_%=:\n\t"
#endif
        /* mu = a[i] * mp */
        "LDR	r10, [%[a]]\n\t"
        "MUL	r8, %[mp], r10\n\t"
        /* j = 0 */
        /* r12 = byte offset j, r4 = carry word chained through the UMAALs */
        "MOV	r12, #0x0\n\t"
        "MOV	r4, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_mont_reduce_32_mul:\n\t"
#else
    "L_sp_2048_mont_reduce_32_mul_%=:\n\t"
#endif
        /* a[i+j+0] += m[j+0] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "UMAAL	r10, r4, r8, r7\n\t"
        "STR	r10, [%[a], r12]\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+1] += m[j+1] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "UMAAL	r10, r4, r8, r7\n\t"
        "STR	r10, [%[a], r12]\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+2] += m[j+2] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "UMAAL	r10, r4, r8, r7\n\t"
        "STR	r10, [%[a], r12]\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+3] += m[j+3] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "UMAAL	r10, r4, r8, r7\n\t"
        "STR	r10, [%[a], r12]\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        "CMP	r12, #0x80\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_2048_mont_reduce_32_mul_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_2048_mont_reduce_32_mul\n\t"
#else
        "BLT.N	L_sp_2048_mont_reduce_32_mul_%=\n\t"
#endif
        /* a[i+32] += r4 + ca, in two steps: first r4 += ca (carry captured
         * in r3), then a[i+32] += r4. */
        "LDR	r10, [%[a], #128]\n\t"
        "ADDS	r4, r4, r3\n\t"
        "MOV	r3, #0x0\n\t"
        "ADC	r3, r3, #0x0\n\t"
        "ADDS	r10, r10, r4\n\t"
        /* ca = 2 * ca + carry of the add into a[i+32].
         * NOTE(review): this equals a plain carry only when the ADDS of
         * r4 + ca above did not itself carry - confirm that case cannot
         * occur. */
        "ADC	r3, r3, r3\n\t"
        "STR	r10, [%[a], #128]\n\t"
        /* i += 1 */
        "ADD	r9, r9, #0x4\n\t"
        "ADD	%[a], %[a], #0x4\n\t"
        "CMP	r9, #0x80\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_2048_mont_reduce_32_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_2048_mont_reduce_32_word\n\t"
#else
        "BLT.N	L_sp_2048_mont_reduce_32_word_%=\n\t"
#endif
        /* Loop Done */
        /* Hand the final carry back to C through the mp operand */
        "MOV	%[mp], r3\n\t"
        : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
    );
    /* mp now holds the carry, so the mask is (sp_digit)0 - carry; a was
     * advanced by 32 words inside the asm, hence the a - 32 destination. */
    sp_2048_cond_sub_32(a - 32, a, m, (sp_digit)0 - mp);
}

#endif /* !WOLFSSL_SP_SMALL */
#endif
/* Montgomery multiplication of two 32-word numbers. (r = a * b mod m)
 *
 * The product is formed in r by sp_2048_mul_32() and then reduced in place
 * by sp_2048_mont_reduce_32(), so r doubles as the working buffer for the
 * reduction.
 *
 * r   Receives the product and, after reduction, the result.
 * a   First operand, in Montgomery form.
 * b   Second operand, in Montgomery form.
 * m   Modulus used for the reduction.
 * mp  Montgomery multiplier digit passed through to the reduction.
 */
SP_NOINLINE static void sp_2048_mont_mul_32(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit* m, sp_digit mp)
{
    /* Step 1: plain multiplication into r. */
    sp_2048_mul_32(r, a, b);

    /* Step 2: Montgomery-reduce r in place. */
    sp_2048_mont_reduce_32(r, m, mp);
}

/* Montgomery squaring of a 32-word number. (r = a * a mod m)
 *
 * The square is formed in r by sp_2048_sqr_32() and then reduced in place by
 * sp_2048_mont_reduce_32(), so r doubles as the working buffer for the
 * reduction.
 *
 * r   Receives the square and, after reduction, the result.
 * a   Operand to square, in Montgomery form.
 * m   Modulus used for the reduction.
 * mp  Montgomery multiplier digit passed through to the reduction.
 */
SP_NOINLINE static void sp_2048_mont_sqr_32(sp_digit* r, const sp_digit* a,
        const sp_digit* m, sp_digit mp)
{
    /* Step 1: plain squaring into r. */
    sp_2048_sqr_32(r, a);

    /* Step 2: Montgomery-reduce r in place. */
    sp_2048_mont_reduce_32(r, m, mp);
}

#ifdef WOLFSSL_SP_SMALL
/* Mul a by digit b into r. (r = a * b)
 *
 * Looped variant (the WOLFSSL_SP_SMALL build). Word 0 is handled before the
 * loop; the loop then covers byte offsets 4..0x7c (words 1..31). 33 words are
 * written: r[0..32].
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision digit.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_2048_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
#else
static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a, sp_digit b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0-r2 so the asm operands use fixed registers. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* A[0] * B */
        /* Low word goes straight to r[0]; the high word starts the
         * r3:r4:r5 accumulator. r9 is the byte offset of the next word. */
        "LDR	r8, [%[a]]\n\t"
        "UMULL	r5, r3, %[b], r8\n\t"
        "MOV	r4, #0x0\n\t"
        "STR	r5, [%[r]]\n\t"
        "MOV	r5, #0x0\n\t"
        "MOV	r9, #0x4\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_mul_d_32_word:\n\t"
#else
    "L_sp_2048_mul_d_32_word_%=:\n\t"
#endif
        /* A[i] * B */
        "LDR	r8, [%[a], r9]\n\t"
        "UMULL	r6, r7, %[b], r8\n\t"
        /* Add the 64-bit product into the three-word accumulator */
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        "STR	r3, [%[r], r9]\n\t"
        /* Shift the accumulator down one word for the next iteration */
        "MOV	r3, r4\n\t"
        "MOV	r4, r5\n\t"
        "MOV	r5, #0x0\n\t"
        "ADD	r9, r9, #0x4\n\t"
        "CMP	r9, #0x80\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_2048_mul_d_32_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_2048_mul_d_32_word\n\t"
#else
        "BLT.N	L_sp_2048_mul_d_32_word_%=\n\t"
#endif
        /* Remaining high word becomes r[32] */
        "STR	r3, [%[r], #128]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
    );
}

#else
/* Mul a by digit b into r. (r = a * b)
 *
 * Fully unrolled variant. Each step loads one word of a with LDM and stores
 * one word of r with STM, both with writeback, so the asm advances its own
 * copies of the a and r pointers. r3, r4 and r5 rotate through the roles of
 * "word to store", "carry into the next word" and "freshly zeroed high word".
 * 33 words are written: r[0..32].
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision digit.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_2048_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
#else
static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a, sp_digit b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0-r2 so the asm operands use fixed registers. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* A[0] * B */
        /* r4:r3 = product; r3 is stored, r4 carries into the next word and
         * r5 is cleared to receive that word's high half. */
        "LDM	%[a]!, {r8}\n\t"
        "UMULL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[1] * B */
        /* UMLAL adds the product onto the carry word: r5:r4 += b * a[1] */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[2] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[3] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[4] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[5] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[6] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[7] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[8] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[9] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[10] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[11] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[12] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[13] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[14] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[15] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[16] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[17] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[18] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[19] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[20] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[21] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[22] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[23] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[24] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[25] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[26] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[27] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[28] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[29] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[30] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[31] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        /* Final high word: r now points at word 32 after the writebacks */
        "STR	r5, [%[r]]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "cc"
    );
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_USE_UDIV
/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
 *
 * Variant that uses the UDIV instruction (WOLFSSL_SP_USE_UDIV). The quotient
 * is accumulated in r6 from several partial estimates, each obtained by a
 * 32-bit UDIV and followed by subtracting estimate * div from the running
 * remainder held in d1:d0.
 *
 * d1   The high order half of the number to divide.
 * d0   The low order half of the number to divide.
 * div  The divisor.
 * returns the result of the division.
 *
 * Note that this is an approximate div. It may give an answer 1 larger.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
#else
SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0-r2 so the asm operands use fixed registers. */
    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r8 = div >> 16, r5 = r8 + 1 (the divisor used by the estimating
         * UDIVs), r7 = div << 16. r8:r7 together are (div << 16) as a
         * 64-bit value. */
        "LSR	r8, %[div], #16\n\t"
        "ADD	r5, r8, #0x1\n\t"
        /* First estimate: r6 = (d1 / r5) << 16 */
        "UDIV	r6, %[d1], r5\n\t"
        "LSL	r7, %[div], #16\n\t"
        "LSL	r6, r6, #16\n\t"
        /* d1:d0 -= div * r6 */
        "UMULL	r3, r4, %[div], r6\n\t"
        "SUBS	%[d0], %[d0], r3\n\t"
        "SBC	%[d1], %[d1], r4\n\t"
        /* Build a flag without branching: after the compare, SBC r9,r9,r9
         * gives 0 or -1 from the borrow alone (r9's prior value cancels),
         * so r9 = 1 when d1 >= r5 and 0 otherwise; r10 = -r9 is the
         * matching all-ones/zero mask. */
        "SUBS	r3, %[d1], r5\n\t"
        "SBC	r9, r9, r9\n\t"
        "ADD	r9, r9, #0x1\n\t"
        "RSB	r10, r9, #0x0\n\t"
        "LSL	r9, r9, #16\n\t"
        /* If flagged: d1:d0 -= (div << 16) and r6 += 0x10000 */
        "AND	r7, r7, r10\n\t"
        "AND	r8, r8, r10\n\t"
        "SUBS	%[d0], %[d0], r7\n\t"
        "ADD	r6, r6, r9\n\t"
        "SBC	%[d1], %[d1], r8\n\t"
        /* Next estimate from bits 16..47 of d1:d0, then subtract its
         * multiple of div from d1:d0 */
        "LSL	r4, %[d1], #16\n\t"
        "LSR	r3, %[d0], #16\n\t"
        "ORR	r3, r3, r4\n\t"
        "UDIV	r3, r3, r5\n\t"
        "ADD	r6, r6, r3\n\t"
        "UMULL	r3, r4, %[div], r3\n\t"
        "SUBS	%[d0], %[d0], r3\n\t"
        "SBC	%[d1], %[d1], r4\n\t"
        /* Same again; only the low word of the remainder is updated here */
        "LSL	r4, %[d1], #16\n\t"
        "LSR	r3, %[d0], #16\n\t"
        "ORR	r3, r3, r4\n\t"
        "UDIV	r3, r3, r5\n\t"
        "ADD	r6, r6, r3\n\t"
        "MUL	r3, %[div], r3\n\t"
        "SUB	%[d0], %[d0], r3\n\t"
        /* Final step divides the low remainder word by the full divisor;
         * the total quotient is returned through the d1 operand */
        "UDIV	r3, %[d0], %[div]\n\t"
        "ADD	%[d1], r6, r3\n\t"
        : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    return (word32)(size_t)d1;
}

#else
/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
 *
 * Variant without the UDIV instruction. A first quotient estimate is built
 * one bit at a time by comparing against r5 = (div >> 1) + 1 (one step before
 * the loop plus 30 loop iterations), using SBC-generated masks instead of
 * branches on the data. Three rounds of "multiply back, subtract, add the
 * high word of the difference" then adjust the estimate, followed by a final
 * conditional increment.
 *
 * d1   The high order half of the number to divide.
 * d0   The low order half of the number to divide.
 * div  The divisor.
 * returns the result of the division.
 *
 * Note that this is an approximate div. It may give an answer 1 larger.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
#else
SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0-r2 so the asm operands use fixed registers. */
    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r5 = (div >> 1) + 1; r7:r6 = working copy of d1:d0 */
        "LSR	r5, %[div], #1\n\t"
        "ADD	r5, r5, #0x1\n\t"
        "MOV	r6, %[d0]\n\t"
        "MOV	r7, %[d1]\n\t"
        /* Do top 32 */
        /* r8 = -1 if r7 > r5 else 0; that decides the first quotient bit
         * in r3 and whether r5 is subtracted from r7 */
        "SUBS	r8, r5, r7\n\t"
        "SBC	r8, r8, r8\n\t"
        "MOV	r3, #0x0\n\t"
        "SUB	r3, r3, r8\n\t"
        "AND	r8, r8, r5\n\t"
        "SUBS	r7, r7, r8\n\t"
        /* Next 30 bits */
        /* r4 counts 29 down to 0; the loop exits when it goes negative */
        "MOV	r4, #0x1d\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_div_2048_word_32_bit:\n\t"
#else
    "L_div_2048_word_32_bit_%=:\n\t"
#endif
        /* Shift r7:r6 left by one bit, then repeat the masked
         * compare/subtract and shift the new bit into r3 */
        "LSLS	r6, r6, #1\n\t"
        "ADC	r7, r7, r7\n\t"
        "SUBS	r8, r5, r7\n\t"
        "SBC	r8, r8, r8\n\t"
        "ADD	r3, r3, r3\n\t"
        "SUB	r3, r3, r8\n\t"
        "AND	r8, r8, r5\n\t"
        "SUBS	r7, r7, r8\n\t"
        "SUBS	r4, r4, #0x1\n\t"
#if defined(__GNUC__)
        "BPL	L_div_2048_word_32_bit_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BPL.N	L_div_2048_word_32_bit\n\t"
#else
        "BPL.N	L_div_2048_word_32_bit_%=\n\t"
#endif
        /* r3 = 2 * r3 + 1 */
        "ADD	r3, r3, r3\n\t"
        "ADD	r3, r3, #0x1\n\t"
        /* Correction round 1: r10:r9 = d1:d0 - r3 * div, then add the
         * high word of that difference to the estimate */
        "UMULL	r6, r7, r3, %[div]\n\t"
        "SUBS	r9, %[d0], r6\n\t"
        "SBC	r10, %[d1], r7\n\t"
        "ADD	r3, r3, r10\n\t"
        /* Correction round 2 */
        "UMULL	r6, r7, r3, %[div]\n\t"
        "SUBS	r9, %[d0], r6\n\t"
        "SBC	r10, %[d1], r7\n\t"
        "ADD	r3, r3, r10\n\t"
        /* Correction round 3 */
        "UMULL	r6, r7, r3, %[div]\n\t"
        "SUBS	r9, %[d0], r6\n\t"
        "SBC	r10, %[d1], r7\n\t"
        "ADD	r3, r3, r10\n\t"
        /* r8 = -1 if the low remainder word r9 is greater than div, else 0;
         * subtracting it adds one to the quotient in that case. The result
         * is returned through the d1 operand. */
        "SUBS	r8, %[div], r9\n\t"
        "SBC	r8, r8, r8\n\t"
        "SUB	%[d1], r3, r8\n\t"
        : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    return (word32)(size_t)d1;
}

#endif
/* Compare a with b in constant time.
 *
 * All 32 words are visited, from word 31 down to word 0, and the outcome is
 * selected with conditional moves inside IT blocks rather than data-dependent
 * branches. Register roles:
 *   r2  running result (starts as 0xffffffff)
 *   r3  mask ANDed onto both words before they are compared; starts as
 *       0xffffffff and is cleared to 0 at the first differing word, so every
 *       later word compares equal and leaves r2 untouched
 *   r8  constant 1, r7 constant 0
 * The final EOR of r2 with r3 turns the "never differed" state
 * (r2 = r3 = 0xffffffff) into 0 and leaves 1 or 0xffffffff unchanged.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_int32 sp_2048_cmp_32(const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_int32 sp_2048_cmp_32(const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0-r1 so the asm operands use fixed registers. */
    register const sp_digit* a __asm__ ("r0") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        "MOV	r2, #0xffffffff\n\t"
        "MOV	r8, #0x1\n\t"
        "MOV	r7, #0x0\n\t"
        "MOV	r3, #0xffffffff\n\t"
#ifdef WOLFSSL_SP_SMALL
        /* r6 = byte offset of the word being compared, starting at word 31 */
        "MOV	r6, #0x7c\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_cmp_32_words:\n\t"
#else
    "L_sp_2048_cmp_32_words_%=:\n\t"
#endif
        "LDR	r4, [%[a], r6]\n\t"
        "LDR	r5, [%[b], r6]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        /* a word > b word: result = 1 */
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        /* a word < b word: result = mask (still 0xffffffff at this point) */
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        /* words differ: clear the mask so lower words no longer matter */
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        /* Step to the next lower word; carry stays set until r6 drops
         * below zero */
        "SUBS	r6, r6, #0x4\n\t"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "bcs	L_sp_2048_cmp_32_words\n\t"
#else
        "bcs	L_sp_2048_cmp_32_words_%=\n\t"
#endif
        "EOR	r2, r2, r3\n\t"
#else
        /* Unrolled: the same 11-instruction step for each word, from byte
         * offset 124 (word 31) down to 0 (word 0) */
        "LDR	r4, [%[a], #124]\n\t"
        "LDR	r5, [%[b], #124]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        /* a word > b word: result = 1 */
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        /* a word < b word: result = mask (still 0xffffffff at this point) */
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        /* words differ: clear the mask so lower words no longer matter */
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #120]\n\t"
        "LDR	r5, [%[b], #120]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #116]\n\t"
        "LDR	r5, [%[b], #116]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #112]\n\t"
        "LDR	r5, [%[b], #112]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #108]\n\t"
        "LDR	r5, [%[b], #108]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #104]\n\t"
        "LDR	r5, [%[b], #104]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #100]\n\t"
        "LDR	r5, [%[b], #100]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #96]\n\t"
        "LDR	r5, [%[b], #96]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #92]\n\t"
        "LDR	r5, [%[b], #92]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #88]\n\t"
        "LDR	r5, [%[b], #88]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #84]\n\t"
        "LDR	r5, [%[b], #84]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #80]\n\t"
        "LDR	r5, [%[b], #80]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #76]\n\t"
        "LDR	r5, [%[b], #76]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #72]\n\t"
        "LDR	r5, [%[b], #72]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #68]\n\t"
        "LDR	r5, [%[b], #68]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #64]\n\t"
        "LDR	r5, [%[b], #64]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #60]\n\t"
        "LDR	r5, [%[b], #60]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #56]\n\t"
        "LDR	r5, [%[b], #56]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #52]\n\t"
        "LDR	r5, [%[b], #52]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #48]\n\t"
        "LDR	r5, [%[b], #48]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #44]\n\t"
        "LDR	r5, [%[b], #44]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #40]\n\t"
        "LDR	r5, [%[b], #40]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #36]\n\t"
        "LDR	r5, [%[b], #36]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #32]\n\t"
        "LDR	r5, [%[b], #32]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #28]\n\t"
        "LDR	r5, [%[b], #28]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #24]\n\t"
        "LDR	r5, [%[b], #24]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #20]\n\t"
        "LDR	r5, [%[b], #20]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #16]\n\t"
        "LDR	r5, [%[b], #16]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #12]\n\t"
        "LDR	r5, [%[b], #12]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #8]\n\t"
        "LDR	r5, [%[b], #8]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #4]\n\t"
        "LDR	r5, [%[b], #4]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        /* Word 0 (least significant) */
        "LDR	r4, [%[a]]\n\t"
        "LDR	r5, [%[b]]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "EOR	r2, r2, r3\n\t"
#endif /*WOLFSSL_SP_SMALL */
        /* The result leaves the asm through the a operand */
        "MOV	%[a], r2\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "cc"
    );
    /* a no longer holds a pointer here: it carries the comparison result */
    return (word32)(size_t)a;
}

/* Long division of a 64-digit number by a 32-digit number, keeping only the
 * remainder (a = q*d + r). The quotient is never stored.
 *
 * a  Dividend; 64 digits are read.
 * d  Divisor; 32 digits, d[31] is used as the leading digit for estimates.
 * m  Quotient output - unused, accepted only to keep the signature.
 * r  Receives the 32-digit remainder.
 * returns MP_OKAY indicating success.
 */
static WC_INLINE int sp_2048_div_32(const sp_digit* a, const sp_digit* d,
        sp_digit* m, sp_digit* r)
{
    sp_digit num[64];
    sp_digit prod[33];
    sp_digit top;
    sp_digit q;
    int i;

    (void)m;

    top = d[31];
    /* Work on a private copy so the dividend itself is left untouched. */
    XMEMCPY(num, a, sizeof(*num) * 2 * 32);

    /* Bring the upper half below d before producing quotient digits. */
    q = sp_2048_cmp_32(&num[32], d) >= 0;
    sp_2048_cond_sub_32(&num[32], &num[32], d, (sp_digit)0 - q);

    for (i = 31; i >= 0; i--) {
        /* win[0..31] is the current window, win[32] its overflow digit. */
        sp_digit* win = &num[i];
        /* All ones when the leading digit equals the divisor's top digit;
         * the estimate is then forced to the maximum digit value. */
        volatile sp_digit mask = (sp_digit)0 - (win[32] == top);
        sp_digit hi = win[32] + mask;

        q = div_2048_word_32(hi, win[31], top);
        q |= mask;

        /* Remove q * d from the window. */
        sp_2048_mul_d_32(prod, d, q);
        win[32] += sp_2048_sub_in_place_32(win, prod);
        win[32] -= prod[32];

        /* Two masked add-backs of d, each keyed on the overflow digit, to
         * correct an over-estimated q without branching. */
        sp_2048_mask_32(prod, d, win[32]);
        win[32] += sp_2048_add_32(win, win, prod);
        sp_2048_mask_32(prod, d, win[32]);
        win[32] += sp_2048_add_32(win, win, prod);
    }

    /* One last conditional subtract to leave the remainder below d. */
    q = sp_2048_cmp_32(num, d) >= 0;
    sp_2048_cond_sub_32(r, num, d, (sp_digit)0 - q);

    return MP_OKAY;
}

/* Compute r = a mod m by running the division and discarding the quotient.
 *
 * r  Receives the reduced value.
 * a  Value to reduce.
 * m  Modulus.
 * returns MP_OKAY indicating success.
 */
static WC_INLINE int sp_2048_mod_32(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    /* No quotient buffer is supplied; only the remainder is wanted. */
    int ret = sp_2048_div_32(a, m, NULL, r);

    return ret;
}

#ifdef WOLFSSL_SP_SMALL
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Fixed 4-bit window exponentiation in Montgomery form using a table of the
 * 16 powers a^0..a^15.
 *
 * r        Result. 64 digits are written while working (the upper 32 are
 *          cleared before the final Montgomery reduction).
 * a        Base. 64 digits are read when reduceA is non-zero, 32 otherwise.
 * e        Exponent as an array of 32-bit digits, least significant first.
 * bits     The number of bits in the exponent.
 * m        Modulus (32 digits).
 * reduceA  Non-zero to reduce a modulo m before converting it to
 *          Montgomery form.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e,
        int bits, const sp_digit* m, int reduceA)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[16 * 64];
#endif
    sp_digit* t[16];
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (16 * 64), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Table entry 0 doubles as the Montgomery normalizer. */
        norm = td;
        for (i=0; i<16; i++) {
            t[i] = td + i * 64;
        }

        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_32(norm, m);

        /* t[1] = (a << 1024) mod m: place a in the upper 32 digits above a
         * zeroed lower half and reduce the 64-digit value. */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 32U);
        if (reduceA != 0) {
            err = sp_2048_mod_32(t[1] + 32, a, m);
            if (err == MP_OKAY) {
                err = sp_2048_mod_32(t[1], t[1], m);
            }
        }
        else {
            XMEMCPY(t[1] + 32, a, sizeof(sp_digit) * 32);
            err = sp_2048_mod_32(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Fill in the remaining powers: even entries by squaring, odd
         * entries by multiplying two adjacent lower entries. */
        sp_2048_mont_sqr_32(t[ 2], t[ 1], m, mp);
        sp_2048_mont_mul_32(t[ 3], t[ 2], t[ 1], m, mp);
        sp_2048_mont_sqr_32(t[ 4], t[ 2], m, mp);
        sp_2048_mont_mul_32(t[ 5], t[ 3], t[ 2], m, mp);
        sp_2048_mont_sqr_32(t[ 6], t[ 3], m, mp);
        sp_2048_mont_mul_32(t[ 7], t[ 4], t[ 3], m, mp);
        sp_2048_mont_sqr_32(t[ 8], t[ 4], m, mp);
        sp_2048_mont_mul_32(t[ 9], t[ 5], t[ 4], m, mp);
        sp_2048_mont_sqr_32(t[10], t[ 5], m, mp);
        sp_2048_mont_mul_32(t[11], t[ 6], t[ 5], m, mp);
        sp_2048_mont_sqr_32(t[12], t[ 6], m, mp);
        sp_2048_mont_mul_32(t[13], t[ 7], t[ 6], m, mp);
        sp_2048_mont_sqr_32(t[14], t[ 7], m, mp);
        sp_2048_mont_mul_32(t[15], t[ 8], t[ 7], m, mp);

        /* Take the first (possibly short) window from the top of the
         * exponent. c counts the exponent bits still held in n. */
        i = (bits - 1) / 32;
        n = e[i--];
        c = bits & 31;
        if (c == 0) {
            c = 32;
        }
        c -= bits % 4;
        if (c == 32) {
            c = 28;
        }
        if (c < 0) {
            /* Number of bits in top word is less than number needed.
             * Shift amounts are relative to the 32-bit digit size. */
            c = -c;
            y = (byte)(n << c);
            n = e[i--];
            y |= (byte)(n >> (32 - c));
            n <<= c;
            c = 32 - c;
        }
        else if (c == 0) {
            /* All bits in top word used. */
            y = (byte)n;
        }
        else {
            y = (byte)(n >> c);
            n <<= 32 - c;
        }
        XMEMCPY(r, t[y], sizeof(sp_digit) * 32);
        for (; i>=0 || c>=4; ) {
            if (c == 0) {
                /* Current digit exhausted: window is the top of the next. */
                n = e[i--];
                y = (byte)(n >> 28);
                n <<= 4;
                c = 28;
            }
            else if (c < 4) {
                /* Window straddles two digits. */
                y = (byte)(n >> 28);
                n = e[i--];
                c = 4 - c;
                y |= (byte)(n >> (32 - c));
                n <<= c;
                c = 32 - c;
            }
            else {
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c -= 4;
            }

            /* r = r^16 * a^y */
            sp_2048_mont_sqr_32(r, r, m, mp);
            sp_2048_mont_sqr_32(r, r, m, mp);
            sp_2048_mont_sqr_32(r, r, m, mp);
            sp_2048_mont_sqr_32(r, r, m, mp);

            sp_2048_mont_mul_32(r, r, t[y], m, mp);
        }

        /* Leave Montgomery form and bring the result below m. */
        XMEMSET(&r[32], 0, sizeof(sp_digit) * 32U);
        sp_2048_mont_reduce_32(r, m, mp);

        mask = (sp_digit)0 - (sp_2048_cmp_32(r, m) >= 0);
        sp_2048_cond_sub_32(r, r, m, mask);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#else
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Fixed 5-bit window exponentiation in Montgomery form using a table of the
 * 32 powers a^0..a^31.
 *
 * r        Result. 64 digits are written while working (the upper 32 are
 *          cleared before the final Montgomery reduction).
 * a        Base. 64 digits are read when reduceA is non-zero, 32 otherwise.
 * e        Exponent as an array of 32-bit digits, least significant first.
 * bits     The number of bits in the exponent.
 * m        Modulus (32 digits).
 * reduceA  Non-zero to reduce a modulo m before converting it to
 *          Montgomery form.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e,
        int bits, const sp_digit* m, int reduceA)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[32 * 64];
#endif
    sp_digit* t[32];
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (32 * 64), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Table entry 0 doubles as the Montgomery normalizer. */
        norm = td;
        for (i=0; i<32; i++) {
            t[i] = td + i * 64;
        }

        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_32(norm, m);

        /* t[1] = (a << 1024) mod m: place a in the upper 32 digits above a
         * zeroed lower half and reduce the 64-digit value. */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 32U);
        if (reduceA != 0) {
            err = sp_2048_mod_32(t[1] + 32, a, m);
            if (err == MP_OKAY) {
                err = sp_2048_mod_32(t[1], t[1], m);
            }
        }
        else {
            XMEMCPY(t[1] + 32, a, sizeof(sp_digit) * 32);
            err = sp_2048_mod_32(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Fill in the remaining powers: even entries by squaring, odd
         * entries by multiplying two adjacent lower entries. */
        sp_2048_mont_sqr_32(t[ 2], t[ 1], m, mp);
        sp_2048_mont_mul_32(t[ 3], t[ 2], t[ 1], m, mp);
        sp_2048_mont_sqr_32(t[ 4], t[ 2], m, mp);
        sp_2048_mont_mul_32(t[ 5], t[ 3], t[ 2], m, mp);
        sp_2048_mont_sqr_32(t[ 6], t[ 3], m, mp);
        sp_2048_mont_mul_32(t[ 7], t[ 4], t[ 3], m, mp);
        sp_2048_mont_sqr_32(t[ 8], t[ 4], m, mp);
        sp_2048_mont_mul_32(t[ 9], t[ 5], t[ 4], m, mp);
        sp_2048_mont_sqr_32(t[10], t[ 5], m, mp);
        sp_2048_mont_mul_32(t[11], t[ 6], t[ 5], m, mp);
        sp_2048_mont_sqr_32(t[12], t[ 6], m, mp);
        sp_2048_mont_mul_32(t[13], t[ 7], t[ 6], m, mp);
        sp_2048_mont_sqr_32(t[14], t[ 7], m, mp);
        sp_2048_mont_mul_32(t[15], t[ 8], t[ 7], m, mp);
        sp_2048_mont_sqr_32(t[16], t[ 8], m, mp);
        sp_2048_mont_mul_32(t[17], t[ 9], t[ 8], m, mp);
        sp_2048_mont_sqr_32(t[18], t[ 9], m, mp);
        sp_2048_mont_mul_32(t[19], t[10], t[ 9], m, mp);
        sp_2048_mont_sqr_32(t[20], t[10], m, mp);
        sp_2048_mont_mul_32(t[21], t[11], t[10], m, mp);
        sp_2048_mont_sqr_32(t[22], t[11], m, mp);
        sp_2048_mont_mul_32(t[23], t[12], t[11], m, mp);
        sp_2048_mont_sqr_32(t[24], t[12], m, mp);
        sp_2048_mont_mul_32(t[25], t[13], t[12], m, mp);
        sp_2048_mont_sqr_32(t[26], t[13], m, mp);
        sp_2048_mont_mul_32(t[27], t[14], t[13], m, mp);
        sp_2048_mont_sqr_32(t[28], t[14], m, mp);
        sp_2048_mont_mul_32(t[29], t[15], t[14], m, mp);
        sp_2048_mont_sqr_32(t[30], t[15], m, mp);
        sp_2048_mont_mul_32(t[31], t[16], t[15], m, mp);

        /* Take the first (possibly short) window from the top of the
         * exponent. c counts the exponent bits still held in n. */
        i = (bits - 1) / 32;
        n = e[i--];
        c = bits & 31;
        if (c == 0) {
            c = 32;
        }
        c -= bits % 5;
        if (c == 32) {
            c = 27;
        }
        if (c < 0) {
            /* Number of bits in top word is less than number needed
             * (e.g. bits = 33). Borrow the missing bits from the next
             * digit; digits are 32 bits wide so all shift amounts and the
             * remaining-bit count are relative to 32. */
            c = -c;
            y = (byte)(n << c);
            n = e[i--];
            y |= (byte)(n >> (32 - c));
            n <<= c;
            c = 32 - c;
        }
        else if (c == 0) {
            /* All bits in top word used. */
            y = (byte)n;
        }
        else {
            y = (byte)(n >> c);
            n <<= 32 - c;
        }
        XMEMCPY(r, t[y], sizeof(sp_digit) * 32);
        for (; i>=0 || c>=5; ) {
            if (c == 0) {
                /* Current digit exhausted: window is the top of the next. */
                n = e[i--];
                y = (byte)(n >> 27);
                n <<= 5;
                c = 27;
            }
            else if (c < 5) {
                /* Window straddles two digits. */
                y = (byte)(n >> 27);
                n = e[i--];
                c = 5 - c;
                y |= (byte)(n >> (32 - c));
                n <<= c;
                c = 32 - c;
            }
            else {
                y = (byte)((n >> 27) & 0x1f);
                n <<= 5;
                c -= 5;
            }

            /* r = r^32 * a^y */
            sp_2048_mont_sqr_32(r, r, m, mp);
            sp_2048_mont_sqr_32(r, r, m, mp);
            sp_2048_mont_sqr_32(r, r, m, mp);
            sp_2048_mont_sqr_32(r, r, m, mp);
            sp_2048_mont_sqr_32(r, r, m, mp);

            sp_2048_mont_mul_32(r, r, t[y], m, mp);
        }

        /* Leave Montgomery form and bring the result below m. */
        XMEMSET(&r[32], 0, sizeof(sp_digit) * 32U);
        sp_2048_mont_reduce_32(r, m, mp);

        mask = (sp_digit)0 - (sp_2048_cmp_32(r, m) >= 0);
        sp_2048_cond_sub_32(r, r, m, mask);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#endif /* WOLFSSL_SP_SMALL */

#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */

#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
/* Compute the Montgomery normalizer r = 2^n mod m, where n is the number of
 * bits being reduced by. With a full 2048-bit m this is just 0 - m taken
 * modulo 2^n, so one in-place subtraction from zero is enough.
 *
 * r  Receives the normalizer (64 digits).
 * m  The modulus (64 digits).
 */
static void sp_2048_mont_norm_64(sp_digit* r, const sp_digit* m)
{
    /* Start from zero so the subtraction below wraps to 2^n - m. */
    XMEMSET(r, 0, 64 * sizeof(*r));
    sp_2048_sub_in_place_64(r, m);
}

#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */
#ifdef WOLFSSL_SP_SMALL
/* Conditionally subtract b from a using the mask m. (r = a - (b & m))
 * m is -1 to subtract and 0 when not subtracting (r is then a copy of a).
 * Every word is loaded, masked, subtracted and stored regardless of m.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 * returns 0 when there is no borrow out of the top word and all ones (-1)
 *         when there is.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_2048_cond_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
#else
static sp_digit sp_2048_cond_sub_64(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r8 = constant zero, r4 = saved borrow (0 or -1), r5 = byte offset */
        "MOV	r8, #0x0\n\t"
        "MOV	r4, #0x0\n\t"
        "MOV	r5, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_cond_sub_64_words:\n\t"
#else
    "L_sp_2048_cond_sub_64_words_%=:\n\t"
#endif
        /* Recreate the carry flag from the saved borrow: 0 - r4 */
        "SUBS	r4, r8, r4\n\t"
        "LDR	r6, [%[a], r5]\n\t"
        "LDR	r7, [%[b], r5]\n\t"
        /* Masked word of b: unchanged when m is -1, zero when m is 0 */
        "AND	r7, r7, %[m]\n\t"
        "SBCS	r6, r6, r7\n\t"
        /* Save the borrow: r4 = 0 when carry is set, -1 when clear */
        "SBC	r4, r8, r8\n\t"
        "STR	r6, [%[r], r5]\n\t"
        "ADD	r5, r5, #0x4\n\t"
        /* 0x100 bytes = 64 words of 4 bytes */
        "CMP	r5, #0x100\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_2048_cond_sub_64_words_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_2048_cond_sub_64_words\n\t"
#else
        "BLT.N	L_sp_2048_cond_sub_64_words_%=\n\t"
#endif
        /* Hand the final borrow back through the register bound to r */
        "MOV	%[r], r4\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        : "memory", "r4", "r5", "r6", "r7", "r8", "cc"
    );
    return (word32)(size_t)r;
}

#else
/* Conditionally subtract b from a using the mask m. (r = a - (b & m))
 * m is -1 to subtract and 0 when not subtracting (r is then a copy of a).
 * Fully unrolled: 32 groups, each handling two words. Every word is loaded,
 * masked, subtracted and stored regardless of m.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 * returns 0 when there is no borrow out of the top word and all ones (-1)
 *         when there is.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_2048_cond_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
#else
static sp_digit sp_2048_cond_sub_64(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r5 = constant zero, used at the end to turn the carry into a mask */
        "MOV	r5, #0x0\n\t"
        /* Words 0-1: SUBS starts the borrow chain. The a, b and r pointers
         * are advanced by the LDM/STM write-back. */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SUBS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Remaining word pairs: SBCS continues the borrow chain */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Final borrow as a mask: 0 - 0 - borrow gives 0 or -1 */
        "SBC	%[r], r5, r5\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
    );
    return (word32)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_NO_UMAAL
#ifndef WOLFSSL_SP_SMALL
/* Reduce the number back to 2048 bits using Montgomery reduction.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
#else
SP_NOINLINE static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        "LDR	lr, [%[m]]\n\t"
        /* i = 0 */
        "MOV	r11, #0x0\n\t"
        "MOV	r3, #0x0\n\t"
        "LDR	r4, [%[a]]\n\t"
        "LDR	r5, [%[a], #4]\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_mont_reduce_64_word:\n\t"
#else
    "L_sp_2048_mont_reduce_64_word_%=:\n\t"
#endif
        /* mu = a[i] * mp */
        "MUL	r10, %[mp], r4\n\t"
        /* a[i+0] += m[0] * mu */
        "MOV	r7, #0x0\n\t"
        "UMLAL	r4, r7, r10, lr\n\t"
        /* a[i+1] += m[1] * mu */
        "LDR	r9, [%[m], #4]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r5, r6, r10, r9\n\t"
        "MOV	r4, r5\n\t"
        "ADDS	r4, r4, r7\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+2] += m[2] * mu */
        "LDR	r9, [%[m], #8]\n\t"
        "LDR	r5, [%[a], #8]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r5, r7, r10, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+3] += m[3] * mu */
        "LDR	r9, [%[m], #12]\n\t"
        "LDR	r12, [%[a], #12]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #12]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+4] += m[4] * mu */
        "LDR	r9, [%[m], #16]\n\t"
        "LDR	r12, [%[a], #16]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #16]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+5] += m[5] * mu */
        "LDR	r9, [%[m], #20]\n\t"
        "LDR	r12, [%[a], #20]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #20]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+6] += m[6] * mu */
        "LDR	r9, [%[m], #24]\n\t"
        "LDR	r12, [%[a], #24]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #24]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+7] += m[7] * mu */
        "LDR	r9, [%[m], #28]\n\t"
        "LDR	r12, [%[a], #28]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #28]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+8] += m[8] * mu */
        "LDR	r9, [%[m], #32]\n\t"
        "LDR	r12, [%[a], #32]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #32]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+9] += m[9] * mu */
        "LDR	r9, [%[m], #36]\n\t"
        "LDR	r12, [%[a], #36]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #36]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+10] += m[10] * mu */
        "LDR	r9, [%[m], #40]\n\t"
        "LDR	r12, [%[a], #40]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #40]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+11] += m[11] * mu */
        "LDR	r9, [%[m], #44]\n\t"
        "LDR	r12, [%[a], #44]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #44]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+12] += m[12] * mu */
        "LDR	r9, [%[m], #48]\n\t"
        "LDR	r12, [%[a], #48]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #48]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+13] += m[13] * mu */
        "LDR	r9, [%[m], #52]\n\t"
        "LDR	r12, [%[a], #52]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #52]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+14] += m[14] * mu */
        "LDR	r9, [%[m], #56]\n\t"
        "LDR	r12, [%[a], #56]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #56]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+15] += m[15] * mu */
        "LDR	r9, [%[m], #60]\n\t"
        "LDR	r12, [%[a], #60]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #60]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+16] += m[16] * mu */
        "LDR	r9, [%[m], #64]\n\t"
        "LDR	r12, [%[a], #64]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #64]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+17] += m[17] * mu */
        "LDR	r9, [%[m], #68]\n\t"
        "LDR	r12, [%[a], #68]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #68]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+18] += m[18] * mu */
        "LDR	r9, [%[m], #72]\n\t"
        "LDR	r12, [%[a], #72]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #72]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+19] += m[19] * mu */
        "LDR	r9, [%[m], #76]\n\t"
        "LDR	r12, [%[a], #76]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #76]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+20] += m[20] * mu */
        "LDR	r9, [%[m], #80]\n\t"
        "LDR	r12, [%[a], #80]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #80]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+21] += m[21] * mu */
        "LDR	r9, [%[m], #84]\n\t"
        "LDR	r12, [%[a], #84]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #84]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+22] += m[22] * mu */
        "LDR	r9, [%[m], #88]\n\t"
        "LDR	r12, [%[a], #88]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #88]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+23] += m[23] * mu */
        "LDR	r9, [%[m], #92]\n\t"
        "LDR	r12, [%[a], #92]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #92]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+24] += m[24] * mu */
        "LDR	r9, [%[m], #96]\n\t"
        "LDR	r12, [%[a], #96]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #96]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+25] += m[25] * mu */
        "LDR	r9, [%[m], #100]\n\t"
        "LDR	r12, [%[a], #100]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #100]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+26] += m[26] * mu */
        "LDR	r9, [%[m], #104]\n\t"
        "LDR	r12, [%[a], #104]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #104]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+27] += m[27] * mu */
        "LDR	r9, [%[m], #108]\n\t"
        "LDR	r12, [%[a], #108]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #108]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+28] += m[28] * mu */
        "LDR	r9, [%[m], #112]\n\t"
        "LDR	r12, [%[a], #112]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #112]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+29] += m[29] * mu */
        "LDR	r9, [%[m], #116]\n\t"
        "LDR	r12, [%[a], #116]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #116]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+30] += m[30] * mu */
        "LDR	r9, [%[m], #120]\n\t"
        "LDR	r12, [%[a], #120]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #120]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+31] += m[31] * mu */
        "LDR	r9, [%[m], #124]\n\t"
        "LDR	r12, [%[a], #124]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #124]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+32] += m[32] * mu */
        "LDR	r9, [%[m], #128]\n\t"
        "LDR	r12, [%[a], #128]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #128]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+33] += m[33] * mu */
        "LDR	r9, [%[m], #132]\n\t"
        "LDR	r12, [%[a], #132]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #132]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+34] += m[34] * mu */
        "LDR	r9, [%[m], #136]\n\t"
        "LDR	r12, [%[a], #136]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #136]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+35] += m[35] * mu */
        "LDR	r9, [%[m], #140]\n\t"
        "LDR	r12, [%[a], #140]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #140]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+36] += m[36] * mu */
        "LDR	r9, [%[m], #144]\n\t"
        "LDR	r12, [%[a], #144]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #144]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+37] += m[37] * mu */
        "LDR	r9, [%[m], #148]\n\t"
        "LDR	r12, [%[a], #148]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #148]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+38] += m[38] * mu */
        "LDR	r9, [%[m], #152]\n\t"
        "LDR	r12, [%[a], #152]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #152]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+39] += m[39] * mu */
        "LDR	r9, [%[m], #156]\n\t"
        "LDR	r12, [%[a], #156]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #156]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+40] += m[40] * mu */
        "LDR	r9, [%[m], #160]\n\t"
        "LDR	r12, [%[a], #160]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #160]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+41] += m[41] * mu */
        "LDR	r9, [%[m], #164]\n\t"
        "LDR	r12, [%[a], #164]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #164]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+42] += m[42] * mu */
        "LDR	r9, [%[m], #168]\n\t"
        "LDR	r12, [%[a], #168]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #168]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+43] += m[43] * mu */
        "LDR	r9, [%[m], #172]\n\t"
        "LDR	r12, [%[a], #172]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #172]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+44] += m[44] * mu */
        "LDR	r9, [%[m], #176]\n\t"
        "LDR	r12, [%[a], #176]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #176]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+45] += m[45] * mu */
        "LDR	r9, [%[m], #180]\n\t"
        "LDR	r12, [%[a], #180]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #180]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+46] += m[46] * mu */
        "LDR	r9, [%[m], #184]\n\t"
        "LDR	r12, [%[a], #184]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #184]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+47] += m[47] * mu */
        "LDR	r9, [%[m], #188]\n\t"
        "LDR	r12, [%[a], #188]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #188]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+48] += m[48] * mu */
        "LDR	r9, [%[m], #192]\n\t"
        "LDR	r12, [%[a], #192]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #192]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+49] += m[49] * mu */
        "LDR	r9, [%[m], #196]\n\t"
        "LDR	r12, [%[a], #196]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #196]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+50] += m[50] * mu */
        "LDR	r9, [%[m], #200]\n\t"
        "LDR	r12, [%[a], #200]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #200]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+51] += m[51] * mu */
        "LDR	r9, [%[m], #204]\n\t"
        "LDR	r12, [%[a], #204]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #204]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+52] += m[52] * mu */
        "LDR	r9, [%[m], #208]\n\t"
        "LDR	r12, [%[a], #208]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #208]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+53] += m[53] * mu */
        "LDR	r9, [%[m], #212]\n\t"
        "LDR	r12, [%[a], #212]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #212]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+54] += m[54] * mu */
        "LDR	r9, [%[m], #216]\n\t"
        "LDR	r12, [%[a], #216]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #216]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+55] += m[55] * mu */
        "LDR	r9, [%[m], #220]\n\t"
        "LDR	r12, [%[a], #220]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #220]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+56] += m[56] * mu */
        "LDR	r9, [%[m], #224]\n\t"
        "LDR	r12, [%[a], #224]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #224]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+57] += m[57] * mu */
        "LDR	r9, [%[m], #228]\n\t"
        "LDR	r12, [%[a], #228]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #228]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+58] += m[58] * mu */
        "LDR	r9, [%[m], #232]\n\t"
        "LDR	r12, [%[a], #232]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #232]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+59] += m[59] * mu */
        "LDR	r9, [%[m], #236]\n\t"
        "LDR	r12, [%[a], #236]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #236]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+60] += m[60] * mu */
        "LDR	r9, [%[m], #240]\n\t"
        "LDR	r12, [%[a], #240]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #240]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+61] += m[61] * mu */
        "LDR	r9, [%[m], #244]\n\t"
        "LDR	r12, [%[a], #244]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #244]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+62] += m[62] * mu */
        "LDR	r9, [%[m], #248]\n\t"
        "LDR	r12, [%[a], #248]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #248]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+63] += m[63] * mu */
        "LDR	r9, [%[m], #252]\n\t"
        "LDR	r12, [%[a], #252]\n\t"
        "UMULL	r8, r9, r10, r9\n\t"
        "ADDS	r7, r7, r8\n\t"
        "ADCS	r6, r9, r3\n\t"
        "MOV	r3, #0x0\n\t"
        "ADC	r3, r3, r3\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #252]\n\t"
        "LDR	r12, [%[a], #256]\n\t"
        "ADCS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #256]\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* i += 1 */
        "ADD	r11, r11, #0x4\n\t"
        "ADD	%[a], %[a], #0x4\n\t"
        "CMP	r11, #0x100\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_2048_mont_reduce_64_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.W	L_sp_2048_mont_reduce_64_word\n\t"
#else
        "BLT.W	L_sp_2048_mont_reduce_64_word_%=\n\t"
#endif
        /* Loop Done */
        "STR	r4, [%[a]]\n\t"
        "STR	r5, [%[a], #4]\n\t"
        "MOV	%[mp], r3\n\t"
        : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
    );
    sp_2048_cond_sub_64(a - 64, a, m, (sp_digit)0 - mp);
}

#else
/* Reduce the number back to 2048 bits using Montgomery reduction.
 *
 * Looped variant built on UMLAL. The outer loop runs once per word of the
 * modulus (byte counter 0..0x100 in steps of 4); each pass computes
 * mu = a[i] * mp and adds m * mu into a[i..i+63], four words per pass of the
 * inner loop. The carry out of each pass is added into the word at byte
 * offset 256 from the current position and the overflow of that add is kept
 * in r3 for the next pass. When the loops finish, the final carry is handed
 * to sp_2048_cond_sub_64() as the mask (0 - carry).
 *
 * Register use inside the asm block:
 *   r3  carry between outer-loop passes ("ca")
 *   r4  running carry word inside the inner loop
 *   r5  high word accumulator for UMLAL
 *   r7  current word of m
 *   r8  mu
 *   r9  outer-loop byte counter (i)
 *   r10 current word of a
 *   r11 m[0] (loaded once; not referenced again in this block)
 *   r12 inner-loop byte offset (j)
 *
 * NOTE(review): the byte strides (4 per word, 0x100 in total) assume that
 * sp_digit is 32 bits wide, so that "a - 64" below undoes the 0x100 byte
 * advance of a - confirm against the sp_digit definition.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
#else
SP_NOINLINE static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0-r2 so the asm operands use those registers. */
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r11 = m[0] */
        "LDR	r11, [%[m]]\n\t"
        /* i = 0 */
        "MOV	r9, #0x0\n\t"
        /* ca = 0 */
        "MOV	r3, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_mont_reduce_64_word:\n\t"
#else
    "L_sp_2048_mont_reduce_64_word_%=:\n\t"
#endif
        /* mu = a[i] * mp */
        "LDR	r10, [%[a]]\n\t"
        "MUL	r8, %[mp], r10\n\t"
        /* j = 0 */
        "MOV	r12, #0x0\n\t"
        /* carry word for this pass = 0 */
        "MOV	r4, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_mont_reduce_64_mul:\n\t"
#else
    "L_sp_2048_mont_reduce_64_mul_%=:\n\t"
#endif
        /* a[i+j+0] += m[j+0] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "MOV	r5, #0x0\n\t"
        /* r5:r10 = r8 * r7 + r5:r10 */
        "UMLAL	r10, r5, r8, r7\n\t"
        /* add the incoming carry word, then form the outgoing one in r4 */
        "ADDS	r10, r10, r4\n\t"
        "STR	r10, [%[a], r12]\n\t"
        "ADC	r4, r5, #0x0\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+1] += m[j+1] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "MOV	r5, #0x0\n\t"
        "UMLAL	r10, r5, r8, r7\n\t"
        "ADDS	r10, r10, r4\n\t"
        "STR	r10, [%[a], r12]\n\t"
        "ADC	r4, r5, #0x0\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+2] += m[j+2] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "MOV	r5, #0x0\n\t"
        "UMLAL	r10, r5, r8, r7\n\t"
        "ADDS	r10, r10, r4\n\t"
        "STR	r10, [%[a], r12]\n\t"
        "ADC	r4, r5, #0x0\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+3] += m[j+3] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "MOV	r5, #0x0\n\t"
        "UMLAL	r10, r5, r8, r7\n\t"
        "ADDS	r10, r10, r4\n\t"
        "STR	r10, [%[a], r12]\n\t"
        "ADC	r4, r5, #0x0\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* inner loop ends once the byte offset reaches 0x100 */
        "CMP	r12, #0x100\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_2048_mont_reduce_64_mul_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_2048_mont_reduce_64_mul\n\t"
#else
        "BLT.N	L_sp_2048_mont_reduce_64_mul_%=\n\t"
#endif
        /* word at byte offset 256 += carry word (r4) + previous carry (r3) */
        "LDR	r10, [%[a], #256]\n\t"
        "ADDS	r4, r4, r3\n\t"
        /* r3 = carry out of r4 + r3 */
        "MOV	r3, #0x0\n\t"
        "ADC	r3, r3, #0x0\n\t"
        "ADDS	r10, r10, r4\n\t"
        /* r3 = r3 + r3 + carry out of the add above */
        "ADC	r3, r3, r3\n\t"
        "STR	r10, [%[a], #256]\n\t"
        /* i += 1 */
        "ADD	r9, r9, #0x4\n\t"
        "ADD	%[a], %[a], #0x4\n\t"
        "CMP	r9, #0x100\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_2048_mont_reduce_64_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_2048_mont_reduce_64_word\n\t"
#else
        "BLT.N	L_sp_2048_mont_reduce_64_word_%=\n\t"
#endif
        /* Loop Done */
        /* hand the final carry back to C through mp */
        "MOV	%[mp], r3\n\t"
        : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
    );
    /* a was advanced by 0x100 bytes in the loop; mask is 0 when mp == 0 and
     * all ones when mp == 1. NOTE(review): sp_2048_cond_sub_64 is defined
     * elsewhere - presumably it writes (a - m) & mask style result to a - 64;
     * confirm against its definition. */
    sp_2048_cond_sub_64(a - 64, a, m, (sp_digit)0 - mp);
}

#endif /* !WOLFSSL_SP_SMALL */
#else
#ifndef WOLFSSL_SP_SMALL
/* Reduce the number back to 2048 bits using Montgomery reduction.
 *
 * Fully unrolled variant built on UMAAL (compiled when WOLFSSL_SP_SMALL is
 * not defined). UMAAL RdLo, RdHi, Rn, Rm computes
 * RdHi:RdLo = Rn * Rm + RdLo + RdHi, so r3 carries the running high word
 * from one word of the row to the next without touching the flags.
 *
 * The five lowest words of the current window, a[i+0..i+4], are kept in
 * r6-r10 instead of memory. While a row is processed each of them is moved
 * down one register (a[i+1] is computed into r6, a[i+2] into r7, ...), which
 * makes them a[i+0..i+4] of the next row after a is advanced by one word.
 * Words from index 6 upwards are read from and written back to memory
 * through r11. After the last row the register window is stored to memory.
 *
 * Register use inside the asm block:
 *   r3       running carry (high word) within a row
 *   r4       outer-loop byte counter (i)
 *   r5       carry between rows
 *   r6-r10   register window described above
 *   r11      current word of a for indices >= 6
 *   r12      current word of m (and a zero constant at the end of a row)
 *   lr       mu, then reused for the word at byte offset 256
 *
 * NOTE(review): the byte strides (4 per word, 0x100 in total) assume that
 * sp_digit is 32 bits wide, so that "a - 64" below undoes the 0x100 byte
 * advance of a - confirm against the sp_digit definition.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
#else
SP_NOINLINE static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0-r2 so the asm operands use those registers. */
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* i = 0 */
        "MOV	r4, #0x0\n\t"
        /* carry between rows = 0 */
        "MOV	r5, #0x0\n\t"
        /* preload the register window with a[0..4] */
        "LDR	r6, [%[a]]\n\t"
        "LDR	r7, [%[a], #4]\n\t"
        "LDR	r8, [%[a], #8]\n\t"
        "LDR	r9, [%[a], #12]\n\t"
        "LDR	r10, [%[a], #16]\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_mont_reduce_64_word:\n\t"
#else
    "L_sp_2048_mont_reduce_64_word_%=:\n\t"
#endif
        /* mu = a[i] * mp */
        "MUL	lr, %[mp], r6\n\t"
        /* a[i+0] += m[0] * mu */
        "LDR	r12, [%[m]]\n\t"
        "MOV	r3, #0x0\n\t"
        /* The low word left in r6 is not stored; r6 is overwritten below.
         * NOTE(review): presumably it is zero by the choice of mu - confirm. */
        "UMAAL	r6, r3, lr, r12\n\t"
        /* a[i+1] += m[1] * mu */
        "LDR	r12, [%[m], #4]\n\t"
        "MOV	r6, r7\n\t"
        "UMAAL	r6, r3, lr, r12\n\t"
        /* a[i+2] += m[2] * mu */
        "LDR	r12, [%[m], #8]\n\t"
        "MOV	r7, r8\n\t"
        "UMAAL	r7, r3, lr, r12\n\t"
        /* a[i+3] += m[3] * mu */
        "LDR	r12, [%[m], #12]\n\t"
        "MOV	r8, r9\n\t"
        "UMAAL	r8, r3, lr, r12\n\t"
        /* a[i+4] += m[4] * mu */
        "LDR	r12, [%[m], #16]\n\t"
        "MOV	r9, r10\n\t"
        "UMAAL	r9, r3, lr, r12\n\t"
        /* a[i+5] += m[5] * mu */
        /* loaded from memory into the top of the window; not stored here */
        "LDR	r12, [%[m], #20]\n\t"
        "LDR	r10, [%[a], #20]\n\t"
        "UMAAL	r10, r3, lr, r12\n\t"
        /* a[i+6] += m[6] * mu */
        "LDR	r12, [%[m], #24]\n\t"
        "LDR	r11, [%[a], #24]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #24]\n\t"
        /* a[i+7] += m[7] * mu */
        "LDR	r12, [%[m], #28]\n\t"
        "LDR	r11, [%[a], #28]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #28]\n\t"
        /* a[i+8] += m[8] * mu */
        "LDR	r12, [%[m], #32]\n\t"
        "LDR	r11, [%[a], #32]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #32]\n\t"
        /* a[i+9] += m[9] * mu */
        "LDR	r12, [%[m], #36]\n\t"
        "LDR	r11, [%[a], #36]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #36]\n\t"
        /* a[i+10] += m[10] * mu */
        "LDR	r12, [%[m], #40]\n\t"
        "LDR	r11, [%[a], #40]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #40]\n\t"
        /* a[i+11] += m[11] * mu */
        "LDR	r12, [%[m], #44]\n\t"
        "LDR	r11, [%[a], #44]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #44]\n\t"
        /* a[i+12] += m[12] * mu */
        "LDR	r12, [%[m], #48]\n\t"
        "LDR	r11, [%[a], #48]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #48]\n\t"
        /* a[i+13] += m[13] * mu */
        "LDR	r12, [%[m], #52]\n\t"
        "LDR	r11, [%[a], #52]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #52]\n\t"
        /* a[i+14] += m[14] * mu */
        "LDR	r12, [%[m], #56]\n\t"
        "LDR	r11, [%[a], #56]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #56]\n\t"
        /* a[i+15] += m[15] * mu */
        "LDR	r12, [%[m], #60]\n\t"
        "LDR	r11, [%[a], #60]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #60]\n\t"
        /* a[i+16] += m[16] * mu */
        "LDR	r12, [%[m], #64]\n\t"
        "LDR	r11, [%[a], #64]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #64]\n\t"
        /* a[i+17] += m[17] * mu */
        "LDR	r12, [%[m], #68]\n\t"
        "LDR	r11, [%[a], #68]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #68]\n\t"
        /* a[i+18] += m[18] * mu */
        "LDR	r12, [%[m], #72]\n\t"
        "LDR	r11, [%[a], #72]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #72]\n\t"
        /* a[i+19] += m[19] * mu */
        "LDR	r12, [%[m], #76]\n\t"
        "LDR	r11, [%[a], #76]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #76]\n\t"
        /* a[i+20] += m[20] * mu */
        "LDR	r12, [%[m], #80]\n\t"
        "LDR	r11, [%[a], #80]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #80]\n\t"
        /* a[i+21] += m[21] * mu */
        "LDR	r12, [%[m], #84]\n\t"
        "LDR	r11, [%[a], #84]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #84]\n\t"
        /* a[i+22] += m[22] * mu */
        "LDR	r12, [%[m], #88]\n\t"
        "LDR	r11, [%[a], #88]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #88]\n\t"
        /* a[i+23] += m[23] * mu */
        "LDR	r12, [%[m], #92]\n\t"
        "LDR	r11, [%[a], #92]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #92]\n\t"
        /* a[i+24] += m[24] * mu */
        "LDR	r12, [%[m], #96]\n\t"
        "LDR	r11, [%[a], #96]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #96]\n\t"
        /* a[i+25] += m[25] * mu */
        "LDR	r12, [%[m], #100]\n\t"
        "LDR	r11, [%[a], #100]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #100]\n\t"
        /* a[i+26] += m[26] * mu */
        "LDR	r12, [%[m], #104]\n\t"
        "LDR	r11, [%[a], #104]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #104]\n\t"
        /* a[i+27] += m[27] * mu */
        "LDR	r12, [%[m], #108]\n\t"
        "LDR	r11, [%[a], #108]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #108]\n\t"
        /* a[i+28] += m[28] * mu */
        "LDR	r12, [%[m], #112]\n\t"
        "LDR	r11, [%[a], #112]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #112]\n\t"
        /* a[i+29] += m[29] * mu */
        "LDR	r12, [%[m], #116]\n\t"
        "LDR	r11, [%[a], #116]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #116]\n\t"
        /* a[i+30] += m[30] * mu */
        "LDR	r12, [%[m], #120]\n\t"
        "LDR	r11, [%[a], #120]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #120]\n\t"
        /* a[i+31] += m[31] * mu */
        "LDR	r12, [%[m], #124]\n\t"
        "LDR	r11, [%[a], #124]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #124]\n\t"
        /* a[i+32] += m[32] * mu */
        "LDR	r12, [%[m], #128]\n\t"
        "LDR	r11, [%[a], #128]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #128]\n\t"
        /* a[i+33] += m[33] * mu */
        "LDR	r12, [%[m], #132]\n\t"
        "LDR	r11, [%[a], #132]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #132]\n\t"
        /* a[i+34] += m[34] * mu */
        "LDR	r12, [%[m], #136]\n\t"
        "LDR	r11, [%[a], #136]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #136]\n\t"
        /* a[i+35] += m[35] * mu */
        "LDR	r12, [%[m], #140]\n\t"
        "LDR	r11, [%[a], #140]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #140]\n\t"
        /* a[i+36] += m[36] * mu */
        "LDR	r12, [%[m], #144]\n\t"
        "LDR	r11, [%[a], #144]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #144]\n\t"
        /* a[i+37] += m[37] * mu */
        "LDR	r12, [%[m], #148]\n\t"
        "LDR	r11, [%[a], #148]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #148]\n\t"
        /* a[i+38] += m[38] * mu */
        "LDR	r12, [%[m], #152]\n\t"
        "LDR	r11, [%[a], #152]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #152]\n\t"
        /* a[i+39] += m[39] * mu */
        "LDR	r12, [%[m], #156]\n\t"
        "LDR	r11, [%[a], #156]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #156]\n\t"
        /* a[i+40] += m[40] * mu */
        "LDR	r12, [%[m], #160]\n\t"
        "LDR	r11, [%[a], #160]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #160]\n\t"
        /* a[i+41] += m[41] * mu */
        "LDR	r12, [%[m], #164]\n\t"
        "LDR	r11, [%[a], #164]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #164]\n\t"
        /* a[i+42] += m[42] * mu */
        "LDR	r12, [%[m], #168]\n\t"
        "LDR	r11, [%[a], #168]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #168]\n\t"
        /* a[i+43] += m[43] * mu */
        "LDR	r12, [%[m], #172]\n\t"
        "LDR	r11, [%[a], #172]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #172]\n\t"
        /* a[i+44] += m[44] * mu */
        "LDR	r12, [%[m], #176]\n\t"
        "LDR	r11, [%[a], #176]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #176]\n\t"
        /* a[i+45] += m[45] * mu */
        "LDR	r12, [%[m], #180]\n\t"
        "LDR	r11, [%[a], #180]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #180]\n\t"
        /* a[i+46] += m[46] * mu */
        "LDR	r12, [%[m], #184]\n\t"
        "LDR	r11, [%[a], #184]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #184]\n\t"
        /* a[i+47] += m[47] * mu */
        "LDR	r12, [%[m], #188]\n\t"
        "LDR	r11, [%[a], #188]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #188]\n\t"
        /* a[i+48] += m[48] * mu */
        "LDR	r12, [%[m], #192]\n\t"
        "LDR	r11, [%[a], #192]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #192]\n\t"
        /* a[i+49] += m[49] * mu */
        "LDR	r12, [%[m], #196]\n\t"
        "LDR	r11, [%[a], #196]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #196]\n\t"
        /* a[i+50] += m[50] * mu */
        "LDR	r12, [%[m], #200]\n\t"
        "LDR	r11, [%[a], #200]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #200]\n\t"
        /* a[i+51] += m[51] * mu */
        "LDR	r12, [%[m], #204]\n\t"
        "LDR	r11, [%[a], #204]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #204]\n\t"
        /* a[i+52] += m[52] * mu */
        "LDR	r12, [%[m], #208]\n\t"
        "LDR	r11, [%[a], #208]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #208]\n\t"
        /* a[i+53] += m[53] * mu */
        "LDR	r12, [%[m], #212]\n\t"
        "LDR	r11, [%[a], #212]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #212]\n\t"
        /* a[i+54] += m[54] * mu */
        "LDR	r12, [%[m], #216]\n\t"
        "LDR	r11, [%[a], #216]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #216]\n\t"
        /* a[i+55] += m[55] * mu */
        "LDR	r12, [%[m], #220]\n\t"
        "LDR	r11, [%[a], #220]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #220]\n\t"
        /* a[i+56] += m[56] * mu */
        "LDR	r12, [%[m], #224]\n\t"
        "LDR	r11, [%[a], #224]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #224]\n\t"
        /* a[i+57] += m[57] * mu */
        "LDR	r12, [%[m], #228]\n\t"
        "LDR	r11, [%[a], #228]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #228]\n\t"
        /* a[i+58] += m[58] * mu */
        "LDR	r12, [%[m], #232]\n\t"
        "LDR	r11, [%[a], #232]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #232]\n\t"
        /* a[i+59] += m[59] * mu */
        "LDR	r12, [%[m], #236]\n\t"
        "LDR	r11, [%[a], #236]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #236]\n\t"
        /* a[i+60] += m[60] * mu */
        "LDR	r12, [%[m], #240]\n\t"
        "LDR	r11, [%[a], #240]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #240]\n\t"
        /* a[i+61] += m[61] * mu */
        "LDR	r12, [%[m], #244]\n\t"
        "LDR	r11, [%[a], #244]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #244]\n\t"
        /* a[i+62] += m[62] * mu */
        "LDR	r12, [%[m], #248]\n\t"
        "LDR	r11, [%[a], #248]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #248]\n\t"
        /* a[i+63] += m[63] * mu */
        "LDR	r12, [%[m], #252]\n\t"
        "LDR	r11, [%[a], #252]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        /* mu is no longer needed: lr = word at byte offset 256 */
        "LDR	lr, [%[a], #256]\n\t"
        "MOV	r12, #0x0\n\t"
        /* lr:r3 = 0 * 0 + r3 + lr, i.e. r3 = low word of the sum, lr = carry */
        "UMAAL	r3, lr, r12, r12\n\t"
        "STR	r11, [%[a], #252]\n\t"
        /* add the carry left by the previous row, form the new one in r5 */
        "ADDS	r3, r3, r5\n\t"
        "ADC	r5, lr, #0x0\n\t"
        "STR	r3, [%[a], #256]\n\t"
        /* i += 1 */
        "ADD	r4, r4, #0x4\n\t"
        "ADD	%[a], %[a], #0x4\n\t"
        "CMP	r4, #0x100\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_2048_mont_reduce_64_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.W	L_sp_2048_mont_reduce_64_word\n\t"
#else
        "BLT.W	L_sp_2048_mont_reduce_64_word_%=\n\t"
#endif
        /* Loop Done */
        /* write the register window back to the first five words at a */
        "STR	r6, [%[a]]\n\t"
        "STR	r7, [%[a], #4]\n\t"
        "STR	r8, [%[a], #8]\n\t"
        "STR	r9, [%[a], #12]\n\t"
        "STR	r10, [%[a], #16]\n\t"
        /* hand the final carry back to C through mp */
        "MOV	%[mp], r5\n\t"
        : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
    );
    /* a was advanced by 0x100 bytes in the loop; mask is 0 when mp == 0 and
     * all ones when mp == 1. NOTE(review): sp_2048_cond_sub_64 is defined
     * elsewhere - presumably it subtracts m under the mask; confirm against
     * its definition. */
    sp_2048_cond_sub_64(a - 64, a, m, (sp_digit)0 - mp);
}

#else
/* Reduce the number back to 2048 bits using Montgomery reduction.
 *
 * Looped variant built on UMAAL (compiled when WOLFSSL_SP_SMALL is defined).
 * UMAAL RdLo, RdHi, Rn, Rm computes RdHi:RdLo = Rn * Rm + RdLo + RdHi, so r4
 * carries the running high word from one word to the next without touching
 * the flags. The outer loop runs once per word of the modulus (byte counter
 * 0..0x100 in steps of 4); each pass computes mu = a[i] * mp and adds m * mu
 * into a[i..i+63], four words per pass of the inner loop. The carry out of a
 * pass is added into the word at byte offset 256 from the current position
 * and the overflow of that add is kept in r3 for the next pass.
 *
 * Register use inside the asm block:
 *   r3  carry between outer-loop passes ("ca")
 *   r4  running carry word inside the inner loop
 *   r7  current word of m
 *   r8  mu
 *   r9  outer-loop byte counter (i)
 *   r10 current word of a
 *   r11 m[0] (loaded once; not referenced again in this block)
 *   r12 inner-loop byte offset (j)
 *
 * NOTE(review): the byte strides (4 per word, 0x100 in total) assume that
 * sp_digit is 32 bits wide, so that "a - 64" below undoes the 0x100 byte
 * advance of a - confirm against the sp_digit definition.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
#else
SP_NOINLINE static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0-r2 so the asm operands use those registers. */
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r11 = m[0] */
        "LDR	r11, [%[m]]\n\t"
        /* i = 0 */
        "MOV	r9, #0x0\n\t"
        /* ca = 0 */
        "MOV	r3, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_mont_reduce_64_word:\n\t"
#else
    "L_sp_2048_mont_reduce_64_word_%=:\n\t"
#endif
        /* mu = a[i] * mp */
        "LDR	r10, [%[a]]\n\t"
        "MUL	r8, %[mp], r10\n\t"
        /* j = 0 */
        "MOV	r12, #0x0\n\t"
        /* carry word for this pass = 0 */
        "MOV	r4, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_mont_reduce_64_mul:\n\t"
#else
    "L_sp_2048_mont_reduce_64_mul_%=:\n\t"
#endif
        /* a[i+j+0] += m[j+0] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        /* r4:r10 = r8 * r7 + r10 + r4 */
        "UMAAL	r10, r4, r8, r7\n\t"
        "STR	r10, [%[a], r12]\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+1] += m[j+1] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "UMAAL	r10, r4, r8, r7\n\t"
        "STR	r10, [%[a], r12]\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+2] += m[j+2] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "UMAAL	r10, r4, r8, r7\n\t"
        "STR	r10, [%[a], r12]\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+3] += m[j+3] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "UMAAL	r10, r4, r8, r7\n\t"
        "STR	r10, [%[a], r12]\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* inner loop ends once the byte offset reaches 0x100 */
        "CMP	r12, #0x100\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_2048_mont_reduce_64_mul_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_2048_mont_reduce_64_mul\n\t"
#else
        "BLT.N	L_sp_2048_mont_reduce_64_mul_%=\n\t"
#endif
        /* word at byte offset 256 += carry word (r4) + previous carry (r3) */
        "LDR	r10, [%[a], #256]\n\t"
        "ADDS	r4, r4, r3\n\t"
        /* r3 = carry out of r4 + r3 */
        "MOV	r3, #0x0\n\t"
        "ADC	r3, r3, #0x0\n\t"
        "ADDS	r10, r10, r4\n\t"
        /* r3 = r3 + r3 + carry out of the add above */
        "ADC	r3, r3, r3\n\t"
        "STR	r10, [%[a], #256]\n\t"
        /* i += 1 */
        "ADD	r9, r9, #0x4\n\t"
        "ADD	%[a], %[a], #0x4\n\t"
        "CMP	r9, #0x100\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_2048_mont_reduce_64_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_2048_mont_reduce_64_word\n\t"
#else
        "BLT.N	L_sp_2048_mont_reduce_64_word_%=\n\t"
#endif
        /* Loop Done */
        /* hand the final carry back to C through mp */
        "MOV	%[mp], r3\n\t"
        : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
    );
    /* a was advanced by 0x100 bytes in the loop; mask is 0 when mp == 0 and
     * all ones when mp == 1. NOTE(review): sp_2048_cond_sub_64 is defined
     * elsewhere - presumably it subtracts m under the mask; confirm against
     * its definition. */
    sp_2048_cond_sub_64(a - 64, a, m, (sp_digit)0 - mp);
}

#endif /* !WOLFSSL_SP_SMALL */
#endif
/* Montgomery multiplication of two 64-word values. (r = a * b mod m)
 *
 * The product is formed first with sp_2048_mul_64() and then brought back
 * into range in place with sp_2048_mont_reduce_64().
 *
 * r   Receives the result of the multiplication.
 * a   First operand, in Montgomery form.
 * b   Second operand, in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_2048_mont_mul_64(
        sp_digit* r,
        const sp_digit* a,
        const sp_digit* b,
        const sp_digit* m,
        sp_digit mp)
{
    /* Step 1: plain multiplication, result written to r. */
    sp_2048_mul_64(r, a, b);

    /* Step 2: Montgomery reduction of r, in place. */
    sp_2048_mont_reduce_64(r, m, mp);
}

/* Montgomery squaring of a 64-word value. (r = a * a mod m)
 *
 * The square is formed first with sp_2048_sqr_64() and then brought back
 * into range in place with sp_2048_mont_reduce_64().
 *
 * r   Receives the result of the squaring.
 * a   Operand to square, in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_2048_mont_sqr_64(
        sp_digit* r,
        const sp_digit* a,
        const sp_digit* m,
        sp_digit mp)
{
    /* Step 1: plain squaring, result written to r. */
    sp_2048_sqr_64(r, a);

    /* Step 2: Montgomery reduction of r, in place. */
    sp_2048_mont_reduce_64(r, m, mp);
}

#ifdef WOLFSSL_SP_SMALL
/* Sub b from a into r. (r = a - b)
 *
 * Looped variant (compiled when WOLFSSL_SP_SMALL is defined). Four words are
 * subtracted per pass until a has advanced 0x100 bytes. The borrow state is
 * parked in r11 between passes because the loop's CMP overwrites the flags.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 *
 * Returns the final borrow state from r11: 0 when the last subtract left the
 * carry flag set (no borrow), all bits set when it left it clear (borrow).
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_2048_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0-r2 so the asm operands use those registers. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r11 = saved borrow state (0 = no borrow) */
        "MOV	r11, #0x0\n\t"
        /* r12 = end pointer: a + 0x100 bytes */
        "ADD	r12, %[a], #0x100\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_sub_64_word:\n\t"
#else
    "L_sp_2048_sub_64_word_%=:\n\t"
#endif
        /* Restore the carry flag from r11: 0 - 0 sets C (no borrow),
         * 0 - 0xffffffff clears it (borrow pending). */
        "RSBS	r11, r11, #0x0\n\t"
        /* load four words of each operand, post-incrementing both pointers */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Save the borrow: r3 - r3 - !C is 0 if C set, 0xffffffff if clear. */
        "SBC	r11, r3, r3\n\t"
        /* loop until a reaches the end pointer (this clobbers the flags) */
        "CMP	%[a], r12\n\t"
#if defined(__GNUC__)
        "BNE	L_sp_2048_sub_64_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BNE.N	L_sp_2048_sub_64_word\n\t"
#else
        "BNE.N	L_sp_2048_sub_64_word_%=\n\t"
#endif
        /* the register holding r is reused to pass the borrow state out */
        "MOV	%[r], r11\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
    );
    /* r no longer holds a pointer here; it carries the borrow state. */
    return (word32)(size_t)r;
}

#else
/* Sub b from a into r. (r = a - b)
 *
 * Unrolled variant: sixteen groups of four words (64 words in total) are
 * subtracted in one carry chain with no loop.
 *
 * r  A single precision integer. Receives the difference.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 when the subtraction did not borrow out of the top word and
 * 0xffffffff when it did.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_2048_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0-r2 for use as operands of the assembly. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* First group starts the chain with SUBS; every later word uses
         * SBCS so the borrow flows through all 64 words. LDM/STM do not
         * change the flags. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SUBS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Turn the final borrow into 0 or all ones; the register that held
         * r is reused to carry the result out. */
        "SBC	%[r], r6, r6\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    /* r no longer holds a pointer - it holds the borrow value. */
    return (word32)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_USE_UDIV
/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
 *
 * Variant that uses the hardware UDIV instruction. The quotient is built up
 * in r6 from successive estimates made with (div >> 16) + 1 as the divisor,
 * with the matching multiple of div subtracted from (d1|d0) each time.
 *
 * d1   The high order half of the number to divide.
 * d0   The low order half of the number to divide.
 * div  The divisor.
 * returns the result of the division.
 *
 * Note that this is an approximate div. It may give an answer 1 larger.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
#else
SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, sp_digit div)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0-r2 for use as operands of the assembly. */
    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r8 = div >> 16, r5 = r8 + 1: divisor used for the estimates. */
        "LSR	r8, %[div], #16\n\t"
        "ADD	r5, r8, #0x1\n\t"
        /* r6 = (d1 / r5) << 16: first estimate. r7 = div << 16. */
        "UDIV	r6, %[d1], r5\n\t"
        "LSL	r7, %[div], #16\n\t"
        "LSL	r6, r6, #16\n\t"
        /* (d1|d0) -= div * r6 */
        "UMULL	r3, r4, %[div], r6\n\t"
        "SUBS	%[d0], %[d0], r3\n\t"
        "SBC	%[d1], %[d1], r4\n\t"
        /* r9 = 1 when d1 >= r5, else 0; r10 = matching all ones/zero mask. */
        "SUBS	r3, %[d1], r5\n\t"
        "SBC	r9, r9, r9\n\t"
        "ADD	r9, r9, #0x1\n\t"
        "RSB	r10, r9, #0x0\n\t"
        /* Without branching: when the mask is set, subtract div << 16
         * (r8|r7) from (d1|d0) and add 0x10000 to the quotient. */
        "LSL	r9, r9, #16\n\t"
        "AND	r7, r7, r10\n\t"
        "AND	r8, r8, r10\n\t"
        "SUBS	%[d0], %[d0], r7\n\t"
        "ADD	r6, r6, r9\n\t"
        "SBC	%[d1], %[d1], r8\n\t"
        /* Estimate from bits 16..47 of (d1|d0), add to the quotient and
         * subtract div * estimate. */
        "LSL	r4, %[d1], #16\n\t"
        "LSR	r3, %[d0], #16\n\t"
        "ORR	r3, r3, r4\n\t"
        "UDIV	r3, r3, r5\n\t"
        "ADD	r6, r6, r3\n\t"
        "UMULL	r3, r4, %[div], r3\n\t"
        "SUBS	%[d0], %[d0], r3\n\t"
        "SBC	%[d1], %[d1], r4\n\t"
        /* Same again, but only the low word of the remainder is updated. */
        "LSL	r4, %[d1], #16\n\t"
        "LSR	r3, %[d0], #16\n\t"
        "ORR	r3, r3, r4\n\t"
        "UDIV	r3, r3, r5\n\t"
        "ADD	r6, r6, r3\n\t"
        "MUL	r3, %[div], r3\n\t"
        "SUB	%[d0], %[d0], r3\n\t"
        /* Final step divides the remaining low word by the full divisor;
         * the total is returned in the register that held d1. */
        "UDIV	r3, %[d0], %[div]\n\t"
        "ADD	%[d1], r6, r3\n\t"
        : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    /* d1 now holds the quotient. */
    return (word32)(size_t)d1;
}

#else
/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
 *
 * Variant that does not use a divide instruction. A quotient estimate is
 * built one bit at a time by compare and masked subtract against
 * (div >> 1) + 1, and is then corrected using multiplies by div.
 *
 * d1   The high order half of the number to divide.
 * d0   The low order half of the number to divide.
 * div  The divisor.
 * returns the result of the division.
 *
 * Note that this is an approximate div. It may give an answer 1 larger.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
#else
SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, sp_digit div)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0-r2 for use as operands of the assembly. */
    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r5 = (div >> 1) + 1: value compared against in the bit loop.
         * (r7|r6) = working copy of (d1|d0); r3 accumulates the quotient. */
        "LSR	r5, %[div], #1\n\t"
        "ADD	r5, r5, #0x1\n\t"
        "MOV	r6, %[d0]\n\t"
        "MOV	r7, %[d1]\n\t"
        /* Do top 32 */
        /* r8 = all ones when r7 > r5, else 0. The quotient bit is set and
         * r5 is subtracted from r7 under that mask - no branch is taken. */
        "SUBS	r8, r5, r7\n\t"
        "SBC	r8, r8, r8\n\t"
        "MOV	r3, #0x0\n\t"
        "SUB	r3, r3, r8\n\t"
        "AND	r8, r8, r5\n\t"
        "SUBS	r7, r7, r8\n\t"
        /* Next 30 bits */
        /* r4 counts down from 29; the loop body runs 30 times. */
        "MOV	r4, #0x1d\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_div_2048_word_64_bit:\n\t"
#else
    "L_div_2048_word_64_bit_%=:\n\t"
#endif
        /* Shift (r7|r6) left one bit, double the quotient, then repeat the
         * masked compare and subtract for the new bit. */
        "LSLS	r6, r6, #1\n\t"
        "ADC	r7, r7, r7\n\t"
        "SUBS	r8, r5, r7\n\t"
        "SBC	r8, r8, r8\n\t"
        "ADD	r3, r3, r3\n\t"
        "SUB	r3, r3, r8\n\t"
        "AND	r8, r8, r5\n\t"
        "SUBS	r7, r7, r8\n\t"
        "SUBS	r4, r4, #0x1\n\t"
#if defined(__GNUC__)
        "BPL	L_div_2048_word_64_bit_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BPL.N	L_div_2048_word_64_bit\n\t"
#else
        "BPL.N	L_div_2048_word_64_bit_%=\n\t"
#endif
        /* r3 = 2 * r3 + 1 */
        "ADD	r3, r3, r3\n\t"
        "ADD	r3, r3, #0x1\n\t"
        /* Three correction rounds: (r10|r9) = (d1|d0) - r3 * div, then the
         * high word of that difference is added to the estimate. */
        "UMULL	r6, r7, r3, %[div]\n\t"
        "SUBS	r9, %[d0], r6\n\t"
        "SBC	r10, %[d1], r7\n\t"
        "ADD	r3, r3, r10\n\t"
        "UMULL	r6, r7, r3, %[div]\n\t"
        "SUBS	r9, %[d0], r6\n\t"
        "SBC	r10, %[d1], r7\n\t"
        "ADD	r3, r3, r10\n\t"
        "UMULL	r6, r7, r3, %[div]\n\t"
        "SUBS	r9, %[d0], r6\n\t"
        "SBC	r10, %[d1], r7\n\t"
        "ADD	r3, r3, r10\n\t"
        /* Add one more when the last low difference word (r9) is greater
         * than div; the result is returned in the register that held d1. */
        "SUBS	r8, %[div], r9\n\t"
        "SBC	r8, r8, r8\n\t"
        "SUB	%[d1], r3, r8\n\t"
        : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    /* d1 now holds the quotient. */
    return (word32)(size_t)d1;
}

#endif
/* Divide a by d and put the remainder into r. (m*d + r = a)
 * The multiplier m is not calculated as it is not needed at this time.
 *
 * This variant takes data dependent branches, so its running time depends
 * on the values being divided.
 *
 * a  Number to be divided (128 words).
 * d  Number to divide with (64 words).
 * m  Multiplier result. Unused.
 * r  Remainder from the division (64 words).
 * returns MP_OKAY indicating success.
 */
static WC_INLINE int sp_2048_div_64_cond(const sp_digit* a, const sp_digit* d,
        sp_digit* m, sp_digit* r)
{
    sp_digit rem[128];
    sp_digit prod[65];
    sp_digit top;
    sp_digit q;
    int i;

    (void)m;

    top = d[63];
    XMEMCPY(rem, a, sizeof(*rem) * 2 * 64);

    /* Find the most significant word where the top half differs from d. */
    i = 63;
    while ((i > 0) && (rem[i + 64] == d[i])) {
        i--;
    }
    /* Top half is not less than d: take d off once up front. */
    if (rem[i + 64] >= d[i]) {
        sp_2048_sub_in_place_64(&rem[64], d);
    }

    for (i = 63; i >= 0; i--) {
        sp_digit* hi = &rem[64 + i];

        /* Estimate the quotient word; saturate when the top words match. */
        if (*hi != top) {
            q = div_2048_word_64(*hi, hi[-1], top);
        }
        else {
            q = SP_DIGIT_MAX;
        }

        /* Take q * d off the current 65 word window. */
        sp_2048_mul_d_64(prod, d, q);
        *hi += sp_2048_sub_in_place_64(&rem[i], prod);
        *hi -= prod[64];

        /* Estimate was too big: add d back, at most twice. */
        if (*hi != 0) {
            *hi += sp_2048_add_64(&rem[i], &rem[i], d);
            if (*hi != 0) {
                *hi += sp_2048_add_64(&rem[i], &rem[i], d);
            }
        }
    }

    /* Final reduction: subtract d once more if the remainder is not less. */
    i = 63;
    while ((i > 0) && (rem[i] == d[i])) {
        i--;
    }
    if (rem[i] < d[i]) {
        XMEMCPY(r, rem, sizeof(*rem) * 64);
    }
    else {
        sp_2048_sub_64(r, rem, d);
    }

    return MP_OKAY;
}

/* Reduce a modulo m into r. (r = a mod m)
 *
 * Wrapper around sp_2048_div_64_cond() that keeps only the remainder.
 *
 * r  A single precision number that is the reduced result.
 * a  A single precision number that is to be reduced.
 * m  A single precision number that is the modulus to reduce with.
 * returns MP_OKAY indicating success.
 */
static WC_INLINE int sp_2048_mod_64_cond(sp_digit* r, const sp_digit* a,
        const sp_digit* m)
{
    int ret;

    /* No quotient buffer is supplied - only the remainder is wanted. */
    ret = sp_2048_div_64_cond(a, m, NULL, r);

    return ret;
}

#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
#if defined(WOLFSSL_HAVE_SP_DH) || !defined(WOLFSSL_RSA_PUBLIC_ONLY)
/* AND m into each word of a and store in r.
 *
 * All 64 words are processed whatever the value of the mask.
 *
 * r  A single precision integer. Receives the masked words.
 * a  A single precision integer.
 * m  Mask to AND against each digit.
 */
static void sp_2048_mask_64(sp_digit* r, const sp_digit* a, sp_digit m)
{
#ifdef WOLFSSL_SP_SMALL
    int idx = 0;

    /* One word per iteration. */
    while (idx < 64) {
        r[idx] = a[idx] & m;
        idx++;
    }
#else
    int base;

    /* Eight words per iteration. */
    for (base = 0; base < 64; base += 8) {
        sp_digit* dst = r + base;
        const sp_digit* src = a + base;

        dst[0] = src[0] & m;
        dst[1] = src[1] & m;
        dst[2] = src[2] & m;
        dst[3] = src[3] & m;
        dst[4] = src[4] & m;
        dst[5] = src[5] & m;
        dst[6] = src[6] & m;
        dst[7] = src[7] & m;
    }
#endif
}

/* Compare a with b in constant time.
 *
 * All 64 words are always loaded and compared, from the most significant
 * (byte offset 252) down to the least significant (byte offset 0). The
 * result is updated with conditional moves only; the one branch, in the
 * WOLFSSL_SP_SMALL loop, depends on the word counter and not on the data.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_int32 sp_2048_cmp_64(const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_int32 sp_2048_cmp_64(const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0-r1 for use as operands of the assembly. */
    register const sp_digit* a __asm__ ("r0") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r2 = result (starts as all ones), r8 = constant 1, r7 = constant 0.
         * r3 = mask: all ones until the first differing word is seen, then 0
         * so that every later word pair is compared as 0 against 0. */
        "MOV	r2, #0xffffffff\n\t"
        "MOV	r8, #0x1\n\t"
        "MOV	r7, #0x0\n\t"
        "MOV	r3, #0xffffffff\n\t"
#ifdef WOLFSSL_SP_SMALL
        /* r6 = byte offset of the word being compared, starting at the top. */
        "MOV	r6, #0xfc\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_cmp_64_words:\n\t"
#else
    "L_sp_2048_cmp_64_words_%=:\n\t"
#endif
        "LDR	r4, [%[a], r6]\n\t"
        "LDR	r5, [%[b], r6]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        /* a word higher: result = 1. a word lower: result = mask (all ones).
         * Words differ: clear the mask. */
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        /* Step down one word; carry stays set until the offset passes 0. */
        "SUBS	r6, r6, #0x4\n\t"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "bcs	L_sp_2048_cmp_64_words\n\t"
#else
        "bcs	L_sp_2048_cmp_64_words_%=\n\t"
#endif
        /* All words equal: mask is still all ones, so the result becomes 0.
         * Otherwise the mask is 0 and the result is left as it is. */
        "EOR	r2, r2, r3\n\t"
#else
        /* Unrolled: the same eleven instruction step for each of the 64
         * words. a word higher: result = 1. a word lower: result = mask
         * (all ones). Words differ: clear the mask. */
        "LDR	r4, [%[a], #252]\n\t"
        "LDR	r5, [%[b], #252]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #248]\n\t"
        "LDR	r5, [%[b], #248]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #244]\n\t"
        "LDR	r5, [%[b], #244]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #240]\n\t"
        "LDR	r5, [%[b], #240]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #236]\n\t"
        "LDR	r5, [%[b], #236]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #232]\n\t"
        "LDR	r5, [%[b], #232]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #228]\n\t"
        "LDR	r5, [%[b], #228]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #224]\n\t"
        "LDR	r5, [%[b], #224]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #220]\n\t"
        "LDR	r5, [%[b], #220]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #216]\n\t"
        "LDR	r5, [%[b], #216]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #212]\n\t"
        "LDR	r5, [%[b], #212]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #208]\n\t"
        "LDR	r5, [%[b], #208]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #204]\n\t"
        "LDR	r5, [%[b], #204]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #200]\n\t"
        "LDR	r5, [%[b], #200]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #196]\n\t"
        "LDR	r5, [%[b], #196]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #192]\n\t"
        "LDR	r5, [%[b], #192]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #188]\n\t"
        "LDR	r5, [%[b], #188]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #184]\n\t"
        "LDR	r5, [%[b], #184]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #180]\n\t"
        "LDR	r5, [%[b], #180]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #176]\n\t"
        "LDR	r5, [%[b], #176]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #172]\n\t"
        "LDR	r5, [%[b], #172]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #168]\n\t"
        "LDR	r5, [%[b], #168]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #164]\n\t"
        "LDR	r5, [%[b], #164]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #160]\n\t"
        "LDR	r5, [%[b], #160]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #156]\n\t"
        "LDR	r5, [%[b], #156]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #152]\n\t"
        "LDR	r5, [%[b], #152]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #148]\n\t"
        "LDR	r5, [%[b], #148]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #144]\n\t"
        "LDR	r5, [%[b], #144]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #140]\n\t"
        "LDR	r5, [%[b], #140]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #136]\n\t"
        "LDR	r5, [%[b], #136]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #132]\n\t"
        "LDR	r5, [%[b], #132]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #128]\n\t"
        "LDR	r5, [%[b], #128]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #124]\n\t"
        "LDR	r5, [%[b], #124]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #120]\n\t"
        "LDR	r5, [%[b], #120]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #116]\n\t"
        "LDR	r5, [%[b], #116]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #112]\n\t"
        "LDR	r5, [%[b], #112]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #108]\n\t"
        "LDR	r5, [%[b], #108]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #104]\n\t"
        "LDR	r5, [%[b], #104]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #100]\n\t"
        "LDR	r5, [%[b], #100]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #96]\n\t"
        "LDR	r5, [%[b], #96]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #92]\n\t"
        "LDR	r5, [%[b], #92]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #88]\n\t"
        "LDR	r5, [%[b], #88]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #84]\n\t"
        "LDR	r5, [%[b], #84]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #80]\n\t"
        "LDR	r5, [%[b], #80]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #76]\n\t"
        "LDR	r5, [%[b], #76]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #72]\n\t"
        "LDR	r5, [%[b], #72]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #68]\n\t"
        "LDR	r5, [%[b], #68]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #64]\n\t"
        "LDR	r5, [%[b], #64]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #60]\n\t"
        "LDR	r5, [%[b], #60]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #56]\n\t"
        "LDR	r5, [%[b], #56]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #52]\n\t"
        "LDR	r5, [%[b], #52]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #48]\n\t"
        "LDR	r5, [%[b], #48]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #44]\n\t"
        "LDR	r5, [%[b], #44]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #40]\n\t"
        "LDR	r5, [%[b], #40]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #36]\n\t"
        "LDR	r5, [%[b], #36]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #32]\n\t"
        "LDR	r5, [%[b], #32]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #28]\n\t"
        "LDR	r5, [%[b], #28]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #24]\n\t"
        "LDR	r5, [%[b], #24]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #20]\n\t"
        "LDR	r5, [%[b], #20]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #16]\n\t"
        "LDR	r5, [%[b], #16]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #12]\n\t"
        "LDR	r5, [%[b], #12]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #8]\n\t"
        "LDR	r5, [%[b], #8]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #4]\n\t"
        "LDR	r5, [%[b], #4]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a]]\n\t"
        "LDR	r5, [%[b]]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        /* All words equal: mask is still all ones, so the result becomes 0.
         * Otherwise the mask is 0 and the result is left as it is. */
        "EOR	r2, r2, r3\n\t"
#endif /*WOLFSSL_SP_SMALL */
        /* The register that held a is reused to carry the result out. */
        "MOV	%[a], r2\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "cc"
    );
    /* a no longer holds a pointer - it holds the comparison result. */
    return (word32)(size_t)a;
}

/* Divide d in a and put remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * The dividend is copied into a local buffer and reduced one quotient word
 * at a time, from the most significant word down. Each quotient word is
 * estimated from the top words of the running remainder and the top word of
 * d, and the estimate is corrected with masked add-backs instead of
 * branching on the values involved.
 *
 * a  Number to be divided. 2 * 64 words are copied from it.
 * d  Number to divide with. d[63] is used as the divisor for the estimate.
 * m  Multiplier result. Unused.
 * r  Remainder from the division.
 * returns MP_OKAY indicating success.
 */
static WC_INLINE int sp_2048_div_64(const sp_digit* a, const sp_digit* d,
        sp_digit* m, sp_digit* r)
{
    /* t1: working copy of the dividend/remainder.
     * t2: d times the current quotient-word estimate; t2[64] is read below,
     *     so sp_2048_mul_d_64 is expected to fill all 65 words. */
    sp_digit t1[128], t2[65];
    sp_digit div, r1;
    int i;

    (void)m;

    div = d[63];
    /* Work on a private copy so that a is never modified. */
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 64);
    /* r1 is 1 when the top half is >= d; 0 - r1 is then an all-ones mask
     * and d is subtracted once from the top half. */
    r1 = sp_2048_cmp_64(&t1[64], d) >= 0;
    sp_2048_cond_sub_64(&t1[64], &t1[64], d, (sp_digit)0 - r1);
    for (i = 63; i >= 0; i--) {
        /* mask is all ones when the top remainder word equals div. In that
         * case hi becomes t1[64 + i] - 1 for the word divide and the
         * estimate is forced to all ones by the OR below. */
        volatile sp_digit mask = (sp_digit)0 - (t1[64 + i] == div);
        sp_digit hi = t1[64 + i] + mask;
        r1 = div_2048_word_64(hi, t1[64 + i - 1], div);
        r1 |= mask;

        /* Subtract d * r1 from the window starting at t1[i]; the value
         * returned by the subtraction and the product's top word are both
         * folded into the top word t1[64 + i]. */
        sp_2048_mul_d_64(t2, d, r1);
        t1[64 + i] += sp_2048_sub_in_place_64(&t1[i], t2);
        t1[64 + i] -= t2[64];
        /* Two correction rounds: d masked by the top word is added back.
         * NOTE(review): this presumes the top word is all ones when the
         * estimate was too large and zero otherwise - confirm against
         * sp_2048_mask_64 / sp_2048_sub_in_place_64. */
        sp_2048_mask_64(t2, d, t1[64 + i]);
        t1[64 + i] += sp_2048_add_64(&t1[i], &t1[i], t2);
        sp_2048_mask_64(t2, d, t1[64 + i]);
        t1[64 + i] += sp_2048_add_64(&t1[i], &t1[i], t2);
    }

    /* Final conditional subtraction: r = t1 - d when t1 >= d, else r = t1. */
    r1 = sp_2048_cmp_64(t1, d) >= 0;
    sp_2048_cond_sub_64(r, t1, d, (sp_digit)0 - r1);

    return MP_OKAY;
}

/* Compute the remainder of a divided by m. (r = a mod m)
 *
 * Delegates to sp_2048_div_64 and passes no quotient buffer, as only the
 * remainder is wanted.
 *
 * r  A single precision number that receives the reduced result.
 * a  A single precision number that is to be reduced.
 * m  A single precision number that is the modulus to reduce with.
 * returns the value returned by sp_2048_div_64 (MP_OKAY on success).
 */
static WC_INLINE int sp_2048_mod_64(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    int ret;

    ret = sp_2048_div_64(a, m, NULL, r);

    return ret;
}

#endif /* WOLFSSL_HAVE_SP_DH || !WOLFSSL_RSA_PUBLIC_ONLY */
#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \
                                                     defined(WOLFSSL_HAVE_SP_DH)
#ifdef WOLFSSL_SP_SMALL
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Fixed 3-bit window exponentiation in Montgomery form. A table t[0..7] of
 * powers of a is built, then the exponent is consumed from its most
 * significant bit down, 3 bits at a time: three Montgomery squarings
 * followed by one Montgomery multiplication by the selected table entry.
 * The first window is shortened to bits % 3 bits (when that is not zero) so
 * that the remaining bit count is a multiple of 3.
 *
 * r        A single precision number that is the result of the operation.
 *          Must have room for 2 * 64 words.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent (32-bit words,
 *          least significant word first).
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When not zero, a is reduced modulo m before being converted to
 *          Montgomery form. When zero, a is used as given.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_2048_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e,
        int bits, const sp_digit* m, int reduceA)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[8 * 128];
#endif
    sp_digit* t[8];
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (8 * 128), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Eight table entries of 128 words each; norm aliases t[0]. */
        norm = td;
        for (i=0; i<8; i++) {
            t[i] = td + i * 128;
        }

        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_64(norm, m);

        /* t[1] = a * 2^2048 mod m: a (optionally reduced first) is placed
         * in the upper 64 words above 64 zero words and the 128-word value
         * is reduced modulo m. */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 64U);
        if (reduceA != 0) {
            err = sp_2048_mod_64(t[1] + 64, a, m);
            if (err == MP_OKAY) {
                err = sp_2048_mod_64(t[1], t[1], m);
            }
        }
        else {
            XMEMCPY(t[1] + 64, a, sizeof(sp_digit) * 64);
            err = sp_2048_mod_64(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* t[k] = a^k in Montgomery form for k = 2..7. */
        sp_2048_mont_sqr_64(t[ 2], t[ 1], m, mp);
        sp_2048_mont_mul_64(t[ 3], t[ 2], t[ 1], m, mp);
        sp_2048_mont_sqr_64(t[ 4], t[ 2], m, mp);
        sp_2048_mont_mul_64(t[ 5], t[ 3], t[ 2], m, mp);
        sp_2048_mont_sqr_64(t[ 6], t[ 3], m, mp);
        sp_2048_mont_mul_64(t[ 7], t[ 4], t[ 3], m, mp);

        /* n holds the exponent word being consumed, aligned so the next
         * unread bit is bit 31; c counts the unread bits left in n. */
        i = (bits - 1) / 32;
        n = e[i--];
        c = bits & 31;
        if (c == 0) {
            c = 32;
        }
        c -= bits % 3;
        if (c == 32) {
            c = 29;
        }
        if (c < 0) {
            /* Number of bits in top word is less than number needed.
             * Take the missing -c bits from the top of the next 32-bit
             * word; 32 - c bits of that word then remain unread. */
            c = -c;
            y = (byte)(n << c);
            n = e[i--];
            y |= (byte)(n >> (32 - c));
            n <<= c;
            c = 32 - c;
        }
        else if (c == 0) {
            /* All bits in top word used. */
            y = (byte)n;
        }
        else {
            y = (byte)(n >> c);
            n <<= 32 - c;
        }
        XMEMCPY(r, t[y], sizeof(sp_digit) * 64);
        for (; i>=0 || c>=3; ) {
            if (c == 0) {
                /* Current word exhausted: start the next one. */
                n = e[i--];
                y = (byte)(n >> 29);
                n <<= 3;
                c = 29;
            }
            else if (c < 3) {
                /* Window straddles two words: top c bits come from the
                 * current word, the rest from the next. */
                y = (byte)(n >> 29);
                n = e[i--];
                c = 3 - c;
                y |= (byte)(n >> (32 - c));
                n <<= c;
                c = 32 - c;
            }
            else {
                y = (byte)((n >> 29) & 0x7);
                n <<= 3;
                c -= 3;
            }

            sp_2048_mont_sqr_64(r, r, m, mp);
            sp_2048_mont_sqr_64(r, r, m, mp);
            sp_2048_mont_sqr_64(r, r, m, mp);

            sp_2048_mont_mul_64(r, r, t[y], m, mp);
        }

        /* Leave Montgomery form, then subtract m once if r >= m. */
        XMEMSET(&r[64], 0, sizeof(sp_digit) * 64U);
        sp_2048_mont_reduce_64(r, m, mp);

        mask = (sp_digit)0 - (sp_2048_cmp_64(r, m) >= 0);
        sp_2048_cond_sub_64(r, r, m, mask);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#else
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Fixed 4-bit window exponentiation in Montgomery form. A table t[0..15] of
 * powers of a is built, then the exponent is consumed from its most
 * significant bit down, 4 bits at a time: four Montgomery squarings
 * followed by one Montgomery multiplication by the selected table entry.
 * The first window is shortened to bits % 4 bits (when that is not zero) so
 * that the remaining bit count is a multiple of 4.
 *
 * r        A single precision number that is the result of the operation.
 *          Must have room for 2 * 64 words.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent (32-bit words,
 *          least significant word first).
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When not zero, a is reduced modulo m before being converted to
 *          Montgomery form. When zero, a is used as given.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_2048_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e,
        int bits, const sp_digit* m, int reduceA)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[16 * 128];
#endif
    sp_digit* t[16];
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (16 * 128), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Sixteen table entries of 128 words each; norm aliases t[0]. */
        norm = td;
        for (i=0; i<16; i++) {
            t[i] = td + i * 128;
        }

        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_64(norm, m);

        /* t[1] = a * 2^2048 mod m: a (optionally reduced first) is placed
         * in the upper 64 words above 64 zero words and the 128-word value
         * is reduced modulo m. */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 64U);
        if (reduceA != 0) {
            err = sp_2048_mod_64(t[1] + 64, a, m);
            if (err == MP_OKAY) {
                err = sp_2048_mod_64(t[1], t[1], m);
            }
        }
        else {
            XMEMCPY(t[1] + 64, a, sizeof(sp_digit) * 64);
            err = sp_2048_mod_64(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* t[k] = a^k in Montgomery form for k = 2..15. */
        sp_2048_mont_sqr_64(t[ 2], t[ 1], m, mp);
        sp_2048_mont_mul_64(t[ 3], t[ 2], t[ 1], m, mp);
        sp_2048_mont_sqr_64(t[ 4], t[ 2], m, mp);
        sp_2048_mont_mul_64(t[ 5], t[ 3], t[ 2], m, mp);
        sp_2048_mont_sqr_64(t[ 6], t[ 3], m, mp);
        sp_2048_mont_mul_64(t[ 7], t[ 4], t[ 3], m, mp);
        sp_2048_mont_sqr_64(t[ 8], t[ 4], m, mp);
        sp_2048_mont_mul_64(t[ 9], t[ 5], t[ 4], m, mp);
        sp_2048_mont_sqr_64(t[10], t[ 5], m, mp);
        sp_2048_mont_mul_64(t[11], t[ 6], t[ 5], m, mp);
        sp_2048_mont_sqr_64(t[12], t[ 6], m, mp);
        sp_2048_mont_mul_64(t[13], t[ 7], t[ 6], m, mp);
        sp_2048_mont_sqr_64(t[14], t[ 7], m, mp);
        sp_2048_mont_mul_64(t[15], t[ 8], t[ 7], m, mp);

        /* n holds the exponent word being consumed, aligned so the next
         * unread bit is bit 31; c counts the unread bits left in n. */
        i = (bits - 1) / 32;
        n = e[i--];
        c = bits & 31;
        if (c == 0) {
            c = 32;
        }
        c -= bits % 4;
        if (c == 32) {
            c = 28;
        }
        if (c < 0) {
            /* Number of bits in top word is less than number needed.
             * Defensive only: for positive bits, (bits & 31) is never
             * smaller than bits % 4 because 32 is a multiple of 4. The
             * arithmetic is kept correct for 32-bit words regardless. */
            c = -c;
            y = (byte)(n << c);
            n = e[i--];
            y |= (byte)(n >> (32 - c));
            n <<= c;
            c = 32 - c;
        }
        else if (c == 0) {
            /* All bits in top word used. */
            y = (byte)n;
        }
        else {
            y = (byte)(n >> c);
            n <<= 32 - c;
        }
        XMEMCPY(r, t[y], sizeof(sp_digit) * 64);
        for (; i>=0 || c>=4; ) {
            if (c == 0) {
                /* Current word exhausted: start the next one. */
                n = e[i--];
                y = (byte)(n >> 28);
                n <<= 4;
                c = 28;
            }
            else if (c < 4) {
                /* Window straddles two words: top c bits come from the
                 * current word, the rest from the next. */
                y = (byte)(n >> 28);
                n = e[i--];
                c = 4 - c;
                y |= (byte)(n >> (32 - c));
                n <<= c;
                c = 32 - c;
            }
            else {
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c -= 4;
            }

            sp_2048_mont_sqr_64(r, r, m, mp);
            sp_2048_mont_sqr_64(r, r, m, mp);
            sp_2048_mont_sqr_64(r, r, m, mp);
            sp_2048_mont_sqr_64(r, r, m, mp);

            sp_2048_mont_mul_64(r, r, t[y], m, mp);
        }

        /* Leave Montgomery form, then subtract m once if r >= m. */
        XMEMSET(&r[64], 0, sizeof(sp_digit) * 64U);
        sp_2048_mont_reduce_64(r, m, mp);

        mask = (sp_digit)0 - (sp_2048_cmp_64(r, m) >= 0);
        sp_2048_cond_sub_64(r, r, m, mask);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#endif /* WOLFSSL_SP_SMALL */
#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */

#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */
#ifdef WOLFSSL_HAVE_SP_RSA
/* RSA public key operation. (out = in ^ em mod mm)
 *
 * The exponents 0x10001 and 0x3 have dedicated paths; any other exponent of
 * up to 32 bits is handled by a left-to-right square-and-multiply.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base.
 * em      Public exponent.
 * mm      Modulus.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 256 bytes long.
 * outLen  Number of bytes in result.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long, MP_VAL when the modulus is even, MP_EXPTMOD_E when
 * the exponent is zero and MEMORY_E when dynamic memory allocation fails.
 */
int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em,
    const mp_int* mm, byte* out, word32* outLen)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* buf = NULL;
#else
    sp_digit buf[64 * 5];
#endif
    sp_digit* base = NULL;
    sp_digit* res = NULL;
    sp_digit* mod = NULL;
    sp_digit pubExp = 0;
    int err = MP_OKAY;

    /* Argument checks, in the order the error codes are prioritised. */
    if (*outLen < 256) {
        err = MP_TO_E;
    }
    else if (mp_count_bits(em) > 32) {
        err = MP_READ_E;
    }
    else if (inLen > 256) {
        err = MP_READ_E;
    }
    else if (mp_count_bits(mm) != 2048) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mm)) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        buf = (sp_digit*)XMALLOC(sizeof(sp_digit) * 64 * 5, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (buf == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Buffer layout in words: [0,64) scratch low half, [64,128) base,
         * [128,256) result, [256,320) modulus. */
        base = buf + 64;
        res = buf + 64 * 2;
        mod = res + 64 * 2;

        sp_2048_from_bin(base, 64, in, inLen);

        /* Gather the (at most 32-bit) exponent into a single word. */
        pubExp = em->dp[0];
#if DIGIT_BIT < 32
        if (em->used > 1) {
            pubExp |= ((sp_digit)em->dp[1]) << DIGIT_BIT;
        }
#endif
        if (pubExp == 0) {
            err = MP_EXPTMOD_E;
        }
    }

    if (err == MP_OKAY) {
        sp_2048_from_mp(mod, 64, mm);

        if (pubExp == 0x10001) {
            int i;
            sp_digit mp;

            sp_2048_mont_setup(mod, &mp);

            /* Convert to Montgomery form: with the low half zeroed, buf is
             * base.R, so res = base.R mod m. */
            XMEMSET(buf, 0, sizeof(sp_digit) * 64);
            err = sp_2048_mod_64_cond(res, buf, mod);

            if (err == MP_OKAY) {
                /* res = base ^ 0x10000: sixteen squarings. */
                for (i = 0; i < 16; i++) {
                    sp_2048_mont_sqr_64(res, res, mod, mp);
                }
                /* Multiplying by the plain base also removes the
                 * Montgomery factor: (res.R * base) / R = res.base mod m.
                 */
                sp_2048_mont_mul_64(res, res, base, mod, mp);

                /* Find the most significant differing word and subtract
                 * the modulus once when res >= mod. */
                i = 63;
                while ((i > 0) && (res[i] == mod[i])) {
                    i--;
                }
                if (res[i] >= mod[i]) {
                    sp_2048_sub_in_place_64(res, mod);
                }
            }
        }
        else if (pubExp == 0x3) {
            /* res = base^2 mod m, then res = base * res mod m. */
            sp_2048_sqr_64(res, base);
            err = sp_2048_mod_64_cond(res, res, mod);
            if (err == MP_OKAY) {
                sp_2048_mul_64(res, base, res);
                err = sp_2048_mod_64_cond(res, res, mod);
            }
        }
        else {
            int i;
            sp_digit mp;

            sp_2048_mont_setup(mod, &mp);

            /* Convert to Montgomery form, in place in the scratch area. */
            XMEMSET(buf, 0, sizeof(sp_digit) * 64);
            err = sp_2048_mod_64_cond(buf, buf, mod);

            if (err == MP_OKAY) {
                /* Locate the top set bit of the exponent. */
                i = 31;
                while ((i >= 0) && ((pubExp >> i) == 0)) {
                    i--;
                }

                /* Start from base (top bit), then square-and-multiply over
                 * the remaining bits. */
                XMEMCPY(res, buf, sizeof(sp_digit) * 64);
                while (--i >= 0) {
                    sp_2048_mont_sqr_64(res, res, mod, mp);
                    if (((pubExp >> i) & 1) == 1) {
                        sp_2048_mont_mul_64(res, res, buf, mod, mp);
                    }
                }
                /* Leave Montgomery form. */
                XMEMSET(&res[64], 0, sizeof(sp_digit) * 64);
                sp_2048_mont_reduce_64(res, mod, mp);

                /* Subtract the modulus once when res >= mod. */
                i = 63;
                while ((i > 0) && (res[i] == mod[i])) {
                    i--;
                }
                if (res[i] >= mod[i]) {
                    sp_2048_sub_in_place_64(res, mod);
                }
            }
        }
    }

    if (err == MP_OKAY) {
        sp_2048_to_bin_64(res, out);
        *outLen = 256;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    XFREE(buf, NULL, DYNAMIC_TYPE_RSA);
#endif

    return err;
}

#ifndef WOLFSSL_RSA_PUBLIC_ONLY
#ifdef WOLFSSL_SP_SMALL
/* Conditionally add a and b using the mask m.
 * m is -1 to add and 0 when not.
 *
 * Loop implementation: one word per iteration over 0x80 bytes (32 words).
 * Each word of b is ANDed with m before being added, with carry, to the
 * matching word of a.
 *
 * r  A single precision number representing conditional add result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply.
 * returns the carry out of the last word added (0 or 1).
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_2048_cond_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
#else
static sp_digit sp_2048_cond_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0-r3 for the assembly below. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r5 = carry saved between iterations, r8 = constant zero,
         * r4 = byte offset into a, b and r. */
        "MOV	r5, #0x0\n\t"
        "MOV	r8, #0x0\n\t"
        "MOV	r4, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_2048_cond_add_32_words:\n\t"
#else
    "L_sp_2048_cond_add_32_words_%=:\n\t"
#endif
        /* Adding 0xffffffff to r5 sets the carry flag exactly when r5 is 1,
         * restoring the carry of the previous iteration. */
        "ADDS	r5, r5, #0xffffffff\n\t"
        "LDR	r6, [%[a], r4]\n\t"
        "LDR	r7, [%[b], r4]\n\t"
        "AND	r7, r7, %[m]\n\t"
        "ADCS	r6, r6, r7\n\t"
        /* r5 = 0 + 0 + carry: save the carry for the next iteration. */
        "ADC	r5, r8, r8\n\t"
        "STR	r6, [%[r], r4]\n\t"
        "ADD	r4, r4, #0x4\n\t"
        "CMP	r4, #0x80\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_2048_cond_add_32_words_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_2048_cond_add_32_words\n\t"
#else
        "BLT.N	L_sp_2048_cond_add_32_words_%=\n\t"
#endif
        /* The final carry is handed back through the register holding r. */
        "MOV	%[r], r5\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        : "memory", "r4", "r5", "r6", "r7", "r8", "cc"
    );
    return (word32)(size_t)r;
}

#else
/* Conditionally add a and b using the mask m.
 * m is -1 to add and 0 when not.
 *
 * Unrolled implementation: sixteen groups of two words (32 words in total).
 * In each group two words of a and two of b are loaded, the words of b are
 * ANDed with m, and the pairs are added with the carry chained through the
 * flags from one group to the next.
 *
 * r  A single precision number representing conditional add result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply.
 * returns the carry out of the last word added (0 or 1).
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_2048_cond_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
#else
static sp_digit sp_2048_cond_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0-r3 for the assembly below. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r10 = constant zero, used at the end to materialise the carry. */
        "MOV	r10, #0x0\n\t"
        /* Words 0-1: the first add uses ADDS as there is no carry in. */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADDS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 2-31: fifteen further groups, all adding with carry. */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* r = 0 + 0 + carry: the final carry is handed back through the
         * register that held the (now advanced) result pointer. */
        "ADC	%[r], r10, r10\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        /* r4 and r5 are listed as clobbered although this body does not
         * touch them. */
        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    return (word32)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
/* RSA private key operation.
 *
 * When SP_RSA_PRIVATE_EXP_D or RSA_LOW_MEM is defined the result is computed
 * directly as in ^ dm mod mm. Otherwise the CRT parameters are used: two
 * 1024-bit exponentiations modulo pm and qm are performed and recombined
 * using qim.
 *
 * All working buffers that held key material, the input or the result are
 * zeroized before returning.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base.
 * dm      Private exponent.
 * pm      First prime.
 * qm      Second prime.
 * dpm     First prime's CRT exponent.
 * dqm     Second prime's CRT exponent.
 * qim     Inverse of second prime mod p.
 * mm      Modulus.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 256 bytes long.
 * outLen  Number of bytes in result.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long, MP_VAL when the modulus (or, in the CRT path, a
 * prime) is even and MEMORY_E when dynamic memory allocation fails.
 */
int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm,
    const mp_int* pm, const mp_int* qm, const mp_int* dpm, const mp_int* dqm,
    const mp_int* qim, const mp_int* mm, byte* out, word32* outLen)
{
#if defined(SP_RSA_PRIVATE_EXP_D) || defined(RSA_LOW_MEM)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* d = NULL;
#else
    sp_digit  d[64 * 4];
#endif
    sp_digit* a = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;

    (void)pm;
    (void)qm;
    (void)dpm;
    (void)dqm;
    (void)qim;

    if (*outLen < 256U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (mp_count_bits(dm) > 2048) {
           err = MP_READ_E;
        }
        else if (inLen > 256) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 2048) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 64 * 4, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (d == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Buffer layout in words: [0,64) private exponent, [64,192) input
         * and result (r aliases a), [192,256) modulus. */
        a = d + 64;
        m = a + 128;
        r = a;

        sp_2048_from_bin(a, 64, in, inLen);
        sp_2048_from_mp(d, 64, dm);
        sp_2048_from_mp(m, 64, mm);
        err = sp_2048_mod_exp_64(r, a, d, 2048, m, 0);
    }

    if (err == MP_OKAY) {
        sp_2048_to_bin_64(r, out);
        *outLen = 256;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (d != NULL)
#endif
    {
        /* The buffer holds a copy of the private exponent (d) as well as
         * the input/result (a, r), so clear all of it rather than only the
         * first 64 words of a. */
        ForceZero(d, sizeof(sp_digit) * 64 * 4);
#ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(d, NULL, DYNAMIC_TYPE_RSA);
#endif
    }

    return err;
#else
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* a = NULL;
#else
    sp_digit a[32 * 11];
#endif
    sp_digit* p = NULL;
    sp_digit* q = NULL;
    sp_digit* dp = NULL;
    sp_digit* tmpa = NULL;
    sp_digit* tmpb = NULL;
    sp_digit* r = NULL;
    sp_digit* qi = NULL;
    sp_digit* dq = NULL;
    sp_digit c;
    int err = MP_OKAY;

    (void)dm;
    (void)mm;

    if (*outLen < 256) {
        err = MP_TO_E;
    }
    else if (inLen > 256 || mp_count_bits(mm) != 2048) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mm)) {
        err = MP_VAL;
    }
    else if (mp_iseven(pm)) {
        err = MP_VAL;
    }
    else if (mp_iseven(qm)) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 11, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (a == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Buffer layout in words: [0,128) input and result, [128,160) p,
         * [160,192) q, [192,224) one slot reused in turn for dp, dq and qi,
         * [224,288) tmpa, [288,352) tmpb. */
        p = a + 64 * 2;
        q = p + 32;
        qi = dq = dp = q + 32;
        tmpa = qi + 32;
        tmpb = tmpa + 64;
        r = a;

        sp_2048_from_bin(a, 64, in, inLen);
        sp_2048_from_mp(p, 32, pm);
        sp_2048_from_mp(q, 32, qm);
        sp_2048_from_mp(dp, 32, dpm);

        /* tmpa = in ^ dp mod p */
        err = sp_2048_mod_exp_32(tmpa, a, dp, 1024, p, 1);
    }
    if (err == MP_OKAY) {
        /* tmpb = in ^ dq mod q */
        sp_2048_from_mp(dq, 32, dqm);
        err = sp_2048_mod_exp_32(tmpb, a, dq, 1024, q, 1);
    }

    if (err == MP_OKAY) {
        /* tmpa = tmpa - tmpb, with p added back up to twice under the mask
         * c so that no branch depends on the result of the subtraction. */
        c = sp_2048_sub_in_place_32(tmpa, tmpb);
        c += sp_2048_cond_add_32(tmpa, tmpa, p, c);
        sp_2048_cond_add_32(tmpa, tmpa, p, c);

        /* tmpa = (tmpa * qi) mod p */
        sp_2048_from_mp(qi, 32, qim);
        sp_2048_mul_32(tmpa, tmpa, qi);
        err = sp_2048_mod_32(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        /* r = tmpb + q * tmpa */
        sp_2048_mul_32(tmpa, q, tmpa);
        XMEMSET(&tmpb[32], 0, sizeof(sp_digit) * 32);
        sp_2048_add_64(r, tmpb, tmpa);

        sp_2048_to_bin_64(r, out);
        *outLen = 256;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (a != NULL)
#endif
    {
        /* The whole buffer held primes, CRT exponents and intermediates. */
        ForceZero(a, sizeof(sp_digit) * 32 * 11);
    #ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(a, NULL, DYNAMIC_TYPE_RSA);
    #endif
    }
#endif /* SP_RSA_PRIVATE_EXP_D || RSA_LOW_MEM */
    return err;
}
#endif /* WOLFSSL_RSA_PUBLIC_ONLY */
#endif /* WOLFSSL_HAVE_SP_RSA */
#if defined(WOLFSSL_HAVE_SP_DH) || (defined(WOLFSSL_HAVE_SP_RSA) && \
                                              !defined(WOLFSSL_RSA_PUBLIC_ONLY))
/* Convert an array of sp_digit to an mp_int.
 *
 * The 64 32-bit words of a are repacked into DIGIT_BIT-bit digits of r.
 * Three layouts are handled: DIGIT_BIT equal to 32 (straight copy),
 * DIGIT_BIT below 32 (each word is split over several digits) and
 * DIGIT_BIT above 32 (several words are packed into each digit).
 *
 * a  A single precision integer.
 * r  A multi-precision integer.
 * returns MP_OKAY on success, otherwise the error returned by mp_grow.
 */
static int sp_2048_to_mp(const sp_digit* a, mp_int* r)
{
    int err;

    err = mp_grow(r, (2048 + DIGIT_BIT - 1) / DIGIT_BIT);
    if (err == MP_OKAY) { /*lint !e774 case where err is always MP_OKAY*/
#if DIGIT_BIT == 32
        XMEMCPY(r->dp, a, sizeof(sp_digit) * 64);
        r->used = 64;
        mp_clamp(r);
#elif DIGIT_BIT < 32
        int i;
        int j = 0;
        int s = 0;

        /* s is the bit position within the current digit (before the
         * split) or within the current word (after it). */
        r->dp[0] = 0;
        for (i = 0; i < 64; i++) {
            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 32) {
                s += DIGIT_BIT;
                r->dp[j++] &= ((sp_digit)1 << DIGIT_BIT) - 1;
                /* A shift by the full word size is not allowed, so the
                 * exhausted-word case is handled explicitly. */
                if (s == SP_WORD_SIZE) {
                    r->dp[j] = 0;
                }
                else {
                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 32 - s;
        }
        r->used = (2048 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#else
        int i;
        int j = 0;
        int s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 64; i++) {
            r->dp[j] |= ((mp_digit)a[i]) << s;
            if (s + 32 >= DIGIT_BIT) {
    #if DIGIT_BIT != 32 && DIGIT_BIT != 64
                /* DIGIT_BIT is larger than the 32-bit word here, so the
                 * mask has to be built in mp_digit, not sp_digit. */
                r->dp[j] &= ((mp_digit)1 << DIGIT_BIT) - 1;
    #endif
                s = DIGIT_BIT - s;
                /* s can be exactly 32 when the word ends on a digit
                 * boundary; shifting in mp_digit keeps that well defined
                 * and yields 0 for the new digit. */
                r->dp[++j] = ((mp_digit)a[i]) >> s;
                s = 32 - s;
            }
            else {
                s += 32;
            }
        }
        r->used = (2048 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#endif
    }

    return err;
}

/* Perform the modular exponentiation for Diffie-Hellman.
 *
 * The operands are converted to 64-word single precision numbers, the
 * exponentiation is performed with sp_2048_mod_exp_64 and the result is
 * converted back into res. The local copies of the exponent and of the
 * base/result are zeroized before returning.
 *
 * base  Base. MP integer. At most 2048 bits.
 * exp   Exponent. MP integer. At most 2048 bits.
 * mod   Modulus. MP integer. Must be exactly 2048 bits and odd.
 * res   Result. MP integer.
 * returns 0 on success, MP_READ_E if there are too many bytes in an array,
 * MP_VAL if the modulus is even and MEMORY_E if memory allocation fails.
 */
int sp_ModExp_2048(const mp_int* base, const mp_int* exp, const mp_int* mod,
    mp_int* res)
{
    int err = MP_OKAY;
    sp_digit b[128];
    sp_digit e[64];
    sp_digit m[64];
    sp_digit* r = b;
    int expBits = mp_count_bits(exp);

    if (mp_count_bits(base) > 2048) {
        err = MP_READ_E;
    }
    else if (expBits > 2048) {
        err = MP_READ_E;
    }
    else if (mp_count_bits(mod) != 2048) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mod)) {
        err = MP_VAL;
    }

    if (err == MP_OKAY) {
        sp_2048_from_mp(b, 64, base);
        sp_2048_from_mp(e, 64, exp);
        sp_2048_from_mp(m, 64, mod);

        /* The result is written over the base (r aliases b). */
        err = sp_2048_mod_exp_64(r, b, e, expBits, m, 0);
    }

    if (err == MP_OKAY) {
        err = sp_2048_to_mp(r, res);
    }

    /* A plain memset of a local that is about to go out of scope may be
     * removed by the compiler, so use ForceZero. The exponent copy and the
     * buffer that held the result are both cleared. */
    ForceZero(e, sizeof(e));
    ForceZero(b, sizeof(b));

    return err;
}

#ifdef WOLFSSL_HAVE_SP_DH

#ifdef HAVE_FFDHE_2048
/* Shift a 64 digit number left by n bits into a 65 digit result.
 * (r = a << n)
 *
 * The code is fully unrolled. Words are handled from the most significant
 * (byte offset 252 of a) down to the least significant, and each store to r
 * is issued after the load of the word of a at the same offset. The function
 * is also called with r == a (see sp_2048_mod_exp_2_64).
 *
 * r  Result. 65 digits are written (byte offsets 0 to 256).
 * a  Number to shift. 64 digits are read (byte offsets 0 to 252).
 * n  Number of bits to shift by.
 *    NOTE(review): presumably 0..31 as 31 - n is used as a shift count -
 *    confirm against callers.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_2048_lshift_64(sp_digit* r_p, const sp_digit* a_p, byte n_p)
#else
static void sp_2048_lshift_64(sp_digit* r, const sp_digit* a, byte n)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to the registers named here for the asm block. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register byte n __asm__ ("r2") = (byte)n_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r7 = 31 - n: second stage count for the right shifts below. */
        "RSB	r7, %[n], #0x1f\n\t"
        /* Top word: r6 = (a[63] >> 1) >> (31 - n) is stored as r[64] and
         * r5 = a[63] << n is kept until the bits from a[62] are merged in. */
        "LDR	r5, [%[a], #252]\n\t"
        "LSR	r6, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r6, r6, r7\n\t"
        "LDR	r4, [%[a], #248]\n\t"
        "STR	r6, [%[r], #256]\n\t"
        /* Repeating group, rotating through r4, r5 and r6:
         *   r3 = (word >> 1) >> (31 - n)   bits moving up to the next word
         *   word <<= n                     bits staying in this word
         *   OR r3 into the previously shifted higher word, load the next
         *   lower word of a and store the completed higher word to r. */
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #244]\n\t"
        "STR	r5, [%[r], #252]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #240]\n\t"
        "STR	r4, [%[r], #248]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #236]\n\t"
        "STR	r6, [%[r], #244]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #232]\n\t"
        "STR	r5, [%[r], #240]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #228]\n\t"
        "STR	r4, [%[r], #236]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #224]\n\t"
        "STR	r6, [%[r], #232]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #220]\n\t"
        "STR	r5, [%[r], #228]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #216]\n\t"
        "STR	r4, [%[r], #224]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #212]\n\t"
        "STR	r6, [%[r], #220]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #208]\n\t"
        "STR	r5, [%[r], #216]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #204]\n\t"
        "STR	r4, [%[r], #212]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #200]\n\t"
        "STR	r6, [%[r], #208]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #196]\n\t"
        "STR	r5, [%[r], #204]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #192]\n\t"
        "STR	r4, [%[r], #200]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #188]\n\t"
        "STR	r6, [%[r], #196]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #184]\n\t"
        "STR	r5, [%[r], #192]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #180]\n\t"
        "STR	r4, [%[r], #188]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #176]\n\t"
        "STR	r6, [%[r], #184]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #172]\n\t"
        "STR	r5, [%[r], #180]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #168]\n\t"
        "STR	r4, [%[r], #176]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #164]\n\t"
        "STR	r6, [%[r], #172]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #160]\n\t"
        "STR	r5, [%[r], #168]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #156]\n\t"
        "STR	r4, [%[r], #164]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #152]\n\t"
        "STR	r6, [%[r], #160]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #148]\n\t"
        "STR	r5, [%[r], #156]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #144]\n\t"
        "STR	r4, [%[r], #152]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #140]\n\t"
        "STR	r6, [%[r], #148]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #136]\n\t"
        "STR	r5, [%[r], #144]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #132]\n\t"
        "STR	r4, [%[r], #140]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #128]\n\t"
        "STR	r6, [%[r], #136]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #124]\n\t"
        "STR	r5, [%[r], #132]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #120]\n\t"
        "STR	r4, [%[r], #128]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #116]\n\t"
        "STR	r6, [%[r], #124]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #112]\n\t"
        "STR	r5, [%[r], #120]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #108]\n\t"
        "STR	r4, [%[r], #116]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #104]\n\t"
        "STR	r6, [%[r], #112]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #100]\n\t"
        "STR	r5, [%[r], #108]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #96]\n\t"
        "STR	r4, [%[r], #104]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #92]\n\t"
        "STR	r6, [%[r], #100]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #88]\n\t"
        "STR	r5, [%[r], #96]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #84]\n\t"
        "STR	r4, [%[r], #92]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #80]\n\t"
        "STR	r6, [%[r], #88]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #76]\n\t"
        "STR	r5, [%[r], #84]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #72]\n\t"
        "STR	r4, [%[r], #80]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #68]\n\t"
        "STR	r6, [%[r], #76]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #64]\n\t"
        "STR	r5, [%[r], #72]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #60]\n\t"
        "STR	r4, [%[r], #68]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #56]\n\t"
        "STR	r6, [%[r], #64]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #52]\n\t"
        "STR	r5, [%[r], #60]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #48]\n\t"
        "STR	r4, [%[r], #56]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #44]\n\t"
        "STR	r6, [%[r], #52]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #40]\n\t"
        "STR	r5, [%[r], #48]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #36]\n\t"
        "STR	r4, [%[r], #44]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #32]\n\t"
        "STR	r6, [%[r], #40]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #28]\n\t"
        "STR	r5, [%[r], #36]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #24]\n\t"
        "STR	r4, [%[r], #32]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #20]\n\t"
        "STR	r6, [%[r], #28]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #16]\n\t"
        "STR	r5, [%[r], #24]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #12]\n\t"
        "STR	r4, [%[r], #20]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #8]\n\t"
        "STR	r6, [%[r], #16]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #4]\n\t"
        "STR	r5, [%[r], #12]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a]]\n\t"
        "STR	r4, [%[r], #8]\n\t"
        /* Lowest word: nothing is merged in from below, so a[0] << n is
         * stored as r[0] together with the completed r[1]. */
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "STR	r5, [%[r]]\n\t"
        "STR	r6, [%[r], #4]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [n] "+r" (n)
        :
        : "memory", "r4", "r5", "r6", "r3", "r7", "cc"
    );
}

/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m)
 *
 * The exponent is consumed from the most significant bit down in 5-bit
 * windows. The first (possibly shorter) window seeds r by shifting the value
 * from sp_2048_mont_norm_64 left. Every following window is applied as five
 * Montgomery squarings and a left shift of r by the window value, with the
 * word shifted out of the top folded back in.
 *
 * r     A single precision number that is the result of the operation.
 *       Must have room for 128 digits: r[64] receives the shifted out word
 *       and digits 64..127 are cleared before the final reduction.
 * e     A single precision number that is the exponent.
 * bits  The number of bits in the exponent.
 * m     A single precision number that is the modulus.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is zero.
 */
static int sp_2048_mod_exp_2_64(sp_digit* r, const sp_digit* e, int bits,
        const sp_digit* m)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[193];
#endif
    sp_digit* norm = NULL;
    sp_digit* tmp = NULL;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit o;
    sp_digit mask;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 193, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* td is split into 128 digits for norm and 65 digits for tmp. */
        norm = td;
        tmp = td + 128;

        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_64(norm, m);

        /* n holds the exponent word being consumed (unused bits shifted out
         * of the top) and c the number of bits of it still to be used.
         */
        i = (bits - 1) / 32;
        n = e[i--];
        c = bits & 31;
        if (c == 0) {
            c = 32;
        }
        /* The first window takes bits % 5 bits so the rest are whole. */
        c -= bits % 5;
        if (c == 32) {
            /* Nothing left over: the first window is a full 5 bits. */
            c = 27;
        }
        if (c < 0) {
            /* Number of bits in top word is less than number needed. */
            c = -c;
            y = (byte)(n << c);
            n = e[i--];
            /* Digits are 32 bits wide: take the top c bits of the next
             * word, leaving 32 - c bits of it to be used.
             */
            y |= (byte)(n >> (32 - c));
            n <<= c;
            c = 32 - c;
        }
        else if (c == 0) {
            /* All bits in top word used. */
            y = (byte)n;
        }
        else {
            y = (byte)(n >> c);
            n <<= 32 - c;
        }
        /* r = norm << y: seed the result with the first window. */
        sp_2048_lshift_64(r, norm, y);
        for (; i>=0 || c>=5; ) {
            if (c == 0) {
                /* Current word used up: window is top 5 bits of the next. */
                n = e[i--];
                y = (byte)(n >> 27);
                n <<= 5;
                c = 27;
            }
            else if (c < 5) {
                /* Window straddles two exponent words. */
                y = (byte)(n >> 27);
                n = e[i--];
                c = 5 - c;
                y |= (byte)(n >> (32 - c));
                n <<= c;
                c = 32 - c;
            }
            else {
                y = (byte)((n >> 27) & 0x1f);
                n <<= 5;
                c -= 5;
            }

            sp_2048_mont_sqr_64(r, r, m, mp);
            sp_2048_mont_sqr_64(r, r, m, mp);
            sp_2048_mont_sqr_64(r, r, m, mp);
            sp_2048_mont_sqr_64(r, r, m, mp);
            sp_2048_mont_sqr_64(r, r, m, mp);

            /* Shift in place by the window value; the bits shifted out land
             * in r[64] and are folded back in as r[64] * norm.
             */
            sp_2048_lshift_64(r, r, y);
            sp_2048_mul_d_64(tmp, norm, r[64]);
            r[64] = 0;
            o = sp_2048_add_64(r, r, tmp);
            /* Subtract the modulus when the addition carried out. */
            sp_2048_cond_sub_64(r, r, m, (sp_digit)0 - o);
        }

        /* Clear the upper half and reduce out of Montgomery form. */
        XMEMSET(&r[64], 0, sizeof(sp_digit) * 64U);
        sp_2048_mont_reduce_64(r, m, mp);

        /* Final conditional subtraction when r >= m. */
        mask = (sp_digit)0 - (sp_2048_cmp_64(r, m) >= 0);
        sp_2048_cond_sub_64(r, r, m, mask);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#endif /* HAVE_FFDHE_2048 */

/* Diffie-Hellman modular exponentiation with a byte array exponent and a
 * byte array result. (out = base ^ exp mod mod)
 *
 * base     Base. MP integer of at most 2048 bits.
 * exp      Array of bytes that is the exponent.
 * expLen   Length of data, in bytes, in exponent. At most 256.
 * mod      Modulus. MP integer of exactly 2048 bits that must be odd.
 * out      Buffer to hold big-endian bytes of exponentiation result.
 *          Must be at least 256 bytes long.
 * outLen   Length, in bytes, of exponentiation result once leading zero
 *          bytes have been removed.
 * returns 0 on success, MP_READ_E when base or exp is too big or mod is not
 * 2048 bits, MP_VAL when mod is even, otherwise the error returned by the
 * exponentiation.
 */
int sp_DhExp_2048(const mp_int* base, const byte* exp, word32 expLen,
    const mp_int* mod, byte* out, word32* outLen)
{
    sp_digit b[128];
    sp_digit e[64];
    sp_digit m[64];
    /* The result is written over the (double width) base buffer. */
    sp_digit* r = b;
    word32 lead;
    int err;

    /* Validate the sizes first, then the parity of the modulus. */
    if ((mp_count_bits(base) > 2048) || (expLen > 256) ||
            (mp_count_bits(mod) != 2048)) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mod)) {
        err = MP_VAL;
    }
    else {
        err = MP_OKAY;
    }

    if (err == MP_OKAY) {
        sp_2048_from_mp(b, 64, base);
        sp_2048_from_bin(e, 64, exp, expLen);
        sp_2048_from_mp(m, 64, mod);

    #ifdef HAVE_FFDHE_2048
        /* Base of 2 and all ones in the top modulus word: use the
         * specialised 2^e implementation.
         */
        if ((base->used == 1) && (base->dp[0] == 2) &&
                (m[63] == (sp_digit)-1)) {
            err = sp_2048_mod_exp_2_64(r, e, expLen * 8, m);
        }
        else
    #endif
        {
            err = sp_2048_mod_exp_64(r, b, e, expLen * 8, m, 0);
        }
    }

    if (err == MP_OKAY) {
        sp_2048_to_bin_64(r, out);

        /* Count the leading zero bytes and move the rest to the front. */
        lead = 0;
        while ((lead < 256) && (out[lead] == 0)) {
            lead++;
        }
        *outLen = 256 - lead;
        XMEMMOVE(out, out + lead, *outLen);
    }

    /* Clear the local copy of the exponent on every path. */
    XMEMSET(e, 0, sizeof(e));

    return err;
}
#endif /* WOLFSSL_HAVE_SP_DH */

/* Modular exponentiation on MP integers for 1024-bit Diffie-Hellman.
 * (res = base ^ exp mod mod)
 *
 * Uses the 32 digit half of the 2048-bit implementation.
 *
 * base  Base. MP integer of at most 1024 bits.
 * exp   Exponent. MP integer of at most 1024 bits.
 * mod   Modulus. MP integer of exactly 1024 bits that must be odd.
 * res   Result. MP integer.
 * returns 0 on success, MP_READ_E when base or exp is too big or mod is not
 * 1024 bits, MP_VAL when mod is even, otherwise the error returned by the
 * exponentiation or the conversion of the result.
 */
int sp_ModExp_1024(const mp_int* base, const mp_int* exp, const mp_int* mod,
    mp_int* res)
{
    sp_digit b[64];
    sp_digit e[32];
    sp_digit m[32];
    /* The result is written over the (double width) base buffer. */
    sp_digit* r = b;
    int expBits = mp_count_bits(exp);
    int err;

    /* Validate the sizes first, then the parity of the modulus. */
    if ((mp_count_bits(base) > 1024) || (expBits > 1024) ||
            (mp_count_bits(mod) != 1024)) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mod)) {
        err = MP_VAL;
    }
    else {
        err = MP_OKAY;
    }

    if (err == MP_OKAY) {
        sp_2048_from_mp(b, 32, base);
        sp_2048_from_mp(e, 32, exp);
        sp_2048_from_mp(m, 32, mod);

        err = sp_2048_mod_exp_32(r, b, e, expBits, m, 0);
    }

    if (err == MP_OKAY) {
        /* Zero the upper 32 digits before handing r to the 2048-bit
         * converter, then trim the result to the size of the modulus.
         */
        XMEMSET(&r[32], 0, 32U * sizeof(*r));
        err = sp_2048_to_mp(r, res);
        res->used = mod->used;
        mp_clamp(res);
    }

    /* Clear the local copy of the exponent on every path. */
    XMEMSET(e, 0, sizeof(e));

    return err;
}

#endif /* WOLFSSL_HAVE_SP_DH | (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) */

#endif /* !WOLFSSL_SP_NO_2048 */

#ifndef WOLFSSL_SP_NO_3072
/* Read big endian unsigned byte array into r.
 *
 * Bytes are taken from the end of the array, four at a time, to make 32-bit
 * digits from least significant to most significant. One to three leading
 * bytes left over make a final partial digit. Digits of r not written by the
 * conversion, up to size, are set to zero.
 *
 * No check is made that the converted bytes fit in size digits; the caller
 * must ensure n is no more than 4 * size.
 *
 * r     A single precision integer.
 * size  Number of digits in r.
 * a     Byte array.
 * n     Number of bytes in array to read.
 */
static void sp_3072_from_bin(sp_digit* r, int size, const byte* a, int n)
{
    int i;
    int j = 0;

    /* Whole digits: the last four unread bytes make the next digit. */
    for (i = n - 1; i >= 3; i -= 4) {
        r[j]  = ((sp_digit)a[i - 0] <<  0) |
                ((sp_digit)a[i - 1] <<  8) |
                ((sp_digit)a[i - 2] << 16) |
                ((sp_digit)a[i - 3] << 24);
        j++;
    }

    if (i >= 0) {
        /* Leading i + 1 bytes, a[0] most significant. Built with shifts so
         * the value does not depend on the byte order of the platform.
         */
        sp_digit top = 0;
        int k;

        for (k = 0; k <= i; k++) {
            top = (top << 8) | (sp_digit)a[k];
        }
        r[j] = top;
        j++;
    }

    /* Zero the digits above the converted value. */
    for (; j < size; j++) {
        r[j] = 0;
    }
}

/* Convert an mp_int to an array of sp_digit.
 *
 * Exactly size digits of r are written: the value of a in 32-bit pieces,
 * least significant first, followed by zeros. One of three implementations
 * is compiled depending on how DIGIT_BIT (bits per mp_int digit) compares
 * with 32.
 *
 * r  A single precision integer.
 * size  Number of digits of r to fill.
 * a  A multi-precision integer.
 */
static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a)
{
#if DIGIT_BIT == 32
    /* Digits map one to one. The copy is done without a branch on a->used:
     * j starts at -used so its top bit is set exactly while i < used.
     * NOTE(review): relies on sp_digit being an unsigned 32-bit type so that
     * j >> 31 is 0 or 1 - confirm.
     */
    int i;
    sp_digit j = (sp_digit)0 - (sp_digit)a->used;
    int o = 0;

    for (i = 0; i < size; i++) {
        /* All ones while inside the used digits, zero afterwards. */
        sp_digit mask = (sp_digit)0 - (j >> 31);
        r[i] = a->dp[o] & mask;
        j++;
        /* Advance the read index only while another used digit follows. */
        o += (int)(j >> 31);
    }
#elif DIGIT_BIT > 32
    /* mp_int digits are wider than 32 bits: each one is spread over several
     * sp_digits. s is the current bit offset.
     */
    unsigned int i;
    int j = 0;
    word32 s = 0;

    r[0] = 0;
    for (i = 0; i < (unsigned int)a->used && j < size; i++) {
        /* Complete the current digit with the low bits of dp[i]. */
        r[j] |= ((sp_digit)a->dp[i] << s);
        r[j] &= 0xffffffff;
        s = 32U - s;
        if (j + 1 >= size) {
            break;
        }
        /* lint allow cast of mismatch word32 and mp_digit */
        r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
        /* Emit further 32-bit pieces while dp[i] has at least 32 more bits. */
        while ((s + 32U) <= (word32)DIGIT_BIT) {
            s += 32U;
            r[j] &= 0xffffffff;
            if (j + 1 >= size) {
                break;
            }
            if (s < (word32)DIGIT_BIT) {
                /* lint allow cast of mismatch word32 and mp_digit */
                r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
            }
            else {
                /* dp[i] fully consumed: start the next digit empty. */
                r[++j] = (sp_digit)0;
            }
        }
        /* Number of bits of dp[i] already placed in the partial r[j]. */
        s = (word32)DIGIT_BIT - s;
    }

    /* Zero the digits above the converted value. */
    for (j++; j < size; j++) {
        r[j] = 0;
    }
#else
    /* mp_int digits are narrower than 32 bits: several are packed into each
     * sp_digit. s is the number of bits already in r[j].
     */
    unsigned int i;
    int j = 0;
    int s = 0;

    r[0] = 0;
    for (i = 0; i < (unsigned int)a->used && j < size; i++) {
        r[j] |= ((sp_digit)a->dp[i]) << s;
        if (s + DIGIT_BIT >= 32) {
            /* r[j] is full: mask it and carry the rest of dp[i] over. */
            r[j] &= 0xffffffff;
            if (j + 1 >= size) {
                break;
            }
            /* Bits of dp[i] that went into the digit just completed. */
            s = 32 - s;
            if (s == DIGIT_BIT) {
                /* dp[i] fitted exactly: next digit starts empty. */
                r[++j] = 0;
                s = 0;
            }
            else {
                r[++j] = a->dp[i] >> s;
                s = DIGIT_BIT - s;
            }
        }
        else {
            s += DIGIT_BIT;
        }
    }

    /* Zero the digits above the converted value. */
    for (j++; j < size; j++) {
        r[j] = 0;
    }
#endif
}

/* Encode the 96 digits of r as a big endian byte array.
 * Always writes exactly 384 bytes.
 *
 * r  A single precision integer. 96 digits are read.
 * a  Byte array. Must have room for 384 bytes.
 */
static void sp_3072_to_bin_96(sp_digit* r, byte* a)
{
    const sp_digit* word = r + 96;
    byte* out = a;

    /* Walk from the most significant digit down, emitting each digit's
     * bytes high byte first.
     */
    while (word != r) {
        word--;
        out[0] = (byte)(*word >> 24);
        out[1] = (byte)(*word >> 16);
        out[2] = (byte)(*word >> 8);
        out[3] = (byte)(*word >> 0);
        out += 4;
    }
}

#if (defined(WOLFSSL_HAVE_SP_RSA) && (!defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(WOLFSSL_SP_SMALL))) || defined(WOLFSSL_HAVE_SP_DH)
/* Normalize the values in each word to 32 bits.
 *
 * Expands to nothing: no code is generated and the argument is not
 * evaluated.
 *
 * a  Array of sp_digit to normalize.
 */
#define sp_3072_norm_96(a)

#endif /* (WOLFSSL_HAVE_SP_RSA && (!WOLFSSL_RSA_PUBLIC_ONLY || !WOLFSSL_SP_SMALL)) || WOLFSSL_HAVE_SP_DH */
/* Normalize the values in each word to 32 bits.
 *
 * Unconditional definition with the same empty expansion as the guarded
 * definition directly above, so the macro exists in every configuration.
 *
 * a  Array of sp_digit to normalize.
 */
#define sp_3072_norm_96(a)

#ifndef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static void sp_3072_mul_12(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        "SUB	sp, sp, #0x30\n\t"
        /* A[0] * B[0] */
        "LDR	r11, [%[a]]\n\t"
        "LDR	r12, [%[b]]\n\t"
        "UMULL	r3, r4, r11, r12\n\t"
        "MOV	r5, #0x0\n\t"
        "STR	r3, [sp]\n\t"
        /* A[0] * B[1] */
        "LDR	r9, [%[b], #4]\n\t"
        "UMULL	r6, r7, r11, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "MOV	r3, #0x0\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[1] * B[0] */
        "LDR	r8, [%[a], #4]\n\t"
        "UMULL	r6, r7, r8, r12\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        "STR	r4, [sp, #4]\n\t"
        /* A[2] * B[0] */
        "LDR	r8, [%[a], #8]\n\t"
        "UMULL	r6, r7, r8, r12\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "MOV	r4, #0x0\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[1] * B[1] */
        "LDR	r11, [%[a], #4]\n\t"
        "LDR	r12, [%[b], #4]\n\t"
        "UMULL	r6, r7, r11, r12\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[0] * B[2] */
        "LDR	r8, [%[a]]\n\t"
        "LDR	r9, [%[b], #8]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        "STR	r5, [sp, #8]\n\t"
        /* A[0] * B[3] */
        "LDR	r9, [%[b], #12]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "MOV	r5, #0x0\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[1] * B[2] */
        "LDR	r9, [%[b], #8]\n\t"
        "UMULL	r6, r7, r11, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[2] * B[1] */
        "LDR	r8, [%[a], #8]\n\t"
        "UMULL	r6, r7, r8, r12\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[3] * B[0] */
        "LDR	r8, [%[a], #12]\n\t"
        "LDR	r9, [%[b]]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        "STR	r3, [sp, #12]\n\t"
        /* A[4] * B[0] */
        "LDR	r8, [%[a], #16]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "MOV	r3, #0x0\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[3] * B[1] */
        "LDR	r8, [%[a], #12]\n\t"
        "UMULL	r6, r7, r8, r12\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[2] * B[2] */
        "LDR	r11, [%[a], #8]\n\t"
        "LDR	r12, [%[b], #8]\n\t"
        "UMULL	r6, r7, r11, r12\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[1] * B[3] */
        "LDR	r8, [%[a], #4]\n\t"
        "LDR	r9, [%[b], #12]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[0] * B[4] */
        "LDR	r8, [%[a]]\n\t"
        "LDR	r9, [%[b], #16]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        "STR	r4, [sp, #16]\n\t"
        /* A[0] * B[5] */
        "LDR	r9, [%[b], #20]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "MOV	r4, #0x0\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[1] * B[4] */
        "LDR	r8, [%[a], #4]\n\t"
        "LDR	r9, [%[b], #16]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[2] * B[3] */
        "LDR	r9, [%[b], #12]\n\t"
        "UMULL	r6, r7, r11, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[3] * B[2] */
        "LDR	r8, [%[a], #12]\n\t"
        "UMULL	r6, r7, r8, r12\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[4] * B[1] */
        "LDR	r8, [%[a], #16]\n\t"
        "LDR	r9, [%[b], #4]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[5] * B[0] */
        "LDR	r8, [%[a], #20]\n\t"
        "LDR	r9, [%[b]]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        "STR	r5, [sp, #20]\n\t"
        /* A[6] * B[0] */
        "LDR	r8, [%[a], #24]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "MOV	r5, #0x0\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[5] * B[1] */
        "LDR	r8, [%[a], #20]\n\t"
        "LDR	r9, [%[b], #4]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[4] * B[2] */
        "LDR	r8, [%[a], #16]\n\t"
        "UMULL	r6, r7, r8, r12\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[3] * B[3] */
        "LDR	r11, [%[a], #12]\n\t"
        "LDR	r12, [%[b], #12]\n\t"
        "UMULL	r6, r7, r11, r12\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[2] * B[4] */
        "LDR	r8, [%[a], #8]\n\t"
        "LDR	r9, [%[b], #16]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[1] * B[5] */
        "LDR	r8, [%[a], #4]\n\t"
        "LDR	r9, [%[b], #20]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[0] * B[6] */
        "LDR	r8, [%[a]]\n\t"
        "LDR	r9, [%[b], #24]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        "STR	r3, [sp, #24]\n\t"
        /* A[0] * B[7] */
        "LDR	r9, [%[b], #28]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "MOV	r3, #0x0\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[1] * B[6] */
        "LDR	r8, [%[a], #4]\n\t"
        "LDR	r9, [%[b], #24]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[2] * B[5] */
        "LDR	r8, [%[a], #8]\n\t"
        "LDR	r9, [%[b], #20]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[3] * B[4] */
        "LDR	r9, [%[b], #16]\n\t"
        "UMULL	r6, r7, r11, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[4] * B[3] */
        "LDR	r8, [%[a], #16]\n\t"
        "UMULL	r6, r7, r8, r12\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[5] * B[2] */
        "LDR	r8, [%[a], #20]\n\t"
        "LDR	r9, [%[b], #8]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[6] * B[1] */
        "LDR	r8, [%[a], #24]\n\t"
        "LDR	r9, [%[b], #4]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[7] * B[0] */
        "LDR	r8, [%[a], #28]\n\t"
        "LDR	r9, [%[b]]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        "STR	r4, [sp, #28]\n\t"
        /* A[8] * B[0] */
        "LDR	r8, [%[a], #32]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "MOV	r4, #0x0\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[7] * B[1] */
        "LDR	r8, [%[a], #28]\n\t"
        "LDR	r9, [%[b], #4]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[6] * B[2] */
        "LDR	r8, [%[a], #24]\n\t"
        "LDR	r9, [%[b], #8]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[5] * B[3] */
        "LDR	r8, [%[a], #20]\n\t"
        "UMULL	r6, r7, r8, r12\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[4] * B[4] */
        "LDR	r11, [%[a], #16]\n\t"
        "LDR	r12, [%[b], #16]\n\t"
        "UMULL	r6, r7, r11, r12\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[3] * B[5] */
        "LDR	r8, [%[a], #12]\n\t"
        "LDR	r9, [%[b], #20]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[2] * B[6] */
        "LDR	r8, [%[a], #8]\n\t"
        "LDR	r9, [%[b], #24]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[1] * B[7] */
        "LDR	r8, [%[a], #4]\n\t"
        "LDR	r9, [%[b], #28]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[0] * B[8] */
        "LDR	r8, [%[a]]\n\t"
        "LDR	r9, [%[b], #32]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        "STR	r5, [sp, #32]\n\t"
        /* A[0] * B[9] */
        "LDR	r9, [%[b], #36]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "MOV	r5, #0x0\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[1] * B[8] */
        "LDR	r8, [%[a], #4]\n\t"
        "LDR	r9, [%[b], #32]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[2] * B[7] */
        "LDR	r8, [%[a], #8]\n\t"
        "LDR	r9, [%[b], #28]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[3] * B[6] */
        "LDR	r8, [%[a], #12]\n\t"
        "LDR	r9, [%[b], #24]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[4] * B[5] */
        "LDR	r9, [%[b], #20]\n\t"
        "UMULL	r6, r7, r11, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[5] * B[4] */
        "LDR	r8, [%[a], #20]\n\t"
        "UMULL	r6, r7, r8, r12\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[6] * B[3] */
        "LDR	r8, [%[a], #24]\n\t"
        "LDR	r9, [%[b], #12]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[7] * B[2] */
        "LDR	r8, [%[a], #28]\n\t"
        "LDR	r9, [%[b], #8]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[8] * B[1] */
        "LDR	r8, [%[a], #32]\n\t"
        "LDR	r9, [%[b], #4]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[9] * B[0] */
        "LDR	r8, [%[a], #36]\n\t"
        "LDR	r9, [%[b]]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        "STR	r3, [sp, #36]\n\t"
        /* A[10] * B[0] */
        "LDR	r8, [%[a], #40]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "MOV	r3, #0x0\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[9] * B[1] */
        "LDR	r8, [%[a], #36]\n\t"
        "LDR	r9, [%[b], #4]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[8] * B[2] */
        "LDR	r8, [%[a], #32]\n\t"
        "LDR	r9, [%[b], #8]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[7] * B[3] */
        "LDR	r8, [%[a], #28]\n\t"
        "LDR	r9, [%[b], #12]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[6] * B[4] */
        "LDR	r8, [%[a], #24]\n\t"
        "UMULL	r6, r7, r8, r12\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[5] * B[5] */
        "LDR	r11, [%[a], #20]\n\t"
        "LDR	r12, [%[b], #20]\n\t"
        "UMULL	r6, r7, r11, r12\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[4] * B[6] */
        "LDR	r8, [%[a], #16]\n\t"
        "LDR	r9, [%[b], #24]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[3] * B[7] */
        "LDR	r8, [%[a], #12]\n\t"
        "LDR	r9, [%[b], #28]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[2] * B[8] */
        "LDR	r8, [%[a], #8]\n\t"
        "LDR	r9, [%[b], #32]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[1] * B[9] */
        "LDR	r8, [%[a], #4]\n\t"
        "LDR	r9, [%[b], #36]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[0] * B[10] */
        "LDR	r8, [%[a]]\n\t"
        "LDR	r9, [%[b], #40]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        "STR	r4, [sp, #40]\n\t"
        /* A[0] * B[11] */
        "LDR	r9, [%[b], #44]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "MOV	r4, #0x0\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[1] * B[10] */
        "LDR	r8, [%[a], #4]\n\t"
        "LDR	r9, [%[b], #40]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[2] * B[9] */
        "LDR	r8, [%[a], #8]\n\t"
        "LDR	r9, [%[b], #36]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[3] * B[8] */
        "LDR	r8, [%[a], #12]\n\t"
        "LDR	r9, [%[b], #32]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[4] * B[7] */
        "LDR	r8, [%[a], #16]\n\t"
        "LDR	r9, [%[b], #28]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[5] * B[6] */
        "LDR	r9, [%[b], #24]\n\t"
        "UMULL	r6, r7, r11, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[6] * B[5] */
        "LDR	r8, [%[a], #24]\n\t"
        "UMULL	r6, r7, r8, r12\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[7] * B[4] */
        "LDR	r8, [%[a], #28]\n\t"
        "LDR	r9, [%[b], #16]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[8] * B[3] */
        "LDR	r8, [%[a], #32]\n\t"
        "LDR	r9, [%[b], #12]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[9] * B[2] */
        "LDR	r8, [%[a], #36]\n\t"
        "LDR	r9, [%[b], #8]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[10] * B[1] */
        "LDR	r8, [%[a], #40]\n\t"
        "LDR	r9, [%[b], #4]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[11] * B[0] */
        "LDR	r8, [%[a], #44]\n\t"
        "LDR	r9, [%[b]]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        "STR	r5, [sp, #44]\n\t"
        /* A[11] * B[1] */
        "LDR	r9, [%[b], #4]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "MOV	r5, #0x0\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[10] * B[2] */
        "LDR	r8, [%[a], #40]\n\t"
        "LDR	r9, [%[b], #8]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[9] * B[3] */
        "LDR	r8, [%[a], #36]\n\t"
        "LDR	r9, [%[b], #12]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[8] * B[4] */
        "LDR	r8, [%[a], #32]\n\t"
        "LDR	r9, [%[b], #16]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[7] * B[5] */
        "LDR	r8, [%[a], #28]\n\t"
        "UMULL	r6, r7, r8, r12\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[6] * B[6] */
        "LDR	r11, [%[a], #24]\n\t"
        "LDR	r12, [%[b], #24]\n\t"
        "UMULL	r6, r7, r11, r12\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[5] * B[7] */
        "LDR	r8, [%[a], #20]\n\t"
        "LDR	r9, [%[b], #28]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[4] * B[8] */
        "LDR	r8, [%[a], #16]\n\t"
        "LDR	r9, [%[b], #32]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[3] * B[9] */
        "LDR	r8, [%[a], #12]\n\t"
        "LDR	r9, [%[b], #36]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[2] * B[10] */
        "LDR	r8, [%[a], #8]\n\t"
        "LDR	r9, [%[b], #40]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[1] * B[11] */
        "LDR	r8, [%[a], #4]\n\t"
        "LDR	r9, [%[b], #44]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        "STR	r3, [%[r], #48]\n\t"
        /* A[2] * B[11] */
        "LDR	r8, [%[a], #8]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "MOV	r3, #0x0\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[3] * B[10] */
        "LDR	r8, [%[a], #12]\n\t"
        "LDR	r9, [%[b], #40]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[4] * B[9] */
        "LDR	r8, [%[a], #16]\n\t"
        "LDR	r9, [%[b], #36]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[5] * B[8] */
        "LDR	r8, [%[a], #20]\n\t"
        "LDR	r9, [%[b], #32]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[6] * B[7] */
        "LDR	r9, [%[b], #28]\n\t"
        "UMULL	r6, r7, r11, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[7] * B[6] */
        "LDR	r8, [%[a], #28]\n\t"
        "UMULL	r6, r7, r8, r12\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[8] * B[5] */
        "LDR	r8, [%[a], #32]\n\t"
        "LDR	r9, [%[b], #20]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[9] * B[4] */
        "LDR	r8, [%[a], #36]\n\t"
        "LDR	r9, [%[b], #16]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[10] * B[3] */
        "LDR	r8, [%[a], #40]\n\t"
        "LDR	r9, [%[b], #12]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[11] * B[2] */
        "LDR	r8, [%[a], #44]\n\t"
        "LDR	r9, [%[b], #8]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        "STR	r4, [%[r], #52]\n\t"
        /* A[11] * B[3] */
        "LDR	r9, [%[b], #12]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "MOV	r4, #0x0\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[10] * B[4] */
        "LDR	r8, [%[a], #40]\n\t"
        "LDR	r9, [%[b], #16]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[9] * B[5] */
        "LDR	r8, [%[a], #36]\n\t"
        "LDR	r9, [%[b], #20]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[8] * B[6] */
        "LDR	r8, [%[a], #32]\n\t"
        "UMULL	r6, r7, r8, r12\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[7] * B[7] */
        "LDR	r11, [%[a], #28]\n\t"
        "LDR	r12, [%[b], #28]\n\t"
        "UMULL	r6, r7, r11, r12\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[6] * B[8] */
        "LDR	r8, [%[a], #24]\n\t"
        "LDR	r9, [%[b], #32]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[5] * B[9] */
        "LDR	r8, [%[a], #20]\n\t"
        "LDR	r9, [%[b], #36]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[4] * B[10] */
        "LDR	r8, [%[a], #16]\n\t"
        "LDR	r9, [%[b], #40]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[3] * B[11] */
        "LDR	r8, [%[a], #12]\n\t"
        "LDR	r9, [%[b], #44]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        "STR	r5, [%[r], #56]\n\t"
        /* A[4] * B[11] */
        "LDR	r8, [%[a], #16]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "MOV	r5, #0x0\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[5] * B[10] */
        "LDR	r8, [%[a], #20]\n\t"
        "LDR	r9, [%[b], #40]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[6] * B[9] */
        "LDR	r8, [%[a], #24]\n\t"
        "LDR	r9, [%[b], #36]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[7] * B[8] */
        "LDR	r9, [%[b], #32]\n\t"
        "UMULL	r6, r7, r11, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[8] * B[7] */
        "LDR	r8, [%[a], #32]\n\t"
        "UMULL	r6, r7, r8, r12\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[9] * B[6] */
        "LDR	r8, [%[a], #36]\n\t"
        "LDR	r9, [%[b], #24]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[10] * B[5] */
        "LDR	r8, [%[a], #40]\n\t"
        "LDR	r9, [%[b], #20]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[11] * B[4] */
        "LDR	r8, [%[a], #44]\n\t"
        "LDR	r9, [%[b], #16]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        "STR	r3, [%[r], #60]\n\t"
        /* A[11] * B[5] */
        "LDR	r9, [%[b], #20]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "MOV	r3, #0x0\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[10] * B[6] */
        "LDR	r8, [%[a], #40]\n\t"
        "LDR	r9, [%[b], #24]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[9] * B[7] */
        "LDR	r8, [%[a], #36]\n\t"
        "UMULL	r6, r7, r8, r12\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[8] * B[8] */
        "LDR	r11, [%[a], #32]\n\t"
        "LDR	r12, [%[b], #32]\n\t"
        "UMULL	r6, r7, r11, r12\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[7] * B[9] */
        "LDR	r8, [%[a], #28]\n\t"
        "LDR	r9, [%[b], #36]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[6] * B[10] */
        "LDR	r8, [%[a], #24]\n\t"
        "LDR	r9, [%[b], #40]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[5] * B[11] */
        "LDR	r8, [%[a], #20]\n\t"
        "LDR	r9, [%[b], #44]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        "STR	r4, [%[r], #64]\n\t"
        /* A[6] * B[11] */
        "LDR	r8, [%[a], #24]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "MOV	r4, #0x0\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[7] * B[10] */
        "LDR	r8, [%[a], #28]\n\t"
        "LDR	r9, [%[b], #40]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[8] * B[9] */
        "LDR	r9, [%[b], #36]\n\t"
        "UMULL	r6, r7, r11, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[9] * B[8] */
        "LDR	r8, [%[a], #36]\n\t"
        "UMULL	r6, r7, r8, r12\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[10] * B[7] */
        "LDR	r8, [%[a], #40]\n\t"
        "LDR	r9, [%[b], #28]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[11] * B[6] */
        "LDR	r8, [%[a], #44]\n\t"
        "LDR	r9, [%[b], #24]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        "STR	r5, [%[r], #68]\n\t"
        /* A[11] * B[7] */
        "LDR	r9, [%[b], #28]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "MOV	r5, #0x0\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[10] * B[8] */
        "LDR	r8, [%[a], #40]\n\t"
        "UMULL	r6, r7, r8, r12\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[9] * B[9] */
        "LDR	r11, [%[a], #36]\n\t"
        "LDR	r12, [%[b], #36]\n\t"
        "UMULL	r6, r7, r11, r12\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[8] * B[10] */
        "LDR	r8, [%[a], #32]\n\t"
        "LDR	r9, [%[b], #40]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[7] * B[11] */
        "LDR	r8, [%[a], #28]\n\t"
        "LDR	r9, [%[b], #44]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        "STR	r3, [%[r], #72]\n\t"
        /* A[8] * B[11] */
        "LDR	r8, [%[a], #32]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "MOV	r3, #0x0\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[9] * B[10] */
        "LDR	r9, [%[b], #40]\n\t"
        "UMULL	r6, r7, r11, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[10] * B[9] */
        "LDR	r8, [%[a], #40]\n\t"
        "UMULL	r6, r7, r8, r12\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[11] * B[8] */
        "LDR	r8, [%[a], #44]\n\t"
        "LDR	r9, [%[b], #32]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r4, r4, r6\n\t"
        "ADCS	r5, r5, r7\n\t"
        "ADC	r3, r3, #0x0\n\t"
        "STR	r4, [%[r], #76]\n\t"
        /* A[11] * B[9] */
        "UMULL	r6, r7, r8, r12\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "MOV	r4, #0x0\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[10] * B[10] */
        "LDR	r11, [%[a], #40]\n\t"
        "LDR	r12, [%[b], #40]\n\t"
        "UMULL	r6, r7, r11, r12\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[9] * B[11] */
        "LDR	r8, [%[a], #36]\n\t"
        "LDR	r9, [%[b], #44]\n\t"
        "UMULL	r6, r7, r8, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADC	r4, r4, #0x0\n\t"
        "STR	r5, [%[r], #80]\n\t"
        /* A[10] * B[11] */
        "UMULL	r6, r7, r11, r9\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "MOV	r5, #0x0\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* A[11] * B[10] */
        "LDR	r8, [%[a], #44]\n\t"
        "UMULL	r6, r7, r8, r12\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        "STR	r3, [%[r], #84]\n\t"
        /* A[11] * B[11] */
        "UMLAL	r4, r5, r8, r9\n\t"
        "STR	r4, [%[r], #88]\n\t"
        "STR	r5, [%[r], #92]\n\t"
        "LDM	sp!, {r3, r4, r5, r6}\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	sp!, {r3, r4, r5, r6}\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	sp!, {r3, r4, r5, r6}\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11", "r12", "cc"
    );
}

/* Add b to a into r. (r = a + b)
 *
 * Adds two 12-word numbers, four words per step, with the carry flag chained
 * through all 12 word additions.
 *
 * r  A single precision integer. Receives the 12-word sum.
 * a  A single precision integer. First operand; 12 words are read.
 * b  A single precision integer. Second operand; 12 words are read.
 *
 * returns the carry out of the most significant word: 0 or 1.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_3072_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_3072_add_12(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0, r1 and r2; the asm uses r3-r10 as scratch. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Words 0-3: ADDS starts the carry chain, ADCS continues it. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADDS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 4-7: the loads and store leave the carry flag untouched. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 8-11 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Turn the final carry flag into 0 or 1 in the result register.
         * MOV has no S suffix so that the carry survives until the ADC. */
        "MOV	%[r], #0x0\n\t"
        "ADC	%[r], %[r], #0x0\n\t"
        /* All three pointers are modified by the asm (write-back, and r is
         * reused for the carry), so they are read-write operands. */
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    /* r no longer points at the result; it now holds the carry value. */
    return (word32)(size_t)r;
}

/* Sub b from a into a. (a -= b)
 *
 * Subtracts a 24-word number from another in place, four words per step,
 * with the borrow chained through all 24 word subtractions.
 *
 * a  A single precision integer and result. 24 words are read and rewritten.
 * b  A single precision integer. 24 words are read.
 *
 * returns 0 when the subtraction did not borrow out of the most significant
 *         word, otherwise the all-ones word produced by the final SBC.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_3072_sub_in_place_24(sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_3072_sub_in_place_24(sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0 and r1; the asm uses r2-r9 as scratch. */
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Words 0-3: a is loaded without write-back because the store below
         * writes the same four words and advances the pointer. SUBS starts
         * the borrow chain. */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SUBS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 4-7: SBCS continues the borrow chain from here on. */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 8-11 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 12-15 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 16-19 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 20-23 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Subtracting a register from itself with borrow leaves only the
         * borrow: 0 when there was none, all ones when there was one. */
        "SBC	%[a], r9, r9\n\t"
        /* Both pointers are modified by the asm (write-back, and a is reused
         * for the borrow), so they are read-write operands. */
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
    );
    /* a no longer points at the number; it now holds the borrow value. */
    return (word32)(size_t)a;
}

/* Add b to a into r. (r = a + b)
 *
 * Adds two 24-word numbers, four words per step, with the carry flag chained
 * through all 24 word additions.
 *
 * r  A single precision integer. Receives the 24-word sum.
 * a  A single precision integer. First operand; 24 words are read.
 * b  A single precision integer. Second operand; 24 words are read.
 *
 * returns the carry out of the most significant word: 0 or 1.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_3072_add_24(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_3072_add_24(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0, r1 and r2; the asm uses r3-r10 as scratch. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Words 0-3: ADDS starts the carry chain, ADCS continues it. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADDS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 4-7: the loads and store leave the carry flag untouched. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 8-11 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 12-15 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 16-19 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 20-23 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Turn the final carry flag into 0 or 1 in the result register.
         * MOV has no S suffix so that the carry survives until the ADC. */
        "MOV	%[r], #0x0\n\t"
        "ADC	%[r], %[r], #0x0\n\t"
        /* All three pointers are modified by the asm (write-back, and r is
         * reused for the carry), so they are read-write operands. */
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    /* r no longer points at the result; it now holds the carry value. */
    return (word32)(size_t)r;
}

/* Apply a mask to every digit of a 12-digit number. (r[i] = a[i] & m)
 *
 * Each output digit depends only on the input digit at the same index, so r
 * and a may be the same array.
 *
 * r  A single precision integer. Receives the 12 masked digits.
 * a  A single precision integer. 12 digits are read.
 * m  Mask to AND against each digit.
 */
static void sp_3072_mask_12(sp_digit* r, const sp_digit* a, sp_digit m)
{
#ifdef WOLFSSL_SP_SMALL
    int idx = 0;

    /* Compact form: a single loop over the 12 digits. */
    while (idx < 12) {
        r[idx] = m & a[idx];
        idx++;
    }
#else
    /* Unrolled form: one statement per digit, lowest digit first. */
    r[ 0] = m & a[ 0];
    r[ 1] = m & a[ 1];
    r[ 2] = m & a[ 2];
    r[ 3] = m & a[ 3];
    r[ 4] = m & a[ 4];
    r[ 5] = m & a[ 5];
    r[ 6] = m & a[ 6];
    r[ 7] = m & a[ 7];
    r[ 8] = m & a[ 8];
    r[ 9] = m & a[ 9];
    r[10] = m & a[10];
    r[11] = m & a[11];
#endif
}

/* Multiply a and b into r. (r = a * b)
 *
 * One level of Karatsuba: the 24-digit operands are split into 12-digit
 * halves and three 12x12 products are combined:
 *   low  = a_lo * b_lo
 *   high = a_hi * b_hi
 *   mid  = (a_lo + a_hi) * (b_lo + b_hi) - low - high
 *   r    = low + (mid << 12 digits) + (high << 24 digits)
 * The half sums can carry out of 12 digits; those carries are folded back
 * in with masked additions instead of branches.
 *
 * r  A single precision integer. Receives the 48-digit product.
 * a  A single precision integer. 24 digits are read.
 * b  A single precision integer. 24 digits are read.
 */
SP_NOINLINE static void sp_3072_mul_24(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit  mid[24];
    sp_digit  sum_a[12];
    sp_digit  sum_b[12];
    sp_digit* low = r;
    sp_digit* high = r + 24;
    sp_digit  carry_a;
    sp_digit  carry_b;
    sp_digit  top;

    /* Half sums are taken before anything is written to r. */
    carry_a = sp_3072_add_12(sum_a, a, a + 12);
    carry_b = sp_3072_add_12(sum_b, b, b + 12);
    /* Both sums carried: the product of the two carry bits lands in top. */
    top = carry_a & carry_b;

    /* The high product is written before the low one; keep this order. */
    sp_3072_mul_12(high, a + 12, b + 12);
    sp_3072_mul_12(low, a, b);
    sp_3072_mul_12(mid, sum_a, sum_b);

    /* A borrow comes back as all ones, so adding it decrements top. */
    top += sp_3072_sub_in_place_24(mid, low);
    top += sp_3072_sub_in_place_24(mid, high);
    /* Cross terms of the carries: add sum_a when b's sum carried, and
     * sum_b when a's sum carried. The mask is all ones or zero. */
    sp_3072_mask_12(sum_a, sum_a, 0 - carry_b);
    top += sp_3072_add_12(mid + 12, mid + 12, sum_a);
    sp_3072_mask_12(sum_b, sum_b, 0 - carry_a);
    top += sp_3072_add_12(mid + 12, mid + 12, sum_b);

    /* Fold the middle term into the result, 12 digits up. */
    top += sp_3072_add_24(r + 12, r + 12, mid);
    /* Reuse sum_a as a 12-digit number holding only top, then propagate it
     * through the uppermost 12 digits of the result. */
    sum_a[0] = top;
    XMEMSET(sum_a + 1, 0, sizeof(sp_digit) * (12 - 1));
    (void)sp_3072_add_12(r + 36, r + 36, sum_a);
}

/* Sub b from a into a. (a -= b)
 *
 * Subtracts the 48-word number b from the 48-word number a in place. Words
 * are processed four at a time and the borrow is propagated from one group
 * to the next through the processor's carry flag.
 *
 * a  A single precision integer and result.
 * b  A single precision integer.
 * return  0 when there is no borrow out of the top word, otherwise a value
 *         with all bits set (produced by the final SBC instruction).
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_3072_sub_in_place_48(sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_3072_sub_in_place_48(sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the operands to r0 and r1; the assembly below uses r2-r9 as
     * scratch registers. */
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Words 0-3: SUBS starts the borrow chain. a is loaded without
         * write-back and advanced by the STM; b is advanced by its LDM. */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SUBS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 4-7: every later group continues the chain with SBCS. */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 8-11 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 12-15 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 16-19 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 20-23 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 24-27 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 28-31 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 32-35 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 36-39 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 40-43 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 44-47 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* r9 - r9 - borrow: leaves 0 (no borrow) or all ones (borrow) in the
         * register that held the pointer a. */
        "SBC	%[a], r9, r9\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
    );
    /* a no longer holds a pointer; it carries the borrow result from above. */
    return (word32)(size_t)a;
}

/* Add b to a into r. (r = a + b)
 *
 * Adds two 48-word numbers, four words at a time, propagating the carry from
 * one group to the next through the processor's carry flag. Each group loads
 * its inputs before storing its result, and callers in this file pass the
 * same buffer for r and a.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * return  Carry out of the top word: 0 or 1.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_3072_add_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the operands to r0-r2; the assembly below uses r3-r10 as scratch
     * registers. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Words 0-3: ADDS starts the carry chain. All three pointers are
         * advanced by the write-back forms of LDM/STM. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADDS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 4-7: every later group continues the chain with ADCS. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 8-11 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 12-15 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 16-19 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 20-23 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 24-27 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 28-31 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 32-35 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 36-39 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 40-43 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 44-47 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Turn the final carry flag into 0 or 1 in the register that held
         * the pointer r. */
        "MOV	%[r], #0x0\n\t"
        "ADC	%[r], %[r], #0x0\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    /* r no longer holds a pointer; it carries the carry out from above. */
    return (word32)(size_t)r;
}

/* Bitwise AND every one of the 24 digits of a with m, writing to r.
 *
 * r  Destination, 24 digits.
 * a  Source, 24 digits.
 * m  Mask applied to each digit.
 */
static void sp_3072_mask_24(sp_digit* r, const sp_digit* a, sp_digit m)
{
    const sp_digit* src = a;
    sp_digit* dst = r;
    int left;

#ifdef WOLFSSL_SP_SMALL
    /* Compact build: one digit per pass. */
    for (left = 24; left > 0; left--) {
        *dst = *src & m;
        dst++;
        src++;
    }
#else
    /* Eight digits per pass; three passes cover all 24 digits. */
    for (left = 24; left > 0; left -= 8) {
        dst[0] = src[0] & m;
        dst[1] = src[1] & m;
        dst[2] = src[2] & m;
        dst[3] = src[3] & m;
        dst[4] = src[4] & m;
        dst[5] = src[5] & m;
        dst[6] = src[6] & m;
        dst[7] = src[7] & m;
        dst += 8;
        src += 8;
    }
#endif
}

/* Karatsuba multiplication of two 48-digit numbers. (r = a * b)
 *
 * Each operand is split into a low and a high half of 24 digits. Three
 * 24-digit products (low halves, high halves, half sums) are combined into
 * the 96-digit result.
 *
 * r  Result buffer; digits 0..95 are written.
 * a  First operand, 48 digits.
 * b  Second operand, 48 digits.
 */
SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit  mid[48];
    sp_digit  sum_a[24];
    sp_digit  sum_b[24];
    sp_digit* lo = &r[0];
    sp_digit* hi = &r[48];
    sp_digit  carry_a;
    sp_digit  carry_b;
    sp_digit  top;

    /* Half sums: sum_a = a_lo + a_hi, sum_b = b_lo + b_hi. The value each
     * add returns is kept for the correction steps below. */
    carry_a = sp_3072_add_24(sum_a, &a[0], &a[24]);
    carry_b = sp_3072_add_24(sum_b, &b[0], &b[24]);
    top = carry_a & carry_b;

    /* Three half-size products: high halves into the top of r, low halves
     * into the bottom of r, half sums into the temporary. */
    sp_3072_mul_24(hi, &a[24], &b[24]);
    sp_3072_mul_24(lo, &a[0], &b[0]);
    sp_3072_mul_24(mid, sum_a, sum_b);

    /* mid -= lo; mid -= hi. sp_3072_sub_in_place_48 returns 0 or an
     * all-ones word on borrow, which is folded into the running top word. */
    top += sp_3072_sub_in_place_48(mid, lo);
    top += sp_3072_sub_in_place_48(mid, hi);

    /* Cross terms for the bits that did not fit in the 24-digit half sums:
     * sum_a is added only when carry_b is set, sum_b only when carry_a is
     * set. Masking with (0 - carry) avoids a branch. */
    sp_3072_mask_24(sum_a, sum_a, 0 - carry_b);
    top += sp_3072_add_24(&mid[24], &mid[24], sum_a);
    sp_3072_mask_24(sum_b, sum_b, 0 - carry_a);
    top += sp_3072_add_24(&mid[24], &mid[24], sum_b);

    /* Add the middle term into the result at an offset of 24 digits. */
    top += sp_3072_add_48(&r[24], &r[24], mid);

    /* Reuse sum_a as a 24-digit number whose only non-zero digit is top and
     * add it into the uppermost 24 digits of the result. */
    XMEMSET(&sum_a[1], 0, sizeof(sp_digit) * (24 - 1));
    sum_a[0] = top;
    (void)sp_3072_add_24(&r[72], &r[72], sum_a);
}

/* Sub b from a into a. (a -= b)
 *
 * Subtracts the 96-word number b from the 96-word number a in place. Words
 * are processed four at a time and the borrow is propagated from one group
 * to the next through the processor's carry flag.
 *
 * a  A single precision integer and result.
 * b  A single precision integer.
 * return  0 when there is no borrow out of the top word, otherwise a value
 *         with all bits set (produced by the final SBC instruction).
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_3072_sub_in_place_96(sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_3072_sub_in_place_96(sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the operands to r0 and r1; the assembly below uses r2-r9 as
     * scratch registers. */
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Words 0-3: SUBS starts the borrow chain. a is loaded without
         * write-back and advanced by the STM; b is advanced by its LDM. */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SUBS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 4-7: every later group continues the chain with SBCS. */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 8-11 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 12-15 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 16-19 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 20-23 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 24-27 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 28-31 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 32-35 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 36-39 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 40-43 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 44-47 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 48-51 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 52-55 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 56-59 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 60-63 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 64-67 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 68-71 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 72-75 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 76-79 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 80-83 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 84-87 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 88-91 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 92-95 */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* r9 - r9 - borrow: leaves 0 (no borrow) or all ones (borrow) in the
         * register that held the pointer a. */
        "SBC	%[a], r9, r9\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
    );
    /* a no longer holds a pointer; it carries the borrow result from above. */
    return (word32)(size_t)a;
}

/* Add b to a into r. (r = a + b)
 *
 * Adds two 96-word numbers, four words at a time, propagating the carry from
 * one group to the next through the processor's carry flag. Each group loads
 * its inputs before storing its result, and the caller in this file passes
 * the same buffer for r and a.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * return  Carry out of the top word: 0 or 1.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_3072_add_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the operands to r0-r2; the assembly below uses r3-r10 as scratch
     * registers. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Words 0-3: ADDS starts the carry chain. All three pointers are
         * advanced by the write-back forms of LDM/STM. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADDS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 4-7: every later group continues the chain with ADCS. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 8-11 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 12-15 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 16-19 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 20-23 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 24-27 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 28-31 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 32-35 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 36-39 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 40-43 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 44-47 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 48-51 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 52-55 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 56-59 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 60-63 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 64-67 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 68-71 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 72-75 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 76-79 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 80-83 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 84-87 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 88-91 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 92-95 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Turn the final carry flag into 0 or 1 in the register that held
         * the pointer r. */
        "MOV	%[r], #0x0\n\t"
        "ADC	%[r], %[r], #0x0\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    /* r no longer holds a pointer; it carries the carry out from above. */
    return (word32)(size_t)r;
}

/* Bitwise AND every one of the 48 digits of a with m, writing to r.
 *
 * r  Destination, 48 digits.
 * a  Source, 48 digits.
 * m  Mask applied to each digit.
 */
static void sp_3072_mask_48(sp_digit* r, const sp_digit* a, sp_digit m)
{
    const sp_digit* src = a;
    sp_digit* dst = r;
    int left;

#ifdef WOLFSSL_SP_SMALL
    /* Compact build: one digit per pass. */
    for (left = 48; left > 0; left--) {
        *dst = *src & m;
        dst++;
        src++;
    }
#else
    /* Eight digits per pass; six passes cover all 48 digits. */
    for (left = 48; left > 0; left -= 8) {
        dst[0] = src[0] & m;
        dst[1] = src[1] & m;
        dst[2] = src[2] & m;
        dst[3] = src[3] & m;
        dst[4] = src[4] & m;
        dst[5] = src[5] & m;
        dst[6] = src[6] & m;
        dst[7] = src[7] & m;
        dst += 8;
        src += 8;
    }
#endif
}

/* Karatsuba multiplication of two 96-digit numbers. (r = a * b)
 *
 * Each operand is split into a low and a high half of 48 digits. Three
 * 48-digit products (low halves, high halves, half sums) are combined into
 * the 192-digit result.
 *
 * r  Result buffer; digits 0..191 are written.
 * a  First operand, 96 digits.
 * b  Second operand, 96 digits.
 */
SP_NOINLINE static void sp_3072_mul_96(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit  mid[96];
    sp_digit  sum_a[48];
    sp_digit  sum_b[48];
    sp_digit* lo = &r[0];
    sp_digit* hi = &r[96];
    sp_digit  carry_a;
    sp_digit  carry_b;
    sp_digit  top;

    /* Half sums: sum_a = a_lo + a_hi, sum_b = b_lo + b_hi. sp_3072_add_48
     * returns the carry out (0 or 1), kept for the correction steps below. */
    carry_a = sp_3072_add_48(sum_a, &a[0], &a[48]);
    carry_b = sp_3072_add_48(sum_b, &b[0], &b[48]);
    top = carry_a & carry_b;

    /* Three half-size products: high halves into the top of r, low halves
     * into the bottom of r, half sums into the temporary. */
    sp_3072_mul_48(hi, &a[48], &b[48]);
    sp_3072_mul_48(lo, &a[0], &b[0]);
    sp_3072_mul_48(mid, sum_a, sum_b);

    /* mid -= lo; mid -= hi. sp_3072_sub_in_place_96 returns 0 or an
     * all-ones word on borrow, which is folded into the running top word. */
    top += sp_3072_sub_in_place_96(mid, lo);
    top += sp_3072_sub_in_place_96(mid, hi);

    /* Cross terms for the bits that did not fit in the 48-digit half sums:
     * sum_a is added only when carry_b is set, sum_b only when carry_a is
     * set. Masking with (0 - carry) avoids a branch. */
    sp_3072_mask_48(sum_a, sum_a, 0 - carry_b);
    top += sp_3072_add_48(&mid[48], &mid[48], sum_a);
    sp_3072_mask_48(sum_b, sum_b, 0 - carry_a);
    top += sp_3072_add_48(&mid[48], &mid[48], sum_b);

    /* Add the middle term into the result at an offset of 48 digits. */
    top += sp_3072_add_96(&r[48], &r[48], mid);

    /* Reuse sum_a as a 48-digit number whose only non-zero digit is top and
     * add it into the uppermost 48 digits of the result. */
    XMEMSET(&sum_a[1], 0, sizeof(sp_digit) * (48 - 1));
    sum_a[0] = top;
    (void)sp_3072_add_48(&r[144], &r[144], sum_a);
}

/* Square a and put result in r. (r = a * a)
 *
 * r  A single precision integer.
 * a  A single precision integer.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p)
#else
static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        "SUB	sp, sp, #0x30\n\t"
        /* A[0] * A[0] */
        "LDR	r10, [%[a]]\n\t"
        "UMULL	r8, r3, r10, r10\n\t"
        "MOV	r4, #0x0\n\t"
        "STR	r8, [sp]\n\t"
        /* A[0] * A[1] */
        "LDR	r10, [%[a], #4]\n\t"
        "LDR	r12, [%[a]]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r3, r3, r8\n\t"
        "ADCS	r4, r4, r9\n\t"
        "MOV	r2, #0x0\n\t"
        "ADC	r2, r2, #0x0\n\t"
        "ADDS	r3, r3, r8\n\t"
        "ADCS	r4, r4, r9\n\t"
        "MOV	r2, #0x0\n\t"
        "ADC	r2, r2, #0x0\n\t"
        "STR	r3, [sp, #4]\n\t"
        /* A[0] * A[2] */
        "LDR	r10, [%[a], #8]\n\t"
        "LDR	r12, [%[a]]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r4, r4, r8\n\t"
        "ADCS	r2, r2, r9\n\t"
        "MOV	r3, #0x0\n\t"
        "ADC	r3, r3, #0x0\n\t"
        "ADDS	r4, r4, r8\n\t"
        "ADCS	r2, r2, r9\n\t"
        "MOV	r3, #0x0\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[1] * A[1] */
        "LDR	r10, [%[a], #4]\n\t"
        "UMULL	r8, r9, r10, r10\n\t"
        "ADDS	r4, r4, r8\n\t"
        "ADCS	r2, r2, r9\n\t"
        "ADC	r3, r3, #0x0\n\t"
        "STR	r4, [sp, #8]\n\t"
        /* A[0] * A[3] */
        "LDR	r10, [%[a], #12]\n\t"
        "LDR	r12, [%[a]]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r2, r2, r8\n\t"
        "ADCS	r3, r3, r9\n\t"
        "MOV	r4, #0x0\n\t"
        "ADC	r4, r4, #0x0\n\t"
        "ADDS	r2, r2, r8\n\t"
        "ADCS	r3, r3, r9\n\t"
        "MOV	r4, #0x0\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[1] * A[2] */
        "LDR	r10, [%[a], #8]\n\t"
        "LDR	r12, [%[a], #4]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r2, r2, r8\n\t"
        "ADCS	r3, r3, r9\n\t"
        "ADC	r4, r4, #0x0\n\t"
        "ADDS	r2, r2, r8\n\t"
        "ADCS	r3, r3, r9\n\t"
        "ADC	r4, r4, #0x0\n\t"
        "STR	r2, [sp, #12]\n\t"
        /* A[0] * A[4] */
        "LDR	r10, [%[a], #16]\n\t"
        "LDR	r12, [%[a]]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r3, r3, r8\n\t"
        "ADCS	r4, r4, r9\n\t"
        "MOV	r2, #0x0\n\t"
        "ADC	r2, r2, #0x0\n\t"
        "ADDS	r3, r3, r8\n\t"
        "ADCS	r4, r4, r9\n\t"
        "MOV	r2, #0x0\n\t"
        "ADC	r2, r2, #0x0\n\t"
        /* A[1] * A[3] */
        "LDR	r10, [%[a], #12]\n\t"
        "LDR	r12, [%[a], #4]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r3, r3, r8\n\t"
        "ADCS	r4, r4, r9\n\t"
        "ADC	r2, r2, #0x0\n\t"
        "ADDS	r3, r3, r8\n\t"
        "ADCS	r4, r4, r9\n\t"
        "ADC	r2, r2, #0x0\n\t"
        /* A[2] * A[2] */
        "LDR	r10, [%[a], #8]\n\t"
        "UMULL	r8, r9, r10, r10\n\t"
        "ADDS	r3, r3, r8\n\t"
        "ADCS	r4, r4, r9\n\t"
        "ADC	r2, r2, #0x0\n\t"
        "STR	r3, [sp, #16]\n\t"
        /* A[0] * A[5] */
        "LDR	r10, [%[a], #20]\n\t"
        "LDR	r12, [%[a]]\n\t"
        "UMULL	r5, r6, r10, r12\n\t"
        "MOV	r3, #0x0\n\t"
        "MOV	r7, #0x0\n\t"
        /* A[1] * A[4] */
        "LDR	r10, [%[a], #16]\n\t"
        "LDR	r12, [%[a], #4]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[2] * A[3] */
        "LDR	r10, [%[a], #12]\n\t"
        "LDR	r12, [%[a], #8]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        "ADDS	r5, r5, r5\n\t"
        "ADCS	r6, r6, r6\n\t"
        "ADC	r7, r7, r7\n\t"
        "ADDS	r4, r4, r5\n\t"
        "ADCS	r2, r2, r6\n\t"
        "ADC	r3, r3, r7\n\t"
        "STR	r4, [sp, #20]\n\t"
        /* A[0] * A[6] */
        "LDR	r10, [%[a], #24]\n\t"
        "LDR	r12, [%[a]]\n\t"
        "UMULL	r5, r6, r10, r12\n\t"
        "MOV	r4, #0x0\n\t"
        "MOV	r7, #0x0\n\t"
        /* A[1] * A[5] */
        "LDR	r10, [%[a], #20]\n\t"
        "LDR	r12, [%[a], #4]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[2] * A[4] */
        "LDR	r10, [%[a], #16]\n\t"
        "LDR	r12, [%[a], #8]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[3] * A[3] */
        "LDR	r10, [%[a], #12]\n\t"
        "UMULL	r8, r9, r10, r10\n\t"
        "ADDS	r5, r5, r5\n\t"
        "ADCS	r6, r6, r6\n\t"
        "ADC	r7, r7, r7\n\t"
        "ADDS	r2, r2, r8\n\t"
        "ADCS	r3, r3, r9\n\t"
        "ADC	r4, r4, #0x0\n\t"
        "ADDS	r2, r2, r5\n\t"
        "ADCS	r3, r3, r6\n\t"
        "ADC	r4, r4, r7\n\t"
        "STR	r2, [sp, #24]\n\t"
        /* A[0] * A[7] */
        "LDR	r10, [%[a], #28]\n\t"
        "LDR	r12, [%[a]]\n\t"
        "UMULL	r5, r6, r10, r12\n\t"
        "MOV	r2, #0x0\n\t"
        "MOV	r7, #0x0\n\t"
        /* A[1] * A[6] */
        "LDR	r10, [%[a], #24]\n\t"
        "LDR	r12, [%[a], #4]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[2] * A[5] */
        "LDR	r10, [%[a], #20]\n\t"
        "LDR	r12, [%[a], #8]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[3] * A[4] */
        "LDR	r10, [%[a], #16]\n\t"
        "LDR	r12, [%[a], #12]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        "ADDS	r5, r5, r5\n\t"
        "ADCS	r6, r6, r6\n\t"
        "ADC	r7, r7, r7\n\t"
        "ADDS	r3, r3, r5\n\t"
        "ADCS	r4, r4, r6\n\t"
        "ADC	r2, r2, r7\n\t"
        "STR	r3, [sp, #28]\n\t"
        /* A[0] * A[8] */
        "LDR	r10, [%[a], #32]\n\t"
        "LDR	r12, [%[a]]\n\t"
        "UMULL	r5, r6, r10, r12\n\t"
        "MOV	r3, #0x0\n\t"
        "MOV	r7, #0x0\n\t"
        /* A[1] * A[7] */
        "LDR	r10, [%[a], #28]\n\t"
        "LDR	r12, [%[a], #4]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[2] * A[6] */
        "LDR	r10, [%[a], #24]\n\t"
        "LDR	r12, [%[a], #8]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[3] * A[5] */
        "LDR	r10, [%[a], #20]\n\t"
        "LDR	r12, [%[a], #12]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[4] * A[4] */
        "LDR	r10, [%[a], #16]\n\t"
        "UMULL	r8, r9, r10, r10\n\t"
        "ADDS	r5, r5, r5\n\t"
        "ADCS	r6, r6, r6\n\t"
        "ADC	r7, r7, r7\n\t"
        "ADDS	r4, r4, r8\n\t"
        "ADCS	r2, r2, r9\n\t"
        "ADC	r3, r3, #0x0\n\t"
        "ADDS	r4, r4, r5\n\t"
        "ADCS	r2, r2, r6\n\t"
        "ADC	r3, r3, r7\n\t"
        "STR	r4, [sp, #32]\n\t"
        /* A[0] * A[9] */
        "LDR	r10, [%[a], #36]\n\t"
        "LDR	r12, [%[a]]\n\t"
        "UMULL	r5, r6, r10, r12\n\t"
        "MOV	r4, #0x0\n\t"
        "MOV	r7, #0x0\n\t"
        /* A[1] * A[8] */
        "LDR	r10, [%[a], #32]\n\t"
        "LDR	r12, [%[a], #4]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[2] * A[7] */
        "LDR	r10, [%[a], #28]\n\t"
        "LDR	r12, [%[a], #8]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[3] * A[6] */
        "LDR	r10, [%[a], #24]\n\t"
        "LDR	r12, [%[a], #12]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[4] * A[5] */
        "LDR	r10, [%[a], #20]\n\t"
        "LDR	r12, [%[a], #16]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        "ADDS	r5, r5, r5\n\t"
        "ADCS	r6, r6, r6\n\t"
        "ADC	r7, r7, r7\n\t"
        "ADDS	r2, r2, r5\n\t"
        "ADCS	r3, r3, r6\n\t"
        "ADC	r4, r4, r7\n\t"
        "STR	r2, [sp, #36]\n\t"
        /* A[0] * A[10] */
        "LDR	r10, [%[a], #40]\n\t"
        "LDR	r12, [%[a]]\n\t"
        "UMULL	r5, r6, r10, r12\n\t"
        "MOV	r2, #0x0\n\t"
        "MOV	r7, #0x0\n\t"
        /* A[1] * A[9] */
        "LDR	r10, [%[a], #36]\n\t"
        "LDR	r12, [%[a], #4]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[2] * A[8] */
        "LDR	r10, [%[a], #32]\n\t"
        "LDR	r12, [%[a], #8]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[3] * A[7] */
        "LDR	r10, [%[a], #28]\n\t"
        "LDR	r12, [%[a], #12]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[4] * A[6] */
        "LDR	r10, [%[a], #24]\n\t"
        "LDR	r12, [%[a], #16]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[5] * A[5] */
        "LDR	r10, [%[a], #20]\n\t"
        "UMULL	r8, r9, r10, r10\n\t"
        "ADDS	r5, r5, r5\n\t"
        "ADCS	r6, r6, r6\n\t"
        "ADC	r7, r7, r7\n\t"
        "ADDS	r3, r3, r8\n\t"
        "ADCS	r4, r4, r9\n\t"
        "ADC	r2, r2, #0x0\n\t"
        "ADDS	r3, r3, r5\n\t"
        "ADCS	r4, r4, r6\n\t"
        "ADC	r2, r2, r7\n\t"
        "STR	r3, [sp, #40]\n\t"
        /* A[0] * A[11] */
        "LDR	r10, [%[a], #44]\n\t"
        "LDR	r12, [%[a]]\n\t"
        "UMULL	r5, r6, r10, r12\n\t"
        "MOV	r3, #0x0\n\t"
        "MOV	r7, #0x0\n\t"
        /* A[1] * A[10] */
        "LDR	r10, [%[a], #40]\n\t"
        "LDR	r12, [%[a], #4]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[2] * A[9] */
        "LDR	r10, [%[a], #36]\n\t"
        "LDR	r12, [%[a], #8]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[3] * A[8] */
        "LDR	r10, [%[a], #32]\n\t"
        "LDR	r12, [%[a], #12]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[4] * A[7] */
        "LDR	r10, [%[a], #28]\n\t"
        "LDR	r12, [%[a], #16]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[5] * A[6] */
        "LDR	r10, [%[a], #24]\n\t"
        "LDR	r12, [%[a], #20]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        "ADDS	r5, r5, r5\n\t"
        "ADCS	r6, r6, r6\n\t"
        "ADC	r7, r7, r7\n\t"
        "ADDS	r4, r4, r5\n\t"
        "ADCS	r2, r2, r6\n\t"
        "ADC	r3, r3, r7\n\t"
        "STR	r4, [sp, #44]\n\t"
        /* A[1] * A[11] */
        "LDR	r10, [%[a], #44]\n\t"
        "LDR	r12, [%[a], #4]\n\t"
        "UMULL	r5, r6, r10, r12\n\t"
        "MOV	r4, #0x0\n\t"
        "MOV	r7, #0x0\n\t"
        /* A[2] * A[10] */
        "LDR	r10, [%[a], #40]\n\t"
        "LDR	r12, [%[a], #8]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[3] * A[9] */
        "LDR	r10, [%[a], #36]\n\t"
        "LDR	r12, [%[a], #12]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[4] * A[8] */
        "LDR	r10, [%[a], #32]\n\t"
        "LDR	r12, [%[a], #16]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[5] * A[7] */
        "LDR	r10, [%[a], #28]\n\t"
        "LDR	r12, [%[a], #20]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[6] * A[6] */
        "LDR	r10, [%[a], #24]\n\t"
        "UMULL	r8, r9, r10, r10\n\t"
        "ADDS	r5, r5, r5\n\t"
        "ADCS	r6, r6, r6\n\t"
        "ADC	r7, r7, r7\n\t"
        "ADDS	r2, r2, r8\n\t"
        "ADCS	r3, r3, r9\n\t"
        "ADC	r4, r4, #0x0\n\t"
        "ADDS	r2, r2, r5\n\t"
        "ADCS	r3, r3, r6\n\t"
        "ADC	r4, r4, r7\n\t"
        "STR	r2, [%[r], #48]\n\t"
        /* A[2] * A[11] */
        "LDR	r10, [%[a], #44]\n\t"
        "LDR	r12, [%[a], #8]\n\t"
        "UMULL	r5, r6, r10, r12\n\t"
        "MOV	r2, #0x0\n\t"
        "MOV	r7, #0x0\n\t"
        /* A[3] * A[10] */
        "LDR	r10, [%[a], #40]\n\t"
        "LDR	r12, [%[a], #12]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[4] * A[9] */
        "LDR	r10, [%[a], #36]\n\t"
        "LDR	r12, [%[a], #16]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[5] * A[8] */
        "LDR	r10, [%[a], #32]\n\t"
        "LDR	r12, [%[a], #20]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[6] * A[7] */
        "LDR	r10, [%[a], #28]\n\t"
        "LDR	r12, [%[a], #24]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        "ADDS	r5, r5, r5\n\t"
        "ADCS	r6, r6, r6\n\t"
        "ADC	r7, r7, r7\n\t"
        "ADDS	r3, r3, r5\n\t"
        "ADCS	r4, r4, r6\n\t"
        "ADC	r2, r2, r7\n\t"
        "STR	r3, [%[r], #52]\n\t"
        /* A[3] * A[11] */
        "LDR	r10, [%[a], #44]\n\t"
        "LDR	r12, [%[a], #12]\n\t"
        "UMULL	r5, r6, r10, r12\n\t"
        "MOV	r3, #0x0\n\t"
        "MOV	r7, #0x0\n\t"
        /* A[4] * A[10] */
        "LDR	r10, [%[a], #40]\n\t"
        "LDR	r12, [%[a], #16]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[5] * A[9] */
        "LDR	r10, [%[a], #36]\n\t"
        "LDR	r12, [%[a], #20]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[6] * A[8] */
        "LDR	r10, [%[a], #32]\n\t"
        "LDR	r12, [%[a], #24]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[7] * A[7] */
        "LDR	r10, [%[a], #28]\n\t"
        "UMULL	r8, r9, r10, r10\n\t"
        "ADDS	r5, r5, r5\n\t"
        "ADCS	r6, r6, r6\n\t"
        "ADC	r7, r7, r7\n\t"
        "ADDS	r4, r4, r8\n\t"
        "ADCS	r2, r2, r9\n\t"
        "ADC	r3, r3, #0x0\n\t"
        "ADDS	r4, r4, r5\n\t"
        "ADCS	r2, r2, r6\n\t"
        "ADC	r3, r3, r7\n\t"
        "STR	r4, [%[r], #56]\n\t"
        /* A[4] * A[11] */
        "LDR	r10, [%[a], #44]\n\t"
        "LDR	r12, [%[a], #16]\n\t"
        "UMULL	r5, r6, r10, r12\n\t"
        "MOV	r4, #0x0\n\t"
        "MOV	r7, #0x0\n\t"
        /* A[5] * A[10] */
        "LDR	r10, [%[a], #40]\n\t"
        "LDR	r12, [%[a], #20]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[6] * A[9] */
        "LDR	r10, [%[a], #36]\n\t"
        "LDR	r12, [%[a], #24]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[7] * A[8] */
        "LDR	r10, [%[a], #32]\n\t"
        "LDR	r12, [%[a], #28]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        "ADDS	r5, r5, r5\n\t"
        "ADCS	r6, r6, r6\n\t"
        "ADC	r7, r7, r7\n\t"
        "ADDS	r2, r2, r5\n\t"
        "ADCS	r3, r3, r6\n\t"
        "ADC	r4, r4, r7\n\t"
        "STR	r2, [%[r], #60]\n\t"
        /* A[5] * A[11] */
        "LDR	r10, [%[a], #44]\n\t"
        "LDR	r12, [%[a], #20]\n\t"
        "UMULL	r5, r6, r10, r12\n\t"
        "MOV	r2, #0x0\n\t"
        "MOV	r7, #0x0\n\t"
        /* A[6] * A[10] */
        "LDR	r10, [%[a], #40]\n\t"
        "LDR	r12, [%[a], #24]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[7] * A[9] */
        "LDR	r10, [%[a], #36]\n\t"
        "LDR	r12, [%[a], #28]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[8] * A[8] */
        "LDR	r10, [%[a], #32]\n\t"
        "UMULL	r8, r9, r10, r10\n\t"
        "ADDS	r5, r5, r5\n\t"
        "ADCS	r6, r6, r6\n\t"
        "ADC	r7, r7, r7\n\t"
        "ADDS	r3, r3, r8\n\t"
        "ADCS	r4, r4, r9\n\t"
        "ADC	r2, r2, #0x0\n\t"
        "ADDS	r3, r3, r5\n\t"
        "ADCS	r4, r4, r6\n\t"
        "ADC	r2, r2, r7\n\t"
        "STR	r3, [%[r], #64]\n\t"
        /* A[6] * A[11] */
        "LDR	r10, [%[a], #44]\n\t"
        "LDR	r12, [%[a], #24]\n\t"
        "UMULL	r5, r6, r10, r12\n\t"
        "MOV	r3, #0x0\n\t"
        "MOV	r7, #0x0\n\t"
        /* A[7] * A[10] */
        "LDR	r10, [%[a], #40]\n\t"
        "LDR	r12, [%[a], #28]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* A[8] * A[9] */
        "LDR	r10, [%[a], #36]\n\t"
        "LDR	r12, [%[a], #32]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r5, r5, r8\n\t"
        "ADCS	r6, r6, r9\n\t"
        "ADC	r7, r7, #0x0\n\t"
        "ADDS	r5, r5, r5\n\t"
        "ADCS	r6, r6, r6\n\t"
        "ADC	r7, r7, r7\n\t"
        "ADDS	r4, r4, r5\n\t"
        "ADCS	r2, r2, r6\n\t"
        "ADC	r3, r3, r7\n\t"
        "STR	r4, [%[r], #68]\n\t"
        /* A[7] * A[11] */
        "LDR	r10, [%[a], #44]\n\t"
        "LDR	r12, [%[a], #28]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r2, r2, r8\n\t"
        "ADCS	r3, r3, r9\n\t"
        "MOV	r4, #0x0\n\t"
        "ADC	r4, r4, #0x0\n\t"
        "ADDS	r2, r2, r8\n\t"
        "ADCS	r3, r3, r9\n\t"
        "MOV	r4, #0x0\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[8] * A[10] */
        "LDR	r10, [%[a], #40]\n\t"
        "LDR	r12, [%[a], #32]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r2, r2, r8\n\t"
        "ADCS	r3, r3, r9\n\t"
        "ADC	r4, r4, #0x0\n\t"
        "ADDS	r2, r2, r8\n\t"
        "ADCS	r3, r3, r9\n\t"
        "ADC	r4, r4, #0x0\n\t"
        /* A[9] * A[9] */
        "LDR	r10, [%[a], #36]\n\t"
        "UMULL	r8, r9, r10, r10\n\t"
        "ADDS	r2, r2, r8\n\t"
        "ADCS	r3, r3, r9\n\t"
        "ADC	r4, r4, #0x0\n\t"
        "STR	r2, [%[r], #72]\n\t"
        /* A[8] * A[11] */
        "LDR	r10, [%[a], #44]\n\t"
        "LDR	r12, [%[a], #32]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r3, r3, r8\n\t"
        "ADCS	r4, r4, r9\n\t"
        "MOV	r2, #0x0\n\t"
        "ADC	r2, r2, #0x0\n\t"
        "ADDS	r3, r3, r8\n\t"
        "ADCS	r4, r4, r9\n\t"
        "MOV	r2, #0x0\n\t"
        "ADC	r2, r2, #0x0\n\t"
        /* A[9] * A[10] */
        "LDR	r10, [%[a], #40]\n\t"
        "LDR	r12, [%[a], #36]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r3, r3, r8\n\t"
        "ADCS	r4, r4, r9\n\t"
        "ADC	r2, r2, #0x0\n\t"
        "ADDS	r3, r3, r8\n\t"
        "ADCS	r4, r4, r9\n\t"
        "ADC	r2, r2, #0x0\n\t"
        "STR	r3, [%[r], #76]\n\t"
        /* A[9] * A[11] */
        "LDR	r10, [%[a], #44]\n\t"
        "LDR	r12, [%[a], #36]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r4, r4, r8\n\t"
        "ADCS	r2, r2, r9\n\t"
        "MOV	r3, #0x0\n\t"
        "ADC	r3, r3, #0x0\n\t"
        "ADDS	r4, r4, r8\n\t"
        "ADCS	r2, r2, r9\n\t"
        "MOV	r3, #0x0\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* A[10] * A[10] */
        "LDR	r10, [%[a], #40]\n\t"
        "UMULL	r8, r9, r10, r10\n\t"
        "ADDS	r4, r4, r8\n\t"
        "ADCS	r2, r2, r9\n\t"
        "ADC	r3, r3, #0x0\n\t"
        "STR	r4, [%[r], #80]\n\t"
        /* A[10] * A[11] */
        "LDR	r10, [%[a], #44]\n\t"
        "LDR	r12, [%[a], #40]\n\t"
        "UMULL	r8, r9, r10, r12\n\t"
        "ADDS	r2, r2, r8\n\t"
        "ADCS	r3, r3, r9\n\t"
        "MOV	r4, #0x0\n\t"
        "ADC	r4, r4, #0x0\n\t"
        "ADDS	r2, r2, r8\n\t"
        "ADCS	r3, r3, r9\n\t"
        "MOV	r4, #0x0\n\t"
        "ADC	r4, r4, #0x0\n\t"
        "STR	r2, [%[r], #84]\n\t"
        /* A[11] * A[11] */
        "LDR	r10, [%[a], #44]\n\t"
        "UMLAL	r3, r4, r10, r10\n\t"
        "STR	r3, [%[r], #88]\n\t"
        "STR	r4, [%[r], #92]\n\t"
        "LDM	sp!, {r2, r3, r4, r8}\n\t"
        "STM	%[r]!, {r2, r3, r4, r8}\n\t"
        "LDM	sp!, {r2, r3, r4, r8}\n\t"
        "STM	%[r]!, {r2, r3, r4, r8}\n\t"
        "LDM	sp!, {r2, r3, r4, r8}\n\t"
        "STM	%[r]!, {r2, r3, r4, r8}\n\t"
        : [r] "+r" (r), [a] "+r" (a)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "cc"
    );
}

/* Subtract b from a and store the difference in r. (r = a - b)
 *
 * Handles 12 words, fully unrolled as three groups of four words.
 *
 * r  Destination for the 12-word difference.
 * a  Number to subtract from (12 words are read).
 * b  Number to subtract (12 words are read).
 * returns 0 when no borrow came out of the top word and a value with all
 * bits set when one did.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_3072_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_3072_sub_12(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0, r1 and r2 for use as assembly operands. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Words 0-3: SUBS starts the borrow chain. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SUBS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 4-7: SBCS continues the borrow from the previous group. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 8-11. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* r6 - r6 - borrow: 0 when there was no borrow, all ones otherwise.
         * The result replaces the pointer held in the r operand. */
        "SBC	%[r], r6, r6\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    /* r now carries the borrow indicator rather than an address. */
    return (word32)(size_t)r;
}

/* Compute the square of a 24-word number. (r = a * a)
 *
 * Splits a into a low and a high half of 12 words each and combines three
 * 12-word squarings: low^2, high^2 and (low - high)^2.
 *
 * r  Result. 48 words are written.
 * a  Number to square. 24 words are read.
 */
SP_NOINLINE static void sp_3072_sqr_24(sp_digit* r, const sp_digit* a)
{
    sp_digit  zeros[12];
    sp_digit  mid[24];
    sp_digit* lo_sq = r;
    sp_digit* hi_sq = r + 24;
    sp_digit* diff = mid;
    sp_digit  zeros_addr;
    sp_digit  diff_addr;
    sp_digit  neg;
    sp_digit  carry = 0;
    sp_digit* minuend;
    sp_digit* subtrahend;

    XMEMSET(zeros, 0, sizeof(sp_digit) * 12);

    /* diff = low - high. The return value is all ones when a borrow came out
     * of the top word and 0 otherwise. */
    neg = sp_3072_sub_12(diff, a, &a[12]);

    /* Pick the operands of the second subtraction with masks instead of a
     * branch:
     *   borrow:    diff = zeros - diff
     *   no borrow: diff = diff - zeros
     */
    zeros_addr = (sp_digit)zeros;
    diff_addr = (sp_digit)diff;
    minuend = (sp_digit*)((zeros_addr & neg) | (diff_addr & (~neg)));
    subtrahend = (sp_digit*)((zeros_addr & (~neg)) | (diff_addr & neg));
    (void)sp_3072_sub_12(diff, minuend, subtrahend);

    /* High half first, then low half, then the difference (in place). */
    sp_3072_sqr_12(hi_sq, &a[12]);
    sp_3072_sqr_12(lo_sq, a);
    sp_3072_sqr_12(mid, diff);

    /* mid = diff^2 - high^2 - low^2, which is minus twice the cross
     * product; the values returned by the subtractions are collected in
     * carry. */
    carry -= sp_3072_sub_in_place_24(mid, hi_sq);
    carry -= sp_3072_sub_in_place_24(mid, lo_sq);
    /* Subtract mid from the result starting 12 words up. */
    carry += sp_3072_sub_in_place_24(r + 12, mid);

    /* Add the collected carry into the top 12 words of the result. */
    zeros[0] = carry;
    (void)sp_3072_add_12(r + 36, r + 36, zeros);
}

/* Subtract b from a and store the difference in r. (r = a - b)
 *
 * Handles 24 words, fully unrolled as six groups of four words.
 *
 * r  Destination for the 24-word difference.
 * a  Number to subtract from (24 words are read).
 * b  Number to subtract (24 words are read).
 * returns 0 when no borrow came out of the top word and a value with all
 * bits set when one did.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_3072_sub_24(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_3072_sub_24(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0, r1 and r2 for use as assembly operands. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Words 0-3: SUBS starts the borrow chain. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SUBS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 4-7: SBCS continues the borrow from the previous group. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 8-11. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 12-15. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 16-19. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 20-23. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* r6 - r6 - borrow: 0 when there was no borrow, all ones otherwise.
         * The result replaces the pointer held in the r operand. */
        "SBC	%[r], r6, r6\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    /* r now carries the borrow indicator rather than an address. */
    return (word32)(size_t)r;
}

/* Compute the square of a 48-word number. (r = a * a)
 *
 * Splits a into a low and a high half of 24 words each and combines three
 * 24-word squarings: low^2, high^2 and (low - high)^2.
 *
 * r  Result. 96 words are written.
 * a  Number to square. 48 words are read.
 */
SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a)
{
    sp_digit  zeros[24];
    sp_digit  mid[48];
    sp_digit* lo_sq = r;
    sp_digit* hi_sq = r + 48;
    sp_digit* diff = mid;
    sp_digit  zeros_addr;
    sp_digit  diff_addr;
    sp_digit  neg;
    sp_digit  carry = 0;
    sp_digit* minuend;
    sp_digit* subtrahend;

    XMEMSET(zeros, 0, sizeof(sp_digit) * 24);

    /* diff = low - high. The return value is all ones when a borrow came out
     * of the top word and 0 otherwise. */
    neg = sp_3072_sub_24(diff, a, &a[24]);

    /* Pick the operands of the second subtraction with masks instead of a
     * branch:
     *   borrow:    diff = zeros - diff
     *   no borrow: diff = diff - zeros
     */
    zeros_addr = (sp_digit)zeros;
    diff_addr = (sp_digit)diff;
    minuend = (sp_digit*)((zeros_addr & neg) | (diff_addr & (~neg)));
    subtrahend = (sp_digit*)((zeros_addr & (~neg)) | (diff_addr & neg));
    (void)sp_3072_sub_24(diff, minuend, subtrahend);

    /* High half first, then low half, then the difference (in place). */
    sp_3072_sqr_24(hi_sq, &a[24]);
    sp_3072_sqr_24(lo_sq, a);
    sp_3072_sqr_24(mid, diff);

    /* mid = diff^2 - high^2 - low^2, which is minus twice the cross
     * product; the values returned by the subtractions are collected in
     * carry. */
    carry -= sp_3072_sub_in_place_48(mid, hi_sq);
    carry -= sp_3072_sub_in_place_48(mid, lo_sq);
    /* Subtract mid from the result starting 24 words up. */
    carry += sp_3072_sub_in_place_48(r + 24, mid);

    /* Add the collected carry into the top 24 words of the result. */
    zeros[0] = carry;
    (void)sp_3072_add_24(r + 72, r + 72, zeros);
}

/* Subtract b from a and store the difference in r. (r = a - b)
 *
 * Handles 48 words, fully unrolled as twelve groups of four words.
 *
 * r  Destination for the 48-word difference.
 * a  Number to subtract from (48 words are read).
 * b  Number to subtract (48 words are read).
 * returns 0 when no borrow came out of the top word and a value with all
 * bits set when one did.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_3072_sub_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_3072_sub_48(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0, r1 and r2 for use as assembly operands. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Words 0-3: SUBS starts the borrow chain. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SUBS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 4-7: SBCS continues the borrow from the previous group. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 8-11. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 12-15. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 16-19. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 20-23. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 24-27. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 28-31. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 32-35. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 36-39. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 40-43. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 44-47. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* r6 - r6 - borrow: 0 when there was no borrow, all ones otherwise.
         * The result replaces the pointer held in the r operand. */
        "SBC	%[r], r6, r6\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    /* r now carries the borrow indicator rather than an address. */
    return (word32)(size_t)r;
}

/* Compute the square of a 96-word number. (r = a * a)
 *
 * Splits a into a low and a high half of 48 words each and combines three
 * 48-word squarings: low^2, high^2 and (low - high)^2.
 *
 * r  Result. 192 words are written.
 * a  Number to square. 96 words are read.
 */
SP_NOINLINE static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a)
{
    sp_digit  zeros[48];
    sp_digit  mid[96];
    sp_digit* lo_sq = r;
    sp_digit* hi_sq = r + 96;
    sp_digit* diff = mid;
    sp_digit  zeros_addr;
    sp_digit  diff_addr;
    sp_digit  neg;
    sp_digit  carry = 0;
    sp_digit* minuend;
    sp_digit* subtrahend;

    XMEMSET(zeros, 0, sizeof(sp_digit) * 48);

    /* diff = low - high. The return value is all ones when a borrow came out
     * of the top word and 0 otherwise. */
    neg = sp_3072_sub_48(diff, a, &a[48]);

    /* Pick the operands of the second subtraction with masks instead of a
     * branch:
     *   borrow:    diff = zeros - diff
     *   no borrow: diff = diff - zeros
     */
    zeros_addr = (sp_digit)zeros;
    diff_addr = (sp_digit)diff;
    minuend = (sp_digit*)((zeros_addr & neg) | (diff_addr & (~neg)));
    subtrahend = (sp_digit*)((zeros_addr & (~neg)) | (diff_addr & neg));
    (void)sp_3072_sub_48(diff, minuend, subtrahend);

    /* High half first, then low half, then the difference (in place). */
    sp_3072_sqr_48(hi_sq, &a[48]);
    sp_3072_sqr_48(lo_sq, a);
    sp_3072_sqr_48(mid, diff);

    /* mid = diff^2 - high^2 - low^2, which is minus twice the cross
     * product; the values returned by the subtractions are collected in
     * carry. */
    carry -= sp_3072_sub_in_place_96(mid, hi_sq);
    carry -= sp_3072_sub_in_place_96(mid, lo_sq);
    /* Subtract mid from the result starting 48 words up. */
    carry += sp_3072_sub_in_place_96(r + 48, mid);

    /* Add the collected carry into the top 48 words of the result. */
    zeros[0] = carry;
    (void)sp_3072_add_48(r + 144, r + 144, zeros);
}

#endif /* !WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Add b to a and store the sum in r. (r = a + b)
 *
 * Handles 96 words in a loop that adds four words per iteration.
 *
 * r  Destination for the 96-word sum.
 * a  First number to add (96 words are read).
 * b  Second number to add (96 words are read).
 * returns the carry out of the top word: 0 or 1.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_3072_add_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0, r1 and r2 for use as assembly operands. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r3 keeps the carry between iterations; start with none. */
        "MOV	r3, #0x0\n\t"
        /* r12 = end of a: 0x180 bytes (96 words) past its start. */
        "ADD	r12, %[a], #0x180\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_add_96_word:\n\t"
#else
    "L_sp_3072_add_96_word_%=:\n\t"
#endif
        /* Put the saved carry back into the carry flag: adding 0xffffffff
         * to r3 carries out only when r3 is 1. */
        "ADDS	r3, r3, #0xffffffff\n\t"
        "LDM	%[a]!, {r4, r5, r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9, r10, r11}\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "ADCS	r7, r7, r11\n\t"
        "STM	%[r]!, {r4, r5, r6, r7}\n\t"
        /* Save the carry flag in r3 (0 or 1) before CMP changes the flags. */
        "MOV	r4, #0x0\n\t"
        "ADC	r3, r4, #0x0\n\t"
        /* Repeat until a has advanced to the end address. */
        "CMP	%[a], r12\n\t"
#if defined(__GNUC__)
        "BNE	L_sp_3072_add_96_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BNE.N	L_sp_3072_add_96_word\n\t"
#else
        "BNE.N	L_sp_3072_add_96_word_%=\n\t"
#endif
        /* Hand the final carry back through the r operand. */
        "MOV	%[r], r3\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "cc"
    );
    /* r now carries the carry value rather than an address. */
    return (word32)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Subtract b from a, storing the difference back into a. (a -= b)
 *
 * Handles 96 words in a loop that subtracts four words per iteration.
 *
 * a  Number to subtract from; overwritten with the 96-word difference.
 * b  Number to subtract (96 words are read).
 * returns 0 when no borrow came out of the top word and a value with all
 * bits set when one did.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_3072_sub_in_place_96(sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_3072_sub_in_place_96(sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0 and r1 for use as assembly operands. */
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r10 keeps the borrow between iterations (0 or all ones). */
        "MOV	r10, #0x0\n\t"
        /* r11 = end of a: 0x180 bytes (96 words) past its start. */
        "ADD	r11, %[a], #0x180\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_sub_in_pkace_96_word:\n\t"
#else
    "L_sp_3072_sub_in_pkace_96_word_%=:\n\t"
#endif
        /* Put the saved borrow back into the flags: 0 - r10 borrows only
         * when r10 is non-zero. */
        "RSBS	r10, r10, #0x0\n\t"
        /* Load from a without write-back; the store below advances a. */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Save the borrow in r10 (0 or all ones) before CMP changes the
         * flags. */
        "SBC	r10, r10, r10\n\t"
        /* Repeat until a has advanced to the end address. */
        "CMP	%[a], r11\n\t"
#if defined(__GNUC__)
        "BNE	L_sp_3072_sub_in_pkace_96_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BNE.N	L_sp_3072_sub_in_pkace_96_word\n\t"
#else
        "BNE.N	L_sp_3072_sub_in_pkace_96_word_%=\n\t"
#endif
        /* Hand the final borrow back through the a operand. */
        "MOV	%[a], r10\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
    );
    /* a now carries the borrow indicator rather than an address. */
    return (word32)(size_t)a;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Multiply a by b and store the product in r. (r = a * b)
 *
 * Works one result word (column) at a time: for each column the products
 * of all word pairs of a and b whose indices sum to the column index are
 * added into a three-word accumulator. The 192-word product is built in
 * 0x300 bytes of stack and copied to r at the end.
 *
 * r  Destination for the 192-word product.
 * a  First number to multiply (96 words are read).
 * b  Second number to multiply (96 words are read).
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_3072_mul_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static void sp_3072_mul_96(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0, r1 and r2 for use as assembly operands. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Reserve 0x300 bytes (192 words) of stack for the product. */
        "SUB	sp, sp, #0x300\n\t"
        /* Column 0: a[0] * b[0]. Low word is stored, high word starts the
         * accumulator r6 (low), r7 (middle), r8 (high). */
        "LDR	lr, [%[a]]\n\t"
        "LDR	r11, [%[b]]\n\t"
        "UMULL	r8, r6, lr, r11\n\t"
        "STR	r8, [sp]\n\t"
        "MOV	r7, #0x0\n\t"
        "MOV	r8, #0x0\n\t"
        /* r5 = byte offset of the column being computed. */
        "MOV	r5, #0x4\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_mul_96_outer:\n\t"
#else
    "L_sp_3072_mul_96_outer_%=:\n\t"
#endif
        /* r3 = r5 - 0x17c, or 0 when r5 is below 0x17c (offset of word 95);
         * r4 = r5 - r3, so r3 + r4 always equals the column offset. */
        "SUBS	r3, r5, #0x17c\n\t"
        "IT	cc\n\t"
        "MOVCC	r3, #0x0\n\t"
        "SUB	r4, r5, r3\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_mul_96_inner:\n\t"
#else
    "L_sp_3072_mul_96_inner_%=:\n\t"
#endif
        /* Accumulate a[r3] * b[r4]. */
        "LDR	lr, [%[a], r3]\n\t"
        "LDR	r11, [%[b], r4]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        /* Accumulate the mirrored product a[r4] * b[r3]. */
        "LDR	lr, [%[a], r4]\n\t"
        "LDR	r11, [%[b], r3]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        /* Move the two offsets one word towards each other. */
        "ADD	r3, r3, #0x4\n\t"
        "SUB	r4, r4, #0x4\n\t"
        "CMP	r3, r4\n\t"
        /* Offsets crossed: every pair of this column has been added. */
#if defined(__GNUC__)
        "BGT	L_sp_3072_mul_96_inner_done_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_3072_mul_96_inner_done\n\t"
#else
        "BGT.N	L_sp_3072_mul_96_inner_done_%=\n\t"
#endif
        /* Offsets not yet met: handle the next pair. */
#if defined(__GNUC__)
        "BLT	L_sp_3072_mul_96_inner_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_3072_mul_96_inner\n\t"
#else
        "BLT.N	L_sp_3072_mul_96_inner_%=\n\t"
#endif
        /* Offsets are equal: add the single middle product a[r3] * b[r3]. */
        "LDR	lr, [%[a], r3]\n\t"
        "LDR	r11, [%[b], r3]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_mul_96_inner_done:\n\t"
#else
    "L_sp_3072_mul_96_inner_done_%=:\n\t"
#endif
        /* Store the finished column and shift the accumulator down a word. */
        "STR	r6, [sp, r5]\n\t"
        "MOV	r6, r7\n\t"
        "MOV	r7, r8\n\t"
        "MOV	r8, #0x0\n\t"
        "ADD	r5, r5, #0x4\n\t"
        /* Columns up to byte offset 0x2f4 are handled by the loop. */
        "CMP	r5, #0x2f4\n\t"
#if defined(__GNUC__)
        "BLE	L_sp_3072_mul_96_outer_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLE.N	L_sp_3072_mul_96_outer\n\t"
#else
        "BLE.N	L_sp_3072_mul_96_outer_%=\n\t"
#endif
        /* Last column: only a[95] * b[95] (byte offset 380) contributes.
         * Its low and high words fill the top two words of the product. */
        "LDR	lr, [%[a], #380]\n\t"
        "LDR	r11, [%[b], #380]\n\t"
        "UMLAL	r6, r7, lr, r11\n\t"
        "STR	r6, [sp, r5]\n\t"
        "ADD	r5, r5, #0x4\n\t"
        "STR	r7, [sp, r5]\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_mul_96_store:\n\t"
#else
    "L_sp_3072_mul_96_store_%=:\n\t"
#endif
        /* Copy the product from the stack to r eight words at a time; the
         * pops also move sp back up by the 0x300 bytes reserved above. */
        "LDM	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "STM	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "SUBS	r5, r5, #0x20\n\t"
#if defined(__GNUC__)
        "BGT	L_sp_3072_mul_96_store_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_3072_mul_96_store\n\t"
#else
        "BGT.N	L_sp_3072_mul_96_store_%=\n\t"
#endif
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
    );
}

/* Square a and put result in r. (r = a * a)
 *
 * Result words are produced one at a time, lowest first. For each result
 * word, every product a[i] * a[j] whose indices sum to that word's index is
 * added into a three-register accumulator (r6 low, r7 middle, r8 high).
 * Products with i != j are added twice; the i == j product is added once.
 * The result is assembled in a 0x300-byte (192-word) buffer reserved on the
 * stack inside the asm and is copied out to r only at the end.
 *
 * r  A single precision integer. Receives 192 words.
 * a  A single precision integer. Words 0..95 are read.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_3072_sqr_96(sp_digit* r_p, const sp_digit* a_p)
#else
static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Reserve the 192-word result buffer on the stack. */
        "SUB	sp, sp, #0x300\n\t"
        /* Result word 0 is the low half of a[0]^2; the high half seeds the
         * accumulator in r6. */
        "LDR	lr, [%[a]]\n\t"
        "UMULL	r8, r6, lr, lr\n\t"
        "STR	r8, [sp]\n\t"
        "MOV	r7, #0x0\n\t"
        "MOV	r8, #0x0\n\t"
        /* r5 = byte offset of the result word being computed. */
        "MOV	r5, #0x4\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_sqr_96_outer:\n\t"
#else
    "L_sp_3072_sqr_96_outer_%=:\n\t"
#endif
        /* r3 = max(r5 - 0x17c, 0): lowest operand offset for this word
         * (0x17c is the offset of a[95]). r4 = r5 - r3: the matching highest
         * operand offset. */
        "SUBS	r3, r5, #0x17c\n\t"
        "IT	cc\n\t"
        "MOVCC	r3, #0x0\n\t"
        "SUB	r4, r5, r3\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_sqr_96_inner:\n\t"
#else
    "L_sp_3072_sqr_96_inner_%=:\n\t"
#endif
        /* Add a[r3] * a[r4] to r8:r7:r6 twice (the pair occurs in both
         * orders in the full product). */
        "LDR	lr, [%[a], r3]\n\t"
        "LDR	r11, [%[a], r4]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        /* Step the two offsets towards each other. */
        "ADD	r3, r3, #0x4\n\t"
        "SUB	r4, r4, #0x4\n\t"
        "CMP	r3, r4\n\t"
        /* r3 > r4: the offsets have crossed, this word is complete. */
#if defined(__GNUC__)
        "BGT	L_sp_3072_sqr_96_inner_done_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_3072_sqr_96_inner_done\n\t"
#else
        "BGT.N	L_sp_3072_sqr_96_inner_done_%=\n\t"
#endif
        /* r3 < r4: more pairs remain for this word. */
#if defined(__GNUC__)
        "BLT	L_sp_3072_sqr_96_inner_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_3072_sqr_96_inner\n\t"
#else
        "BLT.N	L_sp_3072_sqr_96_inner_%=\n\t"
#endif
        /* r3 == r4: middle term a[r3]^2, added once. */
        "LDR	lr, [%[a], r3]\n\t"
        "UMULL	r9, r10, lr, lr\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_sqr_96_inner_done:\n\t"
#else
    "L_sp_3072_sqr_96_inner_done_%=:\n\t"
#endif
        /* Store the finished word, then shift the accumulator down by one
         * word ready for the next result word. */
        "STR	r6, [sp, r5]\n\t"
        "MOV	r6, r7\n\t"
        "MOV	r7, r8\n\t"
        "MOV	r8, #0x0\n\t"
        "ADD	r5, r5, #0x4\n\t"
        "CMP	r5, #0x2f4\n\t"
#if defined(__GNUC__)
        "BLE	L_sp_3072_sqr_96_outer_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLE.N	L_sp_3072_sqr_96_outer\n\t"
#else
        "BLE.N	L_sp_3072_sqr_96_outer_%=\n\t"
#endif
        /* Final term a[95]^2 (byte offset 380) is accumulated into r7:r6 and
         * the top two result words are stored at offsets 0x2f8 and 0x2fc. */
        "LDR	lr, [%[a], #380]\n\t"
        "UMLAL	r6, r7, lr, lr\n\t"
        "STR	r6, [sp, r5]\n\t"
        "ADD	r5, r5, #0x4\n\t"
        "STR	r7, [sp, r5]\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_sqr_96_store:\n\t"
#else
    "L_sp_3072_sqr_96_store_%=:\n\t"
#endif
        /* Copy the stack buffer to r, 8 words per pass. r5 enters at 0x2fc
         * and drops by 0x20 each pass, so 24 passes run and pop all 0x300
         * bytes, returning sp to its value on entry to the asm. */
        "LDM	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "STM	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "SUBS	r5, r5, #0x20\n\t"
#if defined(__GNUC__)
        "BGT	L_sp_3072_sqr_96_store_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_3072_sqr_96_store\n\t"
#else
        "BGT.N	L_sp_3072_sqr_96_store_%=\n\t"
#endif
        : [r] "+r" (r), [a] "+r" (a)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
    );
}

#endif /* WOLFSSL_SP_SMALL */
#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
#ifdef WOLFSSL_SP_SMALL
/* Apply the mask m to every word of a, writing the masked words to r.
 *
 * Exactly 48 words are read from a and 48 words are written to r, in
 * ascending order.
 *
 * r  A single precision integer. Receives a[i] & m for i = 0..47.
 * a  A single precision integer.
 * m  Mask to AND against each digit.
 */
static void sp_3072_mask_48(sp_digit* r, const sp_digit* a, sp_digit m)
{
    const sp_digit* src = a;
    const sp_digit* const end = a + 48;
    sp_digit* dst = r;

    while (src != end) {
        *dst = *src & m;
        dst++;
        src++;
    }
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Add b to a into r. (r = a + b)
 *
 * Processes 48 words, four per loop pass. The carry between passes is held
 * in r3 (0 or 1) because the loop-end CMP overwrites the flags.
 *
 * r  A single precision integer. Receives 48 words.
 * a  A single precision integer.
 * b  A single precision integer.
 *
 * returns the carry out of the top word (0 or 1).
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_3072_add_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r3 = saved carry (none yet); r12 = a + 0xc0, one past word 47. */
        "MOV	r3, #0x0\n\t"
        "ADD	r12, %[a], #0xc0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_add_48_word:\n\t"
#else
    "L_sp_3072_add_48_word_%=:\n\t"
#endif
        /* Recreate the carry flag from r3: 1 + 0xffffffff carries out,
         * 0 + 0xffffffff does not. */
        "ADDS	r3, r3, #0xffffffff\n\t"
        /* Add four words of a and b with carry; a, b and r all advance. */
        "LDM	%[a]!, {r4, r5, r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9, r10, r11}\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "ADCS	r7, r7, r11\n\t"
        "STM	%[r]!, {r4, r5, r6, r7}\n\t"
        /* Save the carry flag as 0 or 1 in r3 before CMP changes the flags. */
        "MOV	r4, #0x0\n\t"
        "ADC	r3, r4, #0x0\n\t"
        "CMP	%[a], r12\n\t"
#if defined(__GNUC__)
        "BNE	L_sp_3072_add_48_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BNE.N	L_sp_3072_add_48_word\n\t"
#else
        "BNE.N	L_sp_3072_add_48_word_%=\n\t"
#endif
        /* Hand the final carry back through the r operand. */
        "MOV	%[r], r3\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "cc"
    );
    /* r no longer holds a pointer here; it carries the carry value. */
    return (word32)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Sub b from a into a. (a -= b)
 *
 * Processes 48 words, four per loop pass. The borrow between passes is held
 * in r10 as 0 (no borrow) or all ones (borrow) because the loop-end CMP
 * overwrites the flags.
 *
 * a  A single precision integer. 48 words, updated in place.
 * b  A single precision integer.
 *
 * returns 0 when there is no borrow out of the top word and a value with
 * all bits set when there is.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_3072_sub_in_place_48(sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_3072_sub_in_place_48(sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r10 = saved borrow (none yet); r11 = a + 0xc0, one past word 47. */
        "MOV	r10, #0x0\n\t"
        "ADD	r11, %[a], #0xc0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_sub_in_pkace_48_word:\n\t"
#else
    "L_sp_3072_sub_in_pkace_48_word_%=:\n\t"
#endif
        /* Recreate the flag from r10: 0 - 0 leaves C set (no borrow),
         * 0 - 0xffffffff leaves C clear (borrow pending). */
        "RSBS	r10, r10, #0x0\n\t"
        /* Subtract four words of b from a with borrow. a is read without
         * writeback and advanced by the store; b advances on load. */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* r10 = r10 - r10 - !C: 0 if no borrow, all ones if borrow. */
        "SBC	r10, r10, r10\n\t"
        "CMP	%[a], r11\n\t"
#if defined(__GNUC__)
        "BNE	L_sp_3072_sub_in_pkace_48_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BNE.N	L_sp_3072_sub_in_pkace_48_word\n\t"
#else
        "BNE.N	L_sp_3072_sub_in_pkace_48_word_%=\n\t"
#endif
        /* Hand the final borrow mask back through the a operand. */
        "MOV	%[a], r10\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
    );
    /* a no longer holds a pointer here; it carries the borrow mask. */
    return (word32)(size_t)a;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * Result words are produced one at a time, lowest first. For each result
 * word, every product a[i] * b[j] whose indices sum to that word's index is
 * added into a three-register accumulator (r6 low, r7 middle, r8 high).
 * The inner loop handles the index pair (i, j) and its mirror (j, i)
 * together, and the i == j product separately.
 * The result is assembled in a 0x180-byte (96-word) buffer reserved on the
 * stack inside the asm and is copied out to r only at the end.
 *
 * r  A single precision integer. Receives 96 words.
 * a  A single precision integer. Words 0..47 are read.
 * b  A single precision integer. Words 0..47 are read.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_3072_mul_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Reserve the 96-word result buffer on the stack. */
        "SUB	sp, sp, #0x180\n\t"
        /* Result word 0 is the low half of a[0] * b[0]; the high half seeds
         * the accumulator in r6. */
        "LDR	lr, [%[a]]\n\t"
        "LDR	r11, [%[b]]\n\t"
        "UMULL	r8, r6, lr, r11\n\t"
        "STR	r8, [sp]\n\t"
        "MOV	r7, #0x0\n\t"
        "MOV	r8, #0x0\n\t"
        /* r5 = byte offset of the result word being computed. */
        "MOV	r5, #0x4\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_mul_48_outer:\n\t"
#else
    "L_sp_3072_mul_48_outer_%=:\n\t"
#endif
        /* r3 = max(r5 - 0xbc, 0): lowest operand offset for this word
         * (0xbc is the offset of word 47). r4 = r5 - r3: the matching
         * highest operand offset. */
        "SUBS	r3, r5, #0xbc\n\t"
        "IT	cc\n\t"
        "MOVCC	r3, #0x0\n\t"
        "SUB	r4, r5, r3\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_mul_48_inner:\n\t"
#else
    "L_sp_3072_mul_48_inner_%=:\n\t"
#endif
        /* Add a[r3] * b[r4] to r8:r7:r6. */
        "LDR	lr, [%[a], r3]\n\t"
        "LDR	r11, [%[b], r4]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        /* Add the mirrored product a[r4] * b[r3]. */
        "LDR	lr, [%[a], r4]\n\t"
        "LDR	r11, [%[b], r3]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        /* Step the two offsets towards each other. */
        "ADD	r3, r3, #0x4\n\t"
        "SUB	r4, r4, #0x4\n\t"
        "CMP	r3, r4\n\t"
        /* r3 > r4: the offsets have crossed, this word is complete. */
#if defined(__GNUC__)
        "BGT	L_sp_3072_mul_48_inner_done_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_3072_mul_48_inner_done\n\t"
#else
        "BGT.N	L_sp_3072_mul_48_inner_done_%=\n\t"
#endif
        /* r3 < r4: more pairs remain for this word. */
#if defined(__GNUC__)
        "BLT	L_sp_3072_mul_48_inner_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_3072_mul_48_inner\n\t"
#else
        "BLT.N	L_sp_3072_mul_48_inner_%=\n\t"
#endif
        /* r3 == r4: middle term a[r3] * b[r3], added once. */
        "LDR	lr, [%[a], r3]\n\t"
        "LDR	r11, [%[b], r3]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_mul_48_inner_done:\n\t"
#else
    "L_sp_3072_mul_48_inner_done_%=:\n\t"
#endif
        /* Store the finished word, then shift the accumulator down by one
         * word ready for the next result word. */
        "STR	r6, [sp, r5]\n\t"
        "MOV	r6, r7\n\t"
        "MOV	r7, r8\n\t"
        "MOV	r8, #0x0\n\t"
        "ADD	r5, r5, #0x4\n\t"
        "CMP	r5, #0x174\n\t"
#if defined(__GNUC__)
        "BLE	L_sp_3072_mul_48_outer_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLE.N	L_sp_3072_mul_48_outer\n\t"
#else
        "BLE.N	L_sp_3072_mul_48_outer_%=\n\t"
#endif
        /* Final term a[47] * b[47] (byte offset 188) is accumulated into
         * r7:r6 and the top two result words are stored at offsets 0x178
         * and 0x17c. */
        "LDR	lr, [%[a], #188]\n\t"
        "LDR	r11, [%[b], #188]\n\t"
        "UMLAL	r6, r7, lr, r11\n\t"
        "STR	r6, [sp, r5]\n\t"
        "ADD	r5, r5, #0x4\n\t"
        "STR	r7, [sp, r5]\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_mul_48_store:\n\t"
#else
    "L_sp_3072_mul_48_store_%=:\n\t"
#endif
        /* Copy the stack buffer to r, 8 words per pass. r5 enters at 0x17c
         * and drops by 0x20 each pass, so 12 passes run and pop all 0x180
         * bytes, returning sp to its value on entry to the asm. */
        "LDM	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "STM	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "SUBS	r5, r5, #0x20\n\t"
#if defined(__GNUC__)
        "BGT	L_sp_3072_mul_48_store_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_3072_mul_48_store\n\t"
#else
        "BGT.N	L_sp_3072_mul_48_store_%=\n\t"
#endif
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
    );
}

/* Square a and put result in r. (r = a * a)
 *
 * Result words are produced one at a time, lowest first. For each result
 * word, every product a[i] * a[j] whose indices sum to that word's index is
 * added into a three-register accumulator (r6 low, r7 middle, r8 high).
 * Products with i != j are added twice; the i == j product is added once.
 * The result is assembled in a 0x180-byte (96-word) buffer reserved on the
 * stack inside the asm and is copied out to r only at the end.
 *
 * r  A single precision integer. Receives 96 words.
 * a  A single precision integer. Words 0..47 are read.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_3072_sqr_48(sp_digit* r_p, const sp_digit* a_p)
#else
static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Reserve the 96-word result buffer on the stack. */
        "SUB	sp, sp, #0x180\n\t"
        /* Result word 0 is the low half of a[0]^2; the high half seeds the
         * accumulator in r6. */
        "LDR	lr, [%[a]]\n\t"
        "UMULL	r8, r6, lr, lr\n\t"
        "STR	r8, [sp]\n\t"
        "MOV	r7, #0x0\n\t"
        "MOV	r8, #0x0\n\t"
        /* r5 = byte offset of the result word being computed. */
        "MOV	r5, #0x4\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_sqr_48_outer:\n\t"
#else
    "L_sp_3072_sqr_48_outer_%=:\n\t"
#endif
        /* r3 = max(r5 - 0xbc, 0): lowest operand offset for this word
         * (0xbc is the offset of a[47]). r4 = r5 - r3: the matching highest
         * operand offset. */
        "SUBS	r3, r5, #0xbc\n\t"
        "IT	cc\n\t"
        "MOVCC	r3, #0x0\n\t"
        "SUB	r4, r5, r3\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_sqr_48_inner:\n\t"
#else
    "L_sp_3072_sqr_48_inner_%=:\n\t"
#endif
        /* Add a[r3] * a[r4] to r8:r7:r6 twice (the pair occurs in both
         * orders in the full product). */
        "LDR	lr, [%[a], r3]\n\t"
        "LDR	r11, [%[a], r4]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        /* Step the two offsets towards each other. */
        "ADD	r3, r3, #0x4\n\t"
        "SUB	r4, r4, #0x4\n\t"
        "CMP	r3, r4\n\t"
        /* r3 > r4: the offsets have crossed, this word is complete. */
#if defined(__GNUC__)
        "BGT	L_sp_3072_sqr_48_inner_done_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_3072_sqr_48_inner_done\n\t"
#else
        "BGT.N	L_sp_3072_sqr_48_inner_done_%=\n\t"
#endif
        /* r3 < r4: more pairs remain for this word. */
#if defined(__GNUC__)
        "BLT	L_sp_3072_sqr_48_inner_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_3072_sqr_48_inner\n\t"
#else
        "BLT.N	L_sp_3072_sqr_48_inner_%=\n\t"
#endif
        /* r3 == r4: middle term a[r3]^2, added once. */
        "LDR	lr, [%[a], r3]\n\t"
        "UMULL	r9, r10, lr, lr\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_sqr_48_inner_done:\n\t"
#else
    "L_sp_3072_sqr_48_inner_done_%=:\n\t"
#endif
        /* Store the finished word, then shift the accumulator down by one
         * word ready for the next result word. */
        "STR	r6, [sp, r5]\n\t"
        "MOV	r6, r7\n\t"
        "MOV	r7, r8\n\t"
        "MOV	r8, #0x0\n\t"
        "ADD	r5, r5, #0x4\n\t"
        "CMP	r5, #0x174\n\t"
#if defined(__GNUC__)
        "BLE	L_sp_3072_sqr_48_outer_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLE.N	L_sp_3072_sqr_48_outer\n\t"
#else
        "BLE.N	L_sp_3072_sqr_48_outer_%=\n\t"
#endif
        /* Final term a[47]^2 (byte offset 188) is accumulated into r7:r6 and
         * the top two result words are stored at offsets 0x178 and 0x17c. */
        "LDR	lr, [%[a], #188]\n\t"
        "UMLAL	r6, r7, lr, lr\n\t"
        "STR	r6, [sp, r5]\n\t"
        "ADD	r5, r5, #0x4\n\t"
        "STR	r7, [sp, r5]\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_sqr_48_store:\n\t"
#else
    "L_sp_3072_sqr_48_store_%=:\n\t"
#endif
        /* Copy the stack buffer to r, 8 words per pass. r5 enters at 0x17c
         * and drops by 0x20 each pass, so 12 passes run and pop all 0x180
         * bytes, returning sp to its value on entry to the asm. */
        "LDM	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "STM	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "SUBS	r5, r5, #0x20\n\t"
#if defined(__GNUC__)
        "BGT	L_sp_3072_sqr_48_store_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_3072_sqr_48_store\n\t"
#else
        "BGT.N	L_sp_3072_sqr_48_store_%=\n\t"
#endif
        : [r] "+r" (r), [a] "+r" (a)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
    );
}

#endif /* WOLFSSL_SP_SMALL */
#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */

/* Work out the Montgomery multiplier: the bottom digit of -1/a mod 2^n.
 *
 * Only the lowest digit of a is read. An inverse of that digit is built up
 * starting from 4 correct bits and refined three times, then negated.
 *
 * a    A single precision number (the modulus).
 * rho  Bottom word of inverse. Written on return.
 */
static void sp_3072_mont_setup(const sp_digit* a, sp_digit* rho)
{
    const sp_digit m0 = a[0];
    sp_digit inv;
    int step;

    /* Seed value: for odd m0, inv*m0==1 mod 2**4. */
    inv = m0 + (((m0 + 2) & 4) << 1);

    /* Each refinement doubles the number of correct low bits:
     * mod 2**8, then mod 2**16, then mod 2**32. */
    for (step = 0; step < 3; step++) {
        inv *= 2 - m0 * inv;
    }

    /* rho = -1/m mod b */
    *rho = (sp_digit)0 - inv;
}

#ifdef WOLFSSL_SP_SMALL
/* Mul a by digit b into r. (r = a * b)
 *
 * Loop form: one word of a is multiplied per pass. r3 holds the word about
 * to be stored, r4 the word above it and r5 any carry above that; after each
 * store the three registers are shifted down by one word.
 *
 * r  A single precision integer. Receives 97 words (indices 0..96).
 * a  A single precision integer. Words 0..95 are read.
 * b  A single precision digit.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_3072_mul_d_96(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
#else
static void sp_3072_mul_d_96(sp_digit* r, const sp_digit* a, sp_digit b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* A[0] * B */
        /* Low half goes straight to r[0]; high half starts the accumulator. */
        "LDR	r8, [%[a]]\n\t"
        "UMULL	r5, r3, %[b], r8\n\t"
        "MOV	r4, #0x0\n\t"
        "STR	r5, [%[r]]\n\t"
        "MOV	r5, #0x0\n\t"
        /* r9 = byte offset of the word being processed. */
        "MOV	r9, #0x4\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_mul_d_96_word:\n\t"
#else
    "L_sp_3072_mul_d_96_word_%=:\n\t"
#endif
        /* A[i] * B */
        "LDR	r8, [%[a], r9]\n\t"
        "UMULL	r6, r7, %[b], r8\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        "STR	r3, [%[r], r9]\n\t"
        /* Shift the accumulator down one word. */
        "MOV	r3, r4\n\t"
        "MOV	r4, r5\n\t"
        "MOV	r5, #0x0\n\t"
        "ADD	r9, r9, #0x4\n\t"
        /* 0x180 = 96 words: stop after a[95]. */
        "CMP	r9, #0x180\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_3072_mul_d_96_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_3072_mul_d_96_word\n\t"
#else
        "BLT.N	L_sp_3072_mul_d_96_word_%=\n\t"
#endif
        /* Top word of the product goes to r[96] (byte offset 384). */
        "STR	r3, [%[r], #384]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
    );
}

#else
/* Mul a by digit b into r. (r = a * b)
 *
 * Fully unrolled form: one stanza per word of a. Each stanza loads the next
 * word of a, multiply-accumulates it with b into a low/high register pair,
 * stores the low register as the next word of r and zeroes the register
 * that becomes the next stanza's high half. The pair rotates through
 * r3/r4/r5 from stanza to stanza. Both a and r are advanced by the
 * load/store writeback.
 *
 * r  A single precision integer. Receives 97 words (indices 0..96).
 * a  A single precision integer. Words 0..95 are read.
 * b  A single precision digit.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_3072_mul_d_96(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
#else
static void sp_3072_mul_d_96(sp_digit* r, const sp_digit* a, sp_digit b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* A[0] * B */
        /* First stanza uses UMULL (no accumulate): nothing is carried in. */
        "LDM	%[a]!, {r8}\n\t"
        "UMULL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[1] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[2] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[3] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[4] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[5] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[6] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[7] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[8] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[9] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[10] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[11] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[12] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[13] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[14] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[15] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[16] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[17] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[18] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[19] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[20] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[21] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[22] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[23] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[24] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[25] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[26] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[27] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[28] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[29] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[30] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[31] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[32] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[33] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[34] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[35] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[36] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[37] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[38] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[39] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[40] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[41] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[42] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[43] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[44] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[45] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[46] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[47] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[48] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[49] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[50] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[51] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[52] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[53] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[54] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[55] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[56] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[57] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[58] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[59] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[60] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[61] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[62] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[63] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[64] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[65] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[66] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[67] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[68] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[69] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[70] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[71] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[72] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[73] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[74] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[75] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[76] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[77] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[78] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[79] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[80] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[81] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[82] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[83] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[84] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[85] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[86] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[87] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[88] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[89] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[90] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[91] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[92] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[93] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[94] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[95] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        /* r has advanced 96 words, so this writes the top word, r_p[96]. */
        "STR	r3, [%[r]]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        /* r6 and r7 are listed as clobbered although the instructions above
         * do not reference them. */
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "cc"
    );
}

#endif /* WOLFSSL_SP_SMALL */
#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
/* Calculate the Montgomery normalizer: r = 2^n mod m, where n is the number
 * of bits being reduced by.
 * The modulus m must be a full 3072 bits, so one subtraction is sufficient.
 *
 * r  A single precision number that receives the normalizer (48 words).
 * m  A single precision number that is the modulus.
 */
static void sp_3072_mont_norm_48(sp_digit* r, const sp_digit* m)
{
    /* Begin with every one of the 48 words of r cleared. */
    XMEMSET(r, 0, 48 * sizeof(*r));
    /* Subtracting m in place leaves r = 2^n mod m. */
    sp_3072_sub_in_place_48(r, m);
}

#ifdef WOLFSSL_SP_SMALL
/* Conditionally subtract b from a using the mask m.
 * m is -1 to subtract and 0 when not subtracting.
 *
 * Every one of the 48 words is always processed: each word of b is ANDed
 * with m before it is subtracted, so a mask of 0 subtracts zero.
 * This is the size-optimized variant that loops one word at a time.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 *
 * returns 0 when the subtraction did not borrow out of the top word and
 * all bits set (-1) when it did.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_3072_cond_sub_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
#else
static sp_digit sp_3072_cond_sub_48(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin each argument to the named register for use as an asm operand. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r8 = constant zero, r4 = saved borrow (0 or -1), r5 = byte offset */
        "MOV	r8, #0x0\n\t"
        "MOV	r4, #0x0\n\t"
        "MOV	r5, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_cond_sub_48_words:\n\t"
#else
    "L_sp_3072_cond_sub_48_words_%=:\n\t"
#endif
        /* 0 - r4 puts the saved borrow back into the carry flag (the CMP at
         * the bottom of the loop overwrites the flags each iteration). */
        "SUBS	r4, r8, r4\n\t"
        "LDR	r6, [%[a], r5]\n\t"
        "LDR	r7, [%[b], r5]\n\t"
        /* Mask the word of b: unchanged when m is -1, zero when m is 0 */
        "AND	r7, r7, %[m]\n\t"
        "SBCS	r6, r6, r7\n\t"
        /* Save the borrow: r4 = 0 when there was none, -1 when there was */
        "SBC	r4, r8, r8\n\t"
        "STR	r6, [%[r], r5]\n\t"
        "ADD	r5, r5, #0x4\n\t"
        /* 0xc0 = 48 words * 4 bytes */
        "CMP	r5, #0xc0\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_3072_cond_sub_48_words_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_3072_cond_sub_48_words\n\t"
#else
        "BLT.N	L_sp_3072_cond_sub_48_words_%=\n\t"
#endif
        /* Pass the final borrow value out through the r operand */
        "MOV	%[r], r4\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        : "memory", "r4", "r5", "r6", "r7", "r8", "cc"
    );
    /* r no longer holds a pointer here; it carries the borrow value. */
    return (word32)(size_t)r;
}

#else
/* Conditionally subtract b from a using the mask m.
 * m is -1 to subtract and 0 when not subtracting.
 *
 * Every one of the 48 words is always processed: each word of b is ANDed
 * with m before it is subtracted, so a mask of 0 subtracts zero.
 * This is the unrolled variant: 24 groups of two words, with the borrow
 * carried in the flags from one group to the next.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 *
 * returns 0 when the subtraction did not borrow out of the top word and
 * all bits set (-1) when it did.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_3072_cond_sub_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
#else
static sp_digit sp_3072_cond_sub_48(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin each argument to the named register for use as an asm operand. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r5 = constant zero, used to materialize the final borrow */
        "MOV	r5, #0x0\n\t"
        /* Words 0-1: SUBS starts the borrow chain */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SUBS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 2-3: SBCS continues the borrow chain from here on */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 4-5 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 6-7 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 8-9 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 10-11 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 12-13 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 14-15 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 16-17 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 18-19 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 20-21 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 22-23 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 24-25 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 26-27 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 28-29 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 30-31 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 32-33 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 34-35 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 36-37 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 38-39 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 40-41 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 42-43 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 44-45 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 46-47 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Final borrow into r: 0 when there was none, -1 when there was */
        "SBC	%[r], r5, r5\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
    );
    /* r no longer holds a pointer here; it carries the borrow value. */
    return (word32)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_NO_UMAAL
#ifndef WOLFSSL_SP_SMALL
/* Reduce the number back to 3072 bits using Montgomery reduction.
 *
 * Variant for targets without UMAAL, with the inner loop fully unrolled.
 * For each of the 48 low words a multiple mu of m is added so that the word
 * becomes zero; the carry out of each pass is added into a[i+48]. The asm
 * therefore accesses a[0] through a[95]. A final conditional subtraction of
 * m writes the result to the first 48 words of a.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
#else
SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin each argument to the named register for use as an asm operand. */
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* lr = m[0], kept in a register for every pass */
        "LDR	lr, [%[m]]\n\t"
        /* i = 0 */
        "MOV	r11, #0x0\n\t"
        /* r3 = carry out of the previous pass (into a[i+48]) */
        "MOV	r3, #0x0\n\t"
        /* r4 = a[i], r5 = a[i+1] are held in registers across passes */
        "LDR	r4, [%[a]]\n\t"
        "LDR	r5, [%[a], #4]\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_mont_reduce_48_word:\n\t"
#else
    "L_sp_3072_mont_reduce_48_word_%=:\n\t"
#endif
        /* mu = a[i] * mp */
        "MUL	r10, %[mp], r4\n\t"
        /* a[i+0] += m[0] * mu */
        "MOV	r7, #0x0\n\t"
        "UMLAL	r4, r7, r10, lr\n\t"
        /* a[i+1] += m[1] * mu */
        "LDR	r9, [%[m], #4]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r5, r6, r10, r9\n\t"
        /* The new a[i+1] becomes a[i] of the next pass, so it moves to r4 */
        "MOV	r4, r5\n\t"
        "ADDS	r4, r4, r7\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+2] += m[2] * mu */
        "LDR	r9, [%[m], #8]\n\t"
        "LDR	r5, [%[a], #8]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r5, r7, r10, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* From here on r6 and r7 alternate as the running carry word */
        /* a[i+3] += m[3] * mu */
        "LDR	r9, [%[m], #12]\n\t"
        "LDR	r12, [%[a], #12]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #12]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+4] += m[4] * mu */
        "LDR	r9, [%[m], #16]\n\t"
        "LDR	r12, [%[a], #16]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #16]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+5] += m[5] * mu */
        "LDR	r9, [%[m], #20]\n\t"
        "LDR	r12, [%[a], #20]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #20]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+6] += m[6] * mu */
        "LDR	r9, [%[m], #24]\n\t"
        "LDR	r12, [%[a], #24]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #24]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+7] += m[7] * mu */
        "LDR	r9, [%[m], #28]\n\t"
        "LDR	r12, [%[a], #28]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #28]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+8] += m[8] * mu */
        "LDR	r9, [%[m], #32]\n\t"
        "LDR	r12, [%[a], #32]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #32]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+9] += m[9] * mu */
        "LDR	r9, [%[m], #36]\n\t"
        "LDR	r12, [%[a], #36]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #36]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+10] += m[10] * mu */
        "LDR	r9, [%[m], #40]\n\t"
        "LDR	r12, [%[a], #40]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #40]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+11] += m[11] * mu */
        "LDR	r9, [%[m], #44]\n\t"
        "LDR	r12, [%[a], #44]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #44]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+12] += m[12] * mu */
        "LDR	r9, [%[m], #48]\n\t"
        "LDR	r12, [%[a], #48]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #48]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+13] += m[13] * mu */
        "LDR	r9, [%[m], #52]\n\t"
        "LDR	r12, [%[a], #52]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #52]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+14] += m[14] * mu */
        "LDR	r9, [%[m], #56]\n\t"
        "LDR	r12, [%[a], #56]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #56]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+15] += m[15] * mu */
        "LDR	r9, [%[m], #60]\n\t"
        "LDR	r12, [%[a], #60]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #60]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+16] += m[16] * mu */
        "LDR	r9, [%[m], #64]\n\t"
        "LDR	r12, [%[a], #64]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #64]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+17] += m[17] * mu */
        "LDR	r9, [%[m], #68]\n\t"
        "LDR	r12, [%[a], #68]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #68]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+18] += m[18] * mu */
        "LDR	r9, [%[m], #72]\n\t"
        "LDR	r12, [%[a], #72]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #72]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+19] += m[19] * mu */
        "LDR	r9, [%[m], #76]\n\t"
        "LDR	r12, [%[a], #76]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #76]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+20] += m[20] * mu */
        "LDR	r9, [%[m], #80]\n\t"
        "LDR	r12, [%[a], #80]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #80]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+21] += m[21] * mu */
        "LDR	r9, [%[m], #84]\n\t"
        "LDR	r12, [%[a], #84]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #84]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+22] += m[22] * mu */
        "LDR	r9, [%[m], #88]\n\t"
        "LDR	r12, [%[a], #88]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #88]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+23] += m[23] * mu */
        "LDR	r9, [%[m], #92]\n\t"
        "LDR	r12, [%[a], #92]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #92]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+24] += m[24] * mu */
        "LDR	r9, [%[m], #96]\n\t"
        "LDR	r12, [%[a], #96]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #96]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+25] += m[25] * mu */
        "LDR	r9, [%[m], #100]\n\t"
        "LDR	r12, [%[a], #100]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #100]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+26] += m[26] * mu */
        "LDR	r9, [%[m], #104]\n\t"
        "LDR	r12, [%[a], #104]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #104]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+27] += m[27] * mu */
        "LDR	r9, [%[m], #108]\n\t"
        "LDR	r12, [%[a], #108]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #108]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+28] += m[28] * mu */
        "LDR	r9, [%[m], #112]\n\t"
        "LDR	r12, [%[a], #112]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #112]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+29] += m[29] * mu */
        "LDR	r9, [%[m], #116]\n\t"
        "LDR	r12, [%[a], #116]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #116]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+30] += m[30] * mu */
        "LDR	r9, [%[m], #120]\n\t"
        "LDR	r12, [%[a], #120]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #120]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+31] += m[31] * mu */
        "LDR	r9, [%[m], #124]\n\t"
        "LDR	r12, [%[a], #124]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #124]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+32] += m[32] * mu */
        "LDR	r9, [%[m], #128]\n\t"
        "LDR	r12, [%[a], #128]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #128]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+33] += m[33] * mu */
        "LDR	r9, [%[m], #132]\n\t"
        "LDR	r12, [%[a], #132]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #132]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+34] += m[34] * mu */
        "LDR	r9, [%[m], #136]\n\t"
        "LDR	r12, [%[a], #136]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #136]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+35] += m[35] * mu */
        "LDR	r9, [%[m], #140]\n\t"
        "LDR	r12, [%[a], #140]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #140]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+36] += m[36] * mu */
        "LDR	r9, [%[m], #144]\n\t"
        "LDR	r12, [%[a], #144]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #144]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+37] += m[37] * mu */
        "LDR	r9, [%[m], #148]\n\t"
        "LDR	r12, [%[a], #148]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #148]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+38] += m[38] * mu */
        "LDR	r9, [%[m], #152]\n\t"
        "LDR	r12, [%[a], #152]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #152]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+39] += m[39] * mu */
        "LDR	r9, [%[m], #156]\n\t"
        "LDR	r12, [%[a], #156]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #156]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+40] += m[40] * mu */
        "LDR	r9, [%[m], #160]\n\t"
        "LDR	r12, [%[a], #160]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #160]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+41] += m[41] * mu */
        "LDR	r9, [%[m], #164]\n\t"
        "LDR	r12, [%[a], #164]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #164]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+42] += m[42] * mu */
        "LDR	r9, [%[m], #168]\n\t"
        "LDR	r12, [%[a], #168]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #168]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+43] += m[43] * mu */
        "LDR	r9, [%[m], #172]\n\t"
        "LDR	r12, [%[a], #172]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #172]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+44] += m[44] * mu */
        "LDR	r9, [%[m], #176]\n\t"
        "LDR	r12, [%[a], #176]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #176]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+45] += m[45] * mu */
        "LDR	r9, [%[m], #180]\n\t"
        "LDR	r12, [%[a], #180]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #180]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+46] += m[46] * mu */
        "LDR	r9, [%[m], #184]\n\t"
        "LDR	r12, [%[a], #184]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #184]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+47] += m[47] * mu */
        "LDR	r9, [%[m], #188]\n\t"
        "LDR	r12, [%[a], #188]\n\t"
        /* r9:r8 = mu * m[47]; fold in the running carry (r7) and the carry
         * left over from the previous pass (r3) */
        "UMULL	r8, r9, r10, r9\n\t"
        "ADDS	r7, r7, r8\n\t"
        "ADCS	r6, r9, r3\n\t"
        /* r3 = carry out of the addition above */
        "MOV	r3, #0x0\n\t"
        "ADC	r3, r3, r3\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #188]\n\t"
        /* a[i+48] += high word, plus carry from a[i+47] */
        "LDR	r12, [%[a], #192]\n\t"
        "ADCS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #192]\n\t"
        /* Accumulate the carry out of a[i+48] for the next pass */
        "ADC	r3, r3, #0x0\n\t"
        /* i += 1 */
        "ADD	r11, r11, #0x4\n\t"
        "ADD	%[a], %[a], #0x4\n\t"
        /* 0xc0 = 48 words * 4 bytes */
        "CMP	r11, #0xc0\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_3072_mont_reduce_48_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.W	L_sp_3072_mont_reduce_48_word\n\t"
#else
        "BLT.W	L_sp_3072_mont_reduce_48_word_%=\n\t"
#endif
        /* Loop Done */
        /* Write back the two words that were still held in registers */
        "STR	r4, [%[a]]\n\t"
        "STR	r5, [%[a], #4]\n\t"
        /* Return the final carry through the mp operand */
        "MOV	%[mp], r3\n\t"
        : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
    );
    /* a was advanced by 48 words in the loop: subtract m from the upper half
     * when the final carry (now in mp) is set, writing to the lower half. */
    sp_3072_cond_sub_48(a - 48, a, m, (sp_digit)0 - mp);
}

#else
/* Reduce the number back to 3072 bits using Montgomery reduction.
 *
 * Size-optimized variant for targets without UMAAL: the inner loop handles
 * four words of m per iteration. For each of the 48 low words a multiple mu
 * of m is added so that the word becomes zero; the carry out of each pass is
 * added into a[i+48]. The asm therefore accesses a[0] through a[95]. A final
 * conditional subtraction of m writes the result to the first 48 words of a.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
#else
SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin each argument to the named register for use as an asm operand. */
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r11 = m[0]; the inner loop below re-reads m[j] from memory */
        "LDR	r11, [%[m]]\n\t"
        /* i = 0 */
        "MOV	r9, #0x0\n\t"
        /* ca = 0 */
        "MOV	r3, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_mont_reduce_48_word:\n\t"
#else
    "L_sp_3072_mont_reduce_48_word_%=:\n\t"
#endif
        /* mu = a[i] * mp */
        "LDR	r10, [%[a]]\n\t"
        "MUL	r8, %[mp], r10\n\t"
        /* j = 0 */
        "MOV	r12, #0x0\n\t"
        /* r4 = running carry word of the multiply-accumulate */
        "MOV	r4, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_mont_reduce_48_mul:\n\t"
#else
    "L_sp_3072_mont_reduce_48_mul_%=:\n\t"
#endif
        /* a[i+j+0] += m[j+0] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "MOV	r5, #0x0\n\t"
        "UMLAL	r10, r5, r8, r7\n\t"
        "ADDS	r10, r10, r4\n\t"
        "STR	r10, [%[a], r12]\n\t"
        "ADC	r4, r5, #0x0\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+1] += m[j+1] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "MOV	r5, #0x0\n\t"
        "UMLAL	r10, r5, r8, r7\n\t"
        "ADDS	r10, r10, r4\n\t"
        "STR	r10, [%[a], r12]\n\t"
        "ADC	r4, r5, #0x0\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+2] += m[j+2] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "MOV	r5, #0x0\n\t"
        "UMLAL	r10, r5, r8, r7\n\t"
        "ADDS	r10, r10, r4\n\t"
        "STR	r10, [%[a], r12]\n\t"
        "ADC	r4, r5, #0x0\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+3] += m[j+3] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "MOV	r5, #0x0\n\t"
        "UMLAL	r10, r5, r8, r7\n\t"
        "ADDS	r10, r10, r4\n\t"
        "STR	r10, [%[a], r12]\n\t"
        "ADC	r4, r5, #0x0\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* 0xc0 = 48 words * 4 bytes */
        "CMP	r12, #0xc0\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_3072_mont_reduce_48_mul_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_3072_mont_reduce_48_mul\n\t"
#else
        "BLT.N	L_sp_3072_mont_reduce_48_mul_%=\n\t"
#endif
        /* a[i+48] += carry word (r4) + carry from the previous pass (r3) */
        "LDR	r10, [%[a], #192]\n\t"
        "ADDS	r4, r4, r3\n\t"
        /* ca = carry out of (r4 + ca). r3 is zero here, so doubling it in
         * the ADC contributes nothing and only the flag is captured. */
        "MOV	r3, #0x0\n\t"
        "ADC	r3, r3, r3\n\t"
        "ADDS	r10, r10, r4\n\t"
        "STR	r10, [%[a], #192]\n\t"
        /* ca += carry out of a[i+48]. This must add the flag only: r3 may
         * already be 1, so it must not be doubled at this point. */
        "ADC	r3, r3, #0x0\n\t"
        /* i += 1 */
        "ADD	r9, r9, #0x4\n\t"
        "ADD	%[a], %[a], #0x4\n\t"
        /* 0xc0 = 48 words * 4 bytes */
        "CMP	r9, #0xc0\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_3072_mont_reduce_48_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_3072_mont_reduce_48_word\n\t"
#else
        "BLT.N	L_sp_3072_mont_reduce_48_word_%=\n\t"
#endif
        /* Loop Done */
        /* Return the final carry through the mp operand */
        "MOV	%[mp], r3\n\t"
        : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
    );
    /* a was advanced by 48 words in the loop: subtract m from the upper half
     * when the final carry (now in mp) is set, writing to the lower half. */
    sp_3072_cond_sub_48(a - 48, a, m, (sp_digit)0 - mp);
}

#endif /* !WOLFSSL_SP_SMALL */
#else
#ifndef WOLFSSL_SP_SMALL
/* Reduce the number back to 48 words using Montgomery reduction.
 *
 * Fully unrolled variant. Every pass of the outer loop computes
 * mu = a[i] * mp and adds mu * m[0..47] to a[i..i+47]; the resulting a[i]
 * word is discarded rather than stored, and the carry is pushed into
 * a[i+48]. The words a[i+0..i+4] are held in r6-r10 across passes and are
 * only written back to memory after the loop has finished.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
#else
SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0-r2 so the assembly can use them directly. */
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* i = 0 */
        /* r4 = i as a byte offset, r5 = carry saved from the previous pass */
        "MOV	r4, #0x0\n\t"
        "MOV	r5, #0x0\n\t"
        /* Cache a[0..4] in r6-r10 */
        "LDR	r6, [%[a]]\n\t"
        "LDR	r7, [%[a], #4]\n\t"
        "LDR	r8, [%[a], #8]\n\t"
        "LDR	r9, [%[a], #12]\n\t"
        "LDR	r10, [%[a], #16]\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_mont_reduce_48_word:\n\t"
#else
    "L_sp_3072_mont_reduce_48_word_%=:\n\t"
#endif
        /* mu = a[i] * mp */
        "MUL	lr, %[mp], r6\n\t"
        /* a[i+0] += m[0] * mu */
        /* Only the carry in r3 is kept; r6 is overwritten just below. */
        "LDR	r12, [%[m]]\n\t"
        "MOV	r3, #0x0\n\t"
        "UMAAL	r6, r3, lr, r12\n\t"
        /* a[i+1] += m[1] * mu */
        /* Result moves down one register: it is a[i+0] of the next pass. */
        "LDR	r12, [%[m], #4]\n\t"
        "MOV	r6, r7\n\t"
        "UMAAL	r6, r3, lr, r12\n\t"
        /* a[i+2] += m[2] * mu */
        "LDR	r12, [%[m], #8]\n\t"
        "MOV	r7, r8\n\t"
        "UMAAL	r7, r3, lr, r12\n\t"
        /* a[i+3] += m[3] * mu */
        "LDR	r12, [%[m], #12]\n\t"
        "MOV	r8, r9\n\t"
        "UMAAL	r8, r3, lr, r12\n\t"
        /* a[i+4] += m[4] * mu */
        "LDR	r12, [%[m], #16]\n\t"
        "MOV	r9, r10\n\t"
        "UMAAL	r9, r3, lr, r12\n\t"
        /* a[i+5] += m[5] * mu */
        /* Loaded from memory and kept in r10 for the next pass. */
        "LDR	r12, [%[m], #20]\n\t"
        "LDR	r10, [%[a], #20]\n\t"
        "UMAAL	r10, r3, lr, r12\n\t"
        /* a[i+6] += m[6] * mu */
        /* From here on each word is loaded, updated and stored via r11. */
        "LDR	r12, [%[m], #24]\n\t"
        "LDR	r11, [%[a], #24]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #24]\n\t"
        /* a[i+7] += m[7] * mu */
        "LDR	r12, [%[m], #28]\n\t"
        "LDR	r11, [%[a], #28]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #28]\n\t"
        /* a[i+8] += m[8] * mu */
        "LDR	r12, [%[m], #32]\n\t"
        "LDR	r11, [%[a], #32]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #32]\n\t"
        /* a[i+9] += m[9] * mu */
        "LDR	r12, [%[m], #36]\n\t"
        "LDR	r11, [%[a], #36]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #36]\n\t"
        /* a[i+10] += m[10] * mu */
        "LDR	r12, [%[m], #40]\n\t"
        "LDR	r11, [%[a], #40]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #40]\n\t"
        /* a[i+11] += m[11] * mu */
        "LDR	r12, [%[m], #44]\n\t"
        "LDR	r11, [%[a], #44]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #44]\n\t"
        /* a[i+12] += m[12] * mu */
        "LDR	r12, [%[m], #48]\n\t"
        "LDR	r11, [%[a], #48]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #48]\n\t"
        /* a[i+13] += m[13] * mu */
        "LDR	r12, [%[m], #52]\n\t"
        "LDR	r11, [%[a], #52]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #52]\n\t"
        /* a[i+14] += m[14] * mu */
        "LDR	r12, [%[m], #56]\n\t"
        "LDR	r11, [%[a], #56]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #56]\n\t"
        /* a[i+15] += m[15] * mu */
        "LDR	r12, [%[m], #60]\n\t"
        "LDR	r11, [%[a], #60]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #60]\n\t"
        /* a[i+16] += m[16] * mu */
        "LDR	r12, [%[m], #64]\n\t"
        "LDR	r11, [%[a], #64]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #64]\n\t"
        /* a[i+17] += m[17] * mu */
        "LDR	r12, [%[m], #68]\n\t"
        "LDR	r11, [%[a], #68]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #68]\n\t"
        /* a[i+18] += m[18] * mu */
        "LDR	r12, [%[m], #72]\n\t"
        "LDR	r11, [%[a], #72]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #72]\n\t"
        /* a[i+19] += m[19] * mu */
        "LDR	r12, [%[m], #76]\n\t"
        "LDR	r11, [%[a], #76]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #76]\n\t"
        /* a[i+20] += m[20] * mu */
        "LDR	r12, [%[m], #80]\n\t"
        "LDR	r11, [%[a], #80]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #80]\n\t"
        /* a[i+21] += m[21] * mu */
        "LDR	r12, [%[m], #84]\n\t"
        "LDR	r11, [%[a], #84]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #84]\n\t"
        /* a[i+22] += m[22] * mu */
        "LDR	r12, [%[m], #88]\n\t"
        "LDR	r11, [%[a], #88]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #88]\n\t"
        /* a[i+23] += m[23] * mu */
        "LDR	r12, [%[m], #92]\n\t"
        "LDR	r11, [%[a], #92]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #92]\n\t"
        /* a[i+24] += m[24] * mu */
        "LDR	r12, [%[m], #96]\n\t"
        "LDR	r11, [%[a], #96]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #96]\n\t"
        /* a[i+25] += m[25] * mu */
        "LDR	r12, [%[m], #100]\n\t"
        "LDR	r11, [%[a], #100]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #100]\n\t"
        /* a[i+26] += m[26] * mu */
        "LDR	r12, [%[m], #104]\n\t"
        "LDR	r11, [%[a], #104]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #104]\n\t"
        /* a[i+27] += m[27] * mu */
        "LDR	r12, [%[m], #108]\n\t"
        "LDR	r11, [%[a], #108]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #108]\n\t"
        /* a[i+28] += m[28] * mu */
        "LDR	r12, [%[m], #112]\n\t"
        "LDR	r11, [%[a], #112]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #112]\n\t"
        /* a[i+29] += m[29] * mu */
        "LDR	r12, [%[m], #116]\n\t"
        "LDR	r11, [%[a], #116]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #116]\n\t"
        /* a[i+30] += m[30] * mu */
        "LDR	r12, [%[m], #120]\n\t"
        "LDR	r11, [%[a], #120]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #120]\n\t"
        /* a[i+31] += m[31] * mu */
        "LDR	r12, [%[m], #124]\n\t"
        "LDR	r11, [%[a], #124]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #124]\n\t"
        /* a[i+32] += m[32] * mu */
        "LDR	r12, [%[m], #128]\n\t"
        "LDR	r11, [%[a], #128]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #128]\n\t"
        /* a[i+33] += m[33] * mu */
        "LDR	r12, [%[m], #132]\n\t"
        "LDR	r11, [%[a], #132]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #132]\n\t"
        /* a[i+34] += m[34] * mu */
        "LDR	r12, [%[m], #136]\n\t"
        "LDR	r11, [%[a], #136]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #136]\n\t"
        /* a[i+35] += m[35] * mu */
        "LDR	r12, [%[m], #140]\n\t"
        "LDR	r11, [%[a], #140]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #140]\n\t"
        /* a[i+36] += m[36] * mu */
        "LDR	r12, [%[m], #144]\n\t"
        "LDR	r11, [%[a], #144]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #144]\n\t"
        /* a[i+37] += m[37] * mu */
        "LDR	r12, [%[m], #148]\n\t"
        "LDR	r11, [%[a], #148]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #148]\n\t"
        /* a[i+38] += m[38] * mu */
        "LDR	r12, [%[m], #152]\n\t"
        "LDR	r11, [%[a], #152]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #152]\n\t"
        /* a[i+39] += m[39] * mu */
        "LDR	r12, [%[m], #156]\n\t"
        "LDR	r11, [%[a], #156]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #156]\n\t"
        /* a[i+40] += m[40] * mu */
        "LDR	r12, [%[m], #160]\n\t"
        "LDR	r11, [%[a], #160]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #160]\n\t"
        /* a[i+41] += m[41] * mu */
        "LDR	r12, [%[m], #164]\n\t"
        "LDR	r11, [%[a], #164]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #164]\n\t"
        /* a[i+42] += m[42] * mu */
        "LDR	r12, [%[m], #168]\n\t"
        "LDR	r11, [%[a], #168]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #168]\n\t"
        /* a[i+43] += m[43] * mu */
        "LDR	r12, [%[m], #172]\n\t"
        "LDR	r11, [%[a], #172]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #172]\n\t"
        /* a[i+44] += m[44] * mu */
        "LDR	r12, [%[m], #176]\n\t"
        "LDR	r11, [%[a], #176]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #176]\n\t"
        /* a[i+45] += m[45] * mu */
        "LDR	r12, [%[m], #180]\n\t"
        "LDR	r11, [%[a], #180]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #180]\n\t"
        /* a[i+46] += m[46] * mu */
        "LDR	r12, [%[m], #184]\n\t"
        "LDR	r11, [%[a], #184]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #184]\n\t"
        /* a[i+47] += m[47] * mu */
        "LDR	r12, [%[m], #188]\n\t"
        "LDR	r11, [%[a], #188]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        /* mu (lr) is no longer needed: reuse lr for a[i+48]. With both
         * multiplicands zero, UMAAL is a plain add: lr:r3 = r3 + a[i+48]. */
        "LDR	lr, [%[a], #192]\n\t"
        "MOV	r12, #0x0\n\t"
        "UMAAL	r3, lr, r12, r12\n\t"
        "STR	r11, [%[a], #188]\n\t"
        /* Add the carry saved from the previous pass; r5 = new carry. */
        "ADDS	r3, r3, r5\n\t"
        "ADC	r5, lr, #0x0\n\t"
        "STR	r3, [%[a], #192]\n\t"
        /* i += 1 */
        "ADD	r4, r4, #0x4\n\t"
        "ADD	%[a], %[a], #0x4\n\t"
        /* 0xc0 bytes = 48 words */
        "CMP	r4, #0xc0\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_3072_mont_reduce_48_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.W	L_sp_3072_mont_reduce_48_word\n\t"
#else
        "BLT.W	L_sp_3072_mont_reduce_48_word_%=\n\t"
#endif
        /* Loop Done */
        /* a has advanced by 48 words; write back the five cached words. */
        "STR	r6, [%[a]]\n\t"
        "STR	r7, [%[a], #4]\n\t"
        "STR	r8, [%[a], #8]\n\t"
        "STR	r9, [%[a], #12]\n\t"
        "STR	r10, [%[a], #16]\n\t"
        /* Hand the final carry back to C through mp. */
        "MOV	%[mp], r5\n\t"
        : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
    );
    /* mp now holds the final carry, so 0 - mp is zero or all ones. The upper
     * 48 words (at a) are the input and the result goes to the original
     * start of the buffer (a - 48). Presumably m is subtracted only when the
     * mask is set - confirm against sp_3072_cond_sub_48. */
    sp_3072_cond_sub_48(a - 48, a, m, (sp_digit)0 - mp);
}

#else
/* Reduce the number back to 48 words using Montgomery reduction.
 *
 * Small code size variant: the outer loop walks the 48 low words of a, the
 * inner loop (unrolled four times) adds mu * m[j] to a[i+j] for j = 0..47.
 * The carry out of each pass is added to a[i+48] and the carry out of that
 * addition (ca) is fed into the next pass.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
#else
SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0-r2 so the assembly can use them directly. */
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r11 = m[0]; not read again in this routine */
        "LDR	r11, [%[m]]\n\t"
        /* i = 0 */
        "MOV	r9, #0x0\n\t"
        /* ca = 0 */
        "MOV	r3, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_mont_reduce_48_word:\n\t"
#else
    "L_sp_3072_mont_reduce_48_word_%=:\n\t"
#endif
        /* mu = a[i] * mp */
        "LDR	r10, [%[a]]\n\t"
        "MUL	r8, %[mp], r10\n\t"
        /* j = 0 */
        /* r12 = j as a byte offset, r4 = running carry of this pass */
        "MOV	r12, #0x0\n\t"
        "MOV	r4, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_mont_reduce_48_mul:\n\t"
#else
    "L_sp_3072_mont_reduce_48_mul_%=:\n\t"
#endif
        /* a[i+j+0] += m[j+0] * mu */
        /* UMAAL: r4:r10 = mu * m[j] + a[i+j] + r4 */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "UMAAL	r10, r4, r8, r7\n\t"
        "STR	r10, [%[a], r12]\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+1] += m[j+1] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "UMAAL	r10, r4, r8, r7\n\t"
        "STR	r10, [%[a], r12]\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+2] += m[j+2] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "UMAAL	r10, r4, r8, r7\n\t"
        "STR	r10, [%[a], r12]\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+3] += m[j+3] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "UMAAL	r10, r4, r8, r7\n\t"
        "STR	r10, [%[a], r12]\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* 0xc0 bytes = 48 words */
        "CMP	r12, #0xc0\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_3072_mont_reduce_48_mul_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_3072_mont_reduce_48_mul\n\t"
#else
        "BLT.N	L_sp_3072_mont_reduce_48_mul_%=\n\t"
#endif
        /* a[i+48] += r4 + ca; ca = carry out of that sum */
        "LDR	r10, [%[a], #192]\n\t"
        /* r4 += ca; r3 = carry of this first addition */
        "ADDS	r4, r4, r3\n\t"
        "MOV	r3, #0x0\n\t"
        "ADC	r3, r3, #0x0\n\t"
        "ADDS	r10, r10, r4\n\t"
        /* Accumulate the carry of the second addition. The two carries can
         * never both be set (a carry from the first addition leaves r4 zero),
         * so the sum is 0 or 1. Adding r3 to itself here would turn a carry
         * from the first addition into 2 and corrupt the next pass and the
         * final mask. */
        "ADC	r3, r3, #0x0\n\t"
        "STR	r10, [%[a], #192]\n\t"
        /* i += 1 */
        "ADD	r9, r9, #0x4\n\t"
        "ADD	%[a], %[a], #0x4\n\t"
        "CMP	r9, #0xc0\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_3072_mont_reduce_48_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_3072_mont_reduce_48_word\n\t"
#else
        "BLT.N	L_sp_3072_mont_reduce_48_word_%=\n\t"
#endif
        /* Loop Done */
        /* Hand the final carry back to C through mp. */
        "MOV	%[mp], r3\n\t"
        : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
    );
    /* a has advanced by 48 words. mp holds the final carry (0 or 1), so
     * 0 - mp is zero or all ones. Presumably m is subtracted only when the
     * mask is set - confirm against sp_3072_cond_sub_48. */
    sp_3072_cond_sub_48(a - 48, a, m, (sp_digit)0 - mp);
}

#endif /* !WOLFSSL_SP_SMALL */
#endif
/* Montgomery multiplication of two 48 word numbers.
 * (r = a * b mod m, all values in Montgomery form)
 *
 * The product is written to r first and then reduced in place, so r is used
 * as working space for the unreduced product as well as for the result.
 *
 * r   Receives the reduced product.
 * a   First operand, in Montgomery form.
 * b   Second operand, in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier passed on to the reduction step.
 */
SP_NOINLINE static void sp_3072_mont_mul_48(sp_digit* r,
        const sp_digit* a, const sp_digit* b, const sp_digit* m,
        sp_digit mp)
{
    /* Step 1: plain multiplication into r. */
    sp_3072_mul_48(r, a, b);

    /* Step 2: Montgomery reduction of r modulo m, in place. */
    sp_3072_mont_reduce_48(r, m, mp);
}

/* Montgomery squaring of a 48 word number.
 * (r = a * a mod m, all values in Montgomery form)
 *
 * The square is written to r first and then reduced in place, so r is used
 * as working space for the unreduced square as well as for the result.
 *
 * r   Receives the reduced square.
 * a   Operand to square, in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier passed on to the reduction step.
 */
SP_NOINLINE static void sp_3072_mont_sqr_48(sp_digit* r,
        const sp_digit* a, const sp_digit* m, sp_digit mp)
{
    /* Step 1: plain squaring into r. */
    sp_3072_sqr_48(r, a);

    /* Step 2: Montgomery reduction of r modulo m, in place. */
    sp_3072_mont_reduce_48(r, m, mp);
}

#ifdef WOLFSSL_SP_SMALL
/* Mul a by digit b into r. (r = a * b)
 *
 * Looped (small code size) variant. Reads 48 words from a and writes 49
 * words to r: r[0..47] plus the final carry word at r[48].
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision digit.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_3072_mul_d_48(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
#else
static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a, sp_digit b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0-r2 so the assembly can use them directly. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* A[0] * B */
        /* r3:r5 = b * a[0]; the low word is stored, r3 carries forward */
        "LDR	r8, [%[a]]\n\t"
        "UMULL	r5, r3, %[b], r8\n\t"
        "MOV	r4, #0x0\n\t"
        "STR	r5, [%[r]]\n\t"
        "MOV	r5, #0x0\n\t"
        /* r9 = byte offset of the current word, starting at word 1 */
        "MOV	r9, #0x4\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_mul_d_48_word:\n\t"
#else
    "L_sp_3072_mul_d_48_word_%=:\n\t"
#endif
        /* A[i] * B */
        /* Add the 64-bit product into the three word accumulator r5:r4:r3 */
        "LDR	r8, [%[a], r9]\n\t"
        "UMULL	r6, r7, %[b], r8\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        "STR	r3, [%[r], r9]\n\t"
        /* Shift the accumulator down one word for the next position */
        "MOV	r3, r4\n\t"
        "MOV	r4, r5\n\t"
        "MOV	r5, #0x0\n\t"
        "ADD	r9, r9, #0x4\n\t"
        /* 0xc0 bytes = 48 words */
        "CMP	r9, #0xc0\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_3072_mul_d_48_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_3072_mul_d_48_word\n\t"
#else
        "BLT.N	L_sp_3072_mul_d_48_word_%=\n\t"
#endif
        /* Final carry word goes to r[48] */
        "STR	r3, [%[r], #192]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
    );
}

#else
/* Mul a by digit b into r. (r = a * b)
 *
 * Fully unrolled variant. Reads 48 words from a and writes 49 words to r:
 * r[0..47] plus the final carry word at r[48]. The local register copies of
 * a and r are advanced by the LDM/STM writeback forms as the words are
 * processed. r3, r4 and r5 take turns as the low word being finished, the
 * high word receiving the carry and the freshly zeroed word for the next
 * step.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision digit.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_3072_mul_d_48(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
#else
static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a, sp_digit b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0-r2 so the assembly can use them directly. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* A[0] * B */
        /* r4:r3 = b * a[0]; store the low word, clear the next high word */
        "LDM	%[a]!, {r8}\n\t"
        "UMULL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[1] * B */
        /* r5:r4 += b * a[1]; the same pattern repeats with rotated registers */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[2] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[3] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[4] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[5] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[6] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[7] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[8] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[9] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[10] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[11] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[12] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[13] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[14] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[15] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[16] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[17] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[18] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[19] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[20] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[21] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[22] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[23] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[24] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[25] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[26] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[27] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[28] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[29] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[30] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[31] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[32] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[33] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[34] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[35] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[36] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[37] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[38] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[39] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[40] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[41] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[42] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[43] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[44] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[45] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[46] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[47] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        /* Final carry word goes to r[48] (r has advanced by 48 words) */
        "STR	r3, [%[r]]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "cc"
    );
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_USE_UDIV
/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
 *
 * Variant using the hardware UDIV instruction. The quotient is built up in
 * r6 from a series of estimates: each estimate divides the current top of
 * the remainder by (div >> 16) + 1, and the matching multiple of div is then
 * subtracted from the remainder held in d1|d0.
 *
 * d1   The high order half of the number to divide.
 * d0   The low order half of the number to divide.
 * div  The divisor.
 * returns the result of the division.
 *
 * Note that this is an approximate div. It may give an answer 1 larger.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
#else
SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, sp_digit div)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0-r2 so the assembly can use them directly. */
    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r8 = top half of div, r5 = r8 + 1 is the divisor for estimates */
        "LSR	r8, %[div], #16\n\t"
        "ADD	r5, r8, #0x1\n\t"
        /* First estimate: r6 = (d1 / r5) << 16; r7 = div << 16 */
        "UDIV	r6, %[d1], r5\n\t"
        "LSL	r7, %[div], #16\n\t"
        "LSL	r6, r6, #16\n\t"
        /* d1|d0 -= div * r6 */
        "UMULL	r3, r4, %[div], r6\n\t"
        "SUBS	%[d0], %[d0], r3\n\t"
        "SBC	%[d1], %[d1], r4\n\t"
        /* r9 = 1 if d1 >= r5, else 0; r10 = matching all-ones/zero mask */
        "SUBS	r3, %[d1], r5\n\t"
        "SBC	r9, r9, r9\n\t"
        "ADD	r9, r9, #0x1\n\t"
        "RSB	r10, r9, #0x0\n\t"
        "LSL	r9, r9, #16\n\t"
        /* Branch-free fix-up: under the mask, subtract div << 16 from d1|d0
         * and add 1 << 16 to the quotient */
        "AND	r7, r7, r10\n\t"
        "AND	r8, r8, r10\n\t"
        "SUBS	%[d0], %[d0], r7\n\t"
        "ADD	r6, r6, r9\n\t"
        "SBC	%[d1], %[d1], r8\n\t"
        /* Next estimate from the middle 32 bits of the remainder */
        "LSL	r4, %[d1], #16\n\t"
        "LSR	r3, %[d0], #16\n\t"
        "ORR	r3, r3, r4\n\t"
        "UDIV	r3, r3, r5\n\t"
        "ADD	r6, r6, r3\n\t"
        "UMULL	r3, r4, %[div], r3\n\t"
        "SUBS	%[d0], %[d0], r3\n\t"
        "SBC	%[d1], %[d1], r4\n\t"
        /* And once more; only the low word of the product is used here */
        "LSL	r4, %[d1], #16\n\t"
        "LSR	r3, %[d0], #16\n\t"
        "ORR	r3, r3, r4\n\t"
        "UDIV	r3, r3, r5\n\t"
        "ADD	r6, r6, r3\n\t"
        "MUL	r3, %[div], r3\n\t"
        "SUB	%[d0], %[d0], r3\n\t"
        /* Final step divides the remaining low word by div itself */
        "UDIV	r3, %[d0], %[div]\n\t"
        /* The quotient is returned through d1 */
        "ADD	%[d1], r6, r3\n\t"
        : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    return (word32)(size_t)d1;
}

#else
/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
 *
 * Variant without a hardware divide. A bit-by-bit, branch-free loop builds a
 * first quotient estimate in r3 using (div >> 1) + 1 as the comparison
 * value; the estimate is then refined three times by multiplying it back
 * with div and adding the high word of the difference.
 *
 * d1   The high order half of the number to divide.
 * d0   The low order half of the number to divide.
 * div  The divisor.
 * returns the result of the division.
 *
 * Note that this is an approximate div. It may give an answer 1 larger.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
#else
SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, sp_digit div)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0-r2 so the assembly can use them directly. */
    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r5 = (div >> 1) + 1; r7|r6 = working copy of d1|d0 */
        "LSR	r5, %[div], #1\n\t"
        "ADD	r5, r5, #0x1\n\t"
        "MOV	r6, %[d0]\n\t"
        "MOV	r7, %[d1]\n\t"
        /* Do top 32 */
        /* r8 = all ones when r7 > r5, else 0; r3 gets the quotient bit and
         * r5 is subtracted from r7 under that mask */
        "SUBS	r8, r5, r7\n\t"
        "SBC	r8, r8, r8\n\t"
        "MOV	r3, #0x0\n\t"
        "SUB	r3, r3, r8\n\t"
        "AND	r8, r8, r5\n\t"
        "SUBS	r7, r7, r8\n\t"
        /* Next 30 bits */
        /* r4 counts down from 29 to 0, giving 30 passes */
        "MOV	r4, #0x1d\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_div_3072_word_48_bit:\n\t"
#else
    "L_div_3072_word_48_bit_%=:\n\t"
#endif
        /* Shift r7|r6 left one bit, then repeat the masked compare and
         * subtract, shifting the new quotient bit into r3 */
        "LSLS	r6, r6, #1\n\t"
        "ADC	r7, r7, r7\n\t"
        "SUBS	r8, r5, r7\n\t"
        "SBC	r8, r8, r8\n\t"
        "ADD	r3, r3, r3\n\t"
        "SUB	r3, r3, r8\n\t"
        "AND	r8, r8, r5\n\t"
        "SUBS	r7, r7, r8\n\t"
        "SUBS	r4, r4, #0x1\n\t"
#if defined(__GNUC__)
        "BPL	L_div_3072_word_48_bit_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BPL.N	L_div_3072_word_48_bit\n\t"
#else
        "BPL.N	L_div_3072_word_48_bit_%=\n\t"
#endif
        /* r3 = 2 * r3 + 1 */
        "ADD	r3, r3, r3\n\t"
        "ADD	r3, r3, #0x1\n\t"
        /* Refinement 1: r10|r9 = d1|d0 - r3 * div; r3 += r10 */
        "UMULL	r6, r7, r3, %[div]\n\t"
        "SUBS	r9, %[d0], r6\n\t"
        "SBC	r10, %[d1], r7\n\t"
        "ADD	r3, r3, r10\n\t"
        /* Refinement 2 */
        "UMULL	r6, r7, r3, %[div]\n\t"
        "SUBS	r9, %[d0], r6\n\t"
        "SBC	r10, %[d1], r7\n\t"
        "ADD	r3, r3, r10\n\t"
        /* Refinement 3 */
        "UMULL	r6, r7, r3, %[div]\n\t"
        "SUBS	r9, %[d0], r6\n\t"
        "SBC	r10, %[d1], r7\n\t"
        "ADD	r3, r3, r10\n\t"
        /* Add one more when the low word of the remainder (r9) exceeds div;
         * the quotient is returned through d1 */
        "SUBS	r8, %[div], r9\n\t"
        "SBC	r8, r8, r8\n\t"
        "SUB	%[d1], r3, r8\n\t"
        : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    return (word32)(size_t)d1;
}

#endif
/* Compare a with b in constant time.
 *
 * Words are compared from the most significant (byte offset 188) down to the
 * least significant (byte offset 0). Every word is always loaded and compared;
 * once a difference has been seen the remaining words are masked to zero so
 * they can no longer change the result.
 *
 * a  A single precision integer (48 words).
 * b  A single precision integer (48 words).
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively. The values produced by the assembly are -1, 0 and 1.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_int32 sp_3072_cmp_48(const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_int32 sp_3072_cmp_48(const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register const sp_digit* a __asm__ ("r0") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    /* Register use inside the assembly:
     *   r2  running result (starts at -1, set to 1 or to the mask value)
     *   r3  mask: all ones until the first differing word, zero afterwards
     *   r7  constant 0, r8 constant 1
     *   r4, r5  current words of a and b
     *   r6  byte offset of the current word (looped variant only)
     * The final EOR of r2 with r3 turns the initial -1 into 0 when no
     * difference was found (mask still all ones) and leaves 1 / -1 untouched
     * otherwise (mask is zero).
     */
    __asm__ __volatile__ (
        "MOV	r2, #0xffffffff\n\t"
        "MOV	r8, #0x1\n\t"
        "MOV	r7, #0x0\n\t"
        "MOV	r3, #0xffffffff\n\t"
#ifdef WOLFSSL_SP_SMALL
        /* Looped variant: start at the top word (0xbc = 188 = 47 * 4). */
        "MOV	r6, #0xbc\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_cmp_48_words:\n\t"
#else
    "L_sp_3072_cmp_48_words_%=:\n\t"
#endif
        "LDR	r4, [%[a], r6]\n\t"
        "LDR	r5, [%[b], r6]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        /* Step down one word; carry stays set until the offset goes below 0. */
        "SUBS	r6, r6, #0x4\n\t"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "bcs	L_sp_3072_cmp_48_words\n\t"
#else
        "bcs	L_sp_3072_cmp_48_words_%=\n\t"
#endif
        "EOR	r2, r2, r3\n\t"
#else
        /* Unrolled variant: the same 11-instruction step for each of the 48
         * words, from byte offset 188 down to 0. */
        "LDR	r4, [%[a], #188]\n\t"
        "LDR	r5, [%[b], #188]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #184]\n\t"
        "LDR	r5, [%[b], #184]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #180]\n\t"
        "LDR	r5, [%[b], #180]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #176]\n\t"
        "LDR	r5, [%[b], #176]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #172]\n\t"
        "LDR	r5, [%[b], #172]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #168]\n\t"
        "LDR	r5, [%[b], #168]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #164]\n\t"
        "LDR	r5, [%[b], #164]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #160]\n\t"
        "LDR	r5, [%[b], #160]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #156]\n\t"
        "LDR	r5, [%[b], #156]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #152]\n\t"
        "LDR	r5, [%[b], #152]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #148]\n\t"
        "LDR	r5, [%[b], #148]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #144]\n\t"
        "LDR	r5, [%[b], #144]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #140]\n\t"
        "LDR	r5, [%[b], #140]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #136]\n\t"
        "LDR	r5, [%[b], #136]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #132]\n\t"
        "LDR	r5, [%[b], #132]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #128]\n\t"
        "LDR	r5, [%[b], #128]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #124]\n\t"
        "LDR	r5, [%[b], #124]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #120]\n\t"
        "LDR	r5, [%[b], #120]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #116]\n\t"
        "LDR	r5, [%[b], #116]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #112]\n\t"
        "LDR	r5, [%[b], #112]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #108]\n\t"
        "LDR	r5, [%[b], #108]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #104]\n\t"
        "LDR	r5, [%[b], #104]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #100]\n\t"
        "LDR	r5, [%[b], #100]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #96]\n\t"
        "LDR	r5, [%[b], #96]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #92]\n\t"
        "LDR	r5, [%[b], #92]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #88]\n\t"
        "LDR	r5, [%[b], #88]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #84]\n\t"
        "LDR	r5, [%[b], #84]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #80]\n\t"
        "LDR	r5, [%[b], #80]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #76]\n\t"
        "LDR	r5, [%[b], #76]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #72]\n\t"
        "LDR	r5, [%[b], #72]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #68]\n\t"
        "LDR	r5, [%[b], #68]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #64]\n\t"
        "LDR	r5, [%[b], #64]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #60]\n\t"
        "LDR	r5, [%[b], #60]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #56]\n\t"
        "LDR	r5, [%[b], #56]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #52]\n\t"
        "LDR	r5, [%[b], #52]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #48]\n\t"
        "LDR	r5, [%[b], #48]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #44]\n\t"
        "LDR	r5, [%[b], #44]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #40]\n\t"
        "LDR	r5, [%[b], #40]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #36]\n\t"
        "LDR	r5, [%[b], #36]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #32]\n\t"
        "LDR	r5, [%[b], #32]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #28]\n\t"
        "LDR	r5, [%[b], #28]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #24]\n\t"
        "LDR	r5, [%[b], #24]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #20]\n\t"
        "LDR	r5, [%[b], #20]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #16]\n\t"
        "LDR	r5, [%[b], #16]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #12]\n\t"
        "LDR	r5, [%[b], #12]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #8]\n\t"
        "LDR	r5, [%[b], #8]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #4]\n\t"
        "LDR	r5, [%[b], #4]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a]]\n\t"
        "LDR	r5, [%[b]]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "EOR	r2, r2, r3\n\t"
#endif /*WOLFSSL_SP_SMALL */
        /* The result is handed back through the register holding a. */
        "MOV	%[a], r2\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "cc"
    );
    return (word32)(size_t)a;
}

/* Divide d in a and put remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * The dividend is copied into a local buffer and reduced one word at a time
 * from the top: a quotient word is estimated from the top word of d, the
 * matching multiple of d is subtracted, and two masked add-backs correct the
 * estimate. The loop always runs 48 times.
 *
 * a  Number to be divided. 2 * 48 words are read from it.
 * d  Number to divide with.
 * m  Multiplier result. Not written; the parameter is unused.
 * r  Remainder from the division.
 * returns MP_OKAY indicating success.
 */
static WC_INLINE int sp_3072_div_48(const sp_digit* a, const sp_digit* d,
        sp_digit* m, sp_digit* r)
{
    sp_digit t1[96], t2[49];
    sp_digit div, r1;
    int i;

    (void)m;

    /* Top word of the divisor drives every quotient-word estimate. */
    div = d[47];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 48);
    /* Bring the upper half of the working copy below d before the loop. */
    r1 = sp_3072_cmp_48(&t1[48], d) >= 0;
    sp_3072_cond_sub_48(&t1[48], &t1[48], d, (sp_digit)0 - r1);
    for (i = 47; i >= 0; i--) {
        /* mask is all ones when the current top word equals div. In that
         * case hi becomes one less than the top word and the estimate is
         * forced to all ones below - presumably to keep the quotient word
         * within a single digit (confirm against div_3072_word_48). */
        volatile sp_digit mask = (sp_digit)0 - (t1[48 + i] == div);
        sp_digit hi = t1[48 + i] + mask;
        r1 = div_3072_word_48(hi, t1[48 + i - 1], div);
        r1 |= mask;

        /* Subtract estimate * d from the 49-word window starting at t1[i];
         * the top word of the window absorbs the returned borrow and t2[48]. */
        sp_3072_mul_d_48(t2, d, r1);
        t1[48 + i] += sp_3072_sub_in_place_48(&t1[i], t2);
        t1[48 + i] -= t2[48];
        /* Two add-back passes, each using the top word as the mask for d. */
        sp_3072_mask_48(t2, d, t1[48 + i]);
        t1[48 + i] += sp_3072_add_48(&t1[i], &t1[i], t2);
        sp_3072_mask_48(t2, d, t1[48 + i]);
        t1[48 + i] += sp_3072_add_48(&t1[i], &t1[i], t2);
    }

    /* Final conditional subtract when the low 48 words are still >= d. */
    r1 = sp_3072_cmp_48(t1, d) >= 0;
    sp_3072_cond_sub_48(r, t1, d, (sp_digit)0 - r1);

    return MP_OKAY;
}

/* Compute r = a mod m by running the division and keeping the remainder only.
 *
 * r  Receives the reduced result.
 * a  The single precision number to reduce.
 * m  The single precision modulus.
 * returns MP_OKAY indicating success.
 */
static WC_INLINE int sp_3072_mod_48(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    /* No quotient is wanted, so no quotient buffer is supplied. */
    int ret = sp_3072_div_48(a, m, NULL, r);

    return ret;
}

#ifdef WOLFSSL_SP_SMALL
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Fixed 4-bit window exponentiation in Montgomery form. A table of 16
 * powers of a is built and indexed directly by successive 4-bit groups of
 * the exponent, taken from the most significant end.
 *
 * r        A single precision number that is the result of the operation.
 *          Also used as working space: words 48..95 are written.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When non-zero, a is reduced modulo m before being converted to
 *          Montgomery form; when zero, a is used as given.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_3072_mod_exp_48(sp_digit* r, const sp_digit* a, const sp_digit* e,
        int bits, const sp_digit* m, int reduceA)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[16 * 96];
#endif
    sp_digit* t[16];
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (16 * 96), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* t[0] shares storage with norm; each entry is 96 words wide. */
        norm = td;
        for (i=0; i<16; i++) {
            t[i] = td + i * 96;
        }

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_48(norm, m);

        /* Place a in the upper half of t[1] above 48 zero words and reduce
         * the 96-word value modulo m. */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 48U);
        if (reduceA != 0) {
            err = sp_3072_mod_48(t[1] + 48, a, m);
            if (err == MP_OKAY) {
                err = sp_3072_mod_48(t[1], t[1], m);
            }
        }
        else {
            XMEMCPY(t[1] + 48, a, sizeof(sp_digit) * 48);
            err = sp_3072_mod_48(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Fill the table: even entries by squaring, odd entries by
         * multiplying two earlier entries. */
        sp_3072_mont_sqr_48(t[ 2], t[ 1], m, mp);
        sp_3072_mont_mul_48(t[ 3], t[ 2], t[ 1], m, mp);
        sp_3072_mont_sqr_48(t[ 4], t[ 2], m, mp);
        sp_3072_mont_mul_48(t[ 5], t[ 3], t[ 2], m, mp);
        sp_3072_mont_sqr_48(t[ 6], t[ 3], m, mp);
        sp_3072_mont_mul_48(t[ 7], t[ 4], t[ 3], m, mp);
        sp_3072_mont_sqr_48(t[ 8], t[ 4], m, mp);
        sp_3072_mont_mul_48(t[ 9], t[ 5], t[ 4], m, mp);
        sp_3072_mont_sqr_48(t[10], t[ 5], m, mp);
        sp_3072_mont_mul_48(t[11], t[ 6], t[ 5], m, mp);
        sp_3072_mont_sqr_48(t[12], t[ 6], m, mp);
        sp_3072_mont_mul_48(t[13], t[ 7], t[ 6], m, mp);
        sp_3072_mont_sqr_48(t[14], t[ 7], m, mp);
        sp_3072_mont_mul_48(t[15], t[ 8], t[ 7], m, mp);

        /* Work out how many bits of the top exponent word remain after the
         * first (possibly short) window has been taken. */
        i = (bits - 1) / 32;
        n = e[i--];
        c = bits & 31;
        if (c == 0) {
            c = 32;
        }
        c -= bits % 4;
        if (c == 32) {
            c = 28;
        }
        if (c < 0) {
            /* Number of bits in top word is less than number needed.
             * Digits are 32 bits wide, so the missing bits come from the top
             * of the next word and 32 - c bits of it remain afterwards.
             * (With a 4-bit window c is never negative, as 32 is a multiple
             * of 4; the branch is kept correct for 32-bit digits.) */
            c = -c;
            y = (byte)(n << c);
            n = e[i--];
            y |= (byte)(n >> (32 - c));
            n <<= c;
            c = 32 - c;
        }
        else if (c == 0) {
            /* All bits in top word used. */
            y = (byte)n;
        }
        else {
            y = (byte)(n >> c);
            n <<= 32 - c;
        }
        XMEMCPY(r, t[y], sizeof(sp_digit) * 48);
        for (; i>=0 || c>=4; ) {
            if (c == 0) {
                /* Current word exhausted: take the window from the next. */
                n = e[i--];
                y = (byte)(n >> 28);
                n <<= 4;
                c = 28;
            }
            else if (c < 4) {
                /* Window straddles two exponent words. */
                y = (byte)(n >> 28);
                n = e[i--];
                c = 4 - c;
                y |= (byte)(n >> (32 - c));
                n <<= c;
                c = 32 - c;
            }
            else {
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c -= 4;
            }

            /* Four squarings per window, then multiply by the table entry. */
            sp_3072_mont_sqr_48(r, r, m, mp);
            sp_3072_mont_sqr_48(r, r, m, mp);
            sp_3072_mont_sqr_48(r, r, m, mp);
            sp_3072_mont_sqr_48(r, r, m, mp);

            sp_3072_mont_mul_48(r, r, t[y], m, mp);
        }

        /* Convert out of Montgomery form and do a final conditional
         * subtract so the result is below m. */
        XMEMSET(&r[48], 0, sizeof(sp_digit) * 48U);
        sp_3072_mont_reduce_48(r, m, mp);

        mask = (sp_digit)0 - (sp_3072_cmp_48(r, m) >= 0);
        sp_3072_cond_sub_48(r, r, m, mask);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#else
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Fixed 5-bit window exponentiation in Montgomery form. A table of 32
 * powers of a is built and indexed directly by successive 5-bit groups of
 * the exponent, taken from the most significant end.
 *
 * r        A single precision number that is the result of the operation.
 *          Also used as working space: words 48..95 are written.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When non-zero, a is reduced modulo m before being converted to
 *          Montgomery form; when zero, a is used as given.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_3072_mod_exp_48(sp_digit* r, const sp_digit* a, const sp_digit* e,
        int bits, const sp_digit* m, int reduceA)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[32 * 96];
#endif
    sp_digit* t[32];
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (32 * 96), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* t[0] shares storage with norm; each entry is 96 words wide. */
        norm = td;
        for (i=0; i<32; i++) {
            t[i] = td + i * 96;
        }

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_48(norm, m);

        /* Place a in the upper half of t[1] above 48 zero words and reduce
         * the 96-word value modulo m. */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 48U);
        if (reduceA != 0) {
            err = sp_3072_mod_48(t[1] + 48, a, m);
            if (err == MP_OKAY) {
                err = sp_3072_mod_48(t[1], t[1], m);
            }
        }
        else {
            XMEMCPY(t[1] + 48, a, sizeof(sp_digit) * 48);
            err = sp_3072_mod_48(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Fill the table: even entries by squaring, odd entries by
         * multiplying two earlier entries. */
        sp_3072_mont_sqr_48(t[ 2], t[ 1], m, mp);
        sp_3072_mont_mul_48(t[ 3], t[ 2], t[ 1], m, mp);
        sp_3072_mont_sqr_48(t[ 4], t[ 2], m, mp);
        sp_3072_mont_mul_48(t[ 5], t[ 3], t[ 2], m, mp);
        sp_3072_mont_sqr_48(t[ 6], t[ 3], m, mp);
        sp_3072_mont_mul_48(t[ 7], t[ 4], t[ 3], m, mp);
        sp_3072_mont_sqr_48(t[ 8], t[ 4], m, mp);
        sp_3072_mont_mul_48(t[ 9], t[ 5], t[ 4], m, mp);
        sp_3072_mont_sqr_48(t[10], t[ 5], m, mp);
        sp_3072_mont_mul_48(t[11], t[ 6], t[ 5], m, mp);
        sp_3072_mont_sqr_48(t[12], t[ 6], m, mp);
        sp_3072_mont_mul_48(t[13], t[ 7], t[ 6], m, mp);
        sp_3072_mont_sqr_48(t[14], t[ 7], m, mp);
        sp_3072_mont_mul_48(t[15], t[ 8], t[ 7], m, mp);
        sp_3072_mont_sqr_48(t[16], t[ 8], m, mp);
        sp_3072_mont_mul_48(t[17], t[ 9], t[ 8], m, mp);
        sp_3072_mont_sqr_48(t[18], t[ 9], m, mp);
        sp_3072_mont_mul_48(t[19], t[10], t[ 9], m, mp);
        sp_3072_mont_sqr_48(t[20], t[10], m, mp);
        sp_3072_mont_mul_48(t[21], t[11], t[10], m, mp);
        sp_3072_mont_sqr_48(t[22], t[11], m, mp);
        sp_3072_mont_mul_48(t[23], t[12], t[11], m, mp);
        sp_3072_mont_sqr_48(t[24], t[12], m, mp);
        sp_3072_mont_mul_48(t[25], t[13], t[12], m, mp);
        sp_3072_mont_sqr_48(t[26], t[13], m, mp);
        sp_3072_mont_mul_48(t[27], t[14], t[13], m, mp);
        sp_3072_mont_sqr_48(t[28], t[14], m, mp);
        sp_3072_mont_mul_48(t[29], t[15], t[14], m, mp);
        sp_3072_mont_sqr_48(t[30], t[15], m, mp);
        sp_3072_mont_mul_48(t[31], t[16], t[15], m, mp);

        /* Work out how many bits of the top exponent word remain after the
         * first (possibly short) window has been taken. */
        i = (bits - 1) / 32;
        n = e[i--];
        c = bits & 31;
        if (c == 0) {
            c = 32;
        }
        c -= bits % 5;
        if (c == 32) {
            c = 27;
        }
        if (c < 0) {
            /* Number of bits in top word is less than number needed.
             * Digits are 32 bits wide, so the missing c bits come from the
             * top of the next word and 32 - c bits of it remain afterwards.
             * Using 64 here would shift a 32-bit value by more than its
             * width and over-count the remaining bits. */
            c = -c;
            y = (byte)(n << c);
            n = e[i--];
            y |= (byte)(n >> (32 - c));
            n <<= c;
            c = 32 - c;
        }
        else if (c == 0) {
            /* All bits in top word used. */
            y = (byte)n;
        }
        else {
            y = (byte)(n >> c);
            n <<= 32 - c;
        }
        XMEMCPY(r, t[y], sizeof(sp_digit) * 48);
        for (; i>=0 || c>=5; ) {
            if (c == 0) {
                /* Current word exhausted: take the window from the next. */
                n = e[i--];
                y = (byte)(n >> 27);
                n <<= 5;
                c = 27;
            }
            else if (c < 5) {
                /* Window straddles two exponent words. */
                y = (byte)(n >> 27);
                n = e[i--];
                c = 5 - c;
                y |= (byte)(n >> (32 - c));
                n <<= c;
                c = 32 - c;
            }
            else {
                y = (byte)((n >> 27) & 0x1f);
                n <<= 5;
                c -= 5;
            }

            /* Five squarings per window, then multiply by the table entry. */
            sp_3072_mont_sqr_48(r, r, m, mp);
            sp_3072_mont_sqr_48(r, r, m, mp);
            sp_3072_mont_sqr_48(r, r, m, mp);
            sp_3072_mont_sqr_48(r, r, m, mp);
            sp_3072_mont_sqr_48(r, r, m, mp);

            sp_3072_mont_mul_48(r, r, t[y], m, mp);
        }

        /* Convert out of Montgomery form and do a final conditional
         * subtract so the result is below m. */
        XMEMSET(&r[48], 0, sizeof(sp_digit) * 48U);
        sp_3072_mont_reduce_48(r, m, mp);

        mask = (sp_digit)0 - (sp_3072_cmp_48(r, m) >= 0);
        sp_3072_cond_sub_48(r, r, m, mask);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#endif /* WOLFSSL_SP_SMALL */

#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */

#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
/* Calculate r = 2^n mod m, with n being the number of bits to reduce by.
 * As m must be 3072 bits, one subtraction of m from a zeroed r is enough.
 *
 * r  A single precision number; receives the result.
 * m  A single precision number; the modulus.
 */
static void sp_3072_mont_norm_96(sp_digit* r, const sp_digit* m)
{
    /* Clear all 96 words of the output first. */
    XMEMSET(r, 0, sizeof(*r) * 96);

    /* r = 2^n mod m: subtract m in place; the returned value is not used. */
    (void)sp_3072_sub_in_place_96(r, m);
}

#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */
#ifdef WOLFSSL_SP_SMALL
/* Conditionally subtract b from a using the mask m.
 * m is -1 to subtract and 0 when not subtracting.
 *
 * Each word of b is ANDed with m before being subtracted, so the same
 * instructions run whichever mask value is supplied. The loop covers
 * 0x180 bytes, i.e. 96 words.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 * return  0 when the subtraction produced no borrow, all ones (-1) when it
 *         borrowed out of the top word.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_3072_cond_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
#else
static sp_digit sp_3072_cond_sub_96(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    /* Register use inside the assembly:
     *   r8  constant 0
     *   r4  borrow carried between iterations (0 or -1)
     *   r5  byte offset of the current word
     *   r6, r7  current words of a and (masked) b
     */
    __asm__ __volatile__ (
        "MOV	r8, #0x0\n\t"
        "MOV	r4, #0x0\n\t"
        "MOV	r5, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_cond_sub_96_words:\n\t"
#else
    "L_sp_3072_cond_sub_96_words_%=:\n\t"
#endif
        /* 0 - r4 re-creates the carry flag from the saved borrow. */
        "SUBS	r4, r8, r4\n\t"
        "LDR	r6, [%[a], r5]\n\t"
        "LDR	r7, [%[b], r5]\n\t"
        "AND	r7, r7, %[m]\n\t"
        "SBCS	r6, r6, r7\n\t"
        /* Save the borrow as 0 or -1; CMP below overwrites the flags. */
        "SBC	r4, r8, r8\n\t"
        "STR	r6, [%[r], r5]\n\t"
        "ADD	r5, r5, #0x4\n\t"
        "CMP	r5, #0x180\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_3072_cond_sub_96_words_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_3072_cond_sub_96_words\n\t"
#else
        "BLT.N	L_sp_3072_cond_sub_96_words_%=\n\t"
#endif
        /* The final borrow is handed back through the register holding r. */
        "MOV	%[r], r4\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        : "memory", "r4", "r5", "r6", "r7", "r8", "cc"
    );
    return (word32)(size_t)r;
}

#else
/* Conditionally subtract b from a using the mask m: r = a - (b & m).
 * m is -1 (all bits set) to subtract and 0 to not subtract, in which case
 * r is a copy of a.
 *
 * Fully unrolled: 96 32-bit words are processed as 48 pairs, with the borrow
 * carried from pair to pair in the carry flag. The sequence has no branches;
 * the mask is applied to every word of b with AND.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 *
 * returns 0 when there is no borrow out of the top word and a value with all
 * bits set (-1) when there is.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_3072_cond_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
#else
static sp_digit sp_3072_cond_sub_96(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0-r3 so the asm operands live in fixed
     * registers. When WOLFSSL_NO_VAR_ASSIGN_REG is defined the parameters
     * are used directly as the operands instead. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    /* Each pair: load two words of a and b (pointers advance by write-back),
     * mask the b words with m, subtract with borrow, store two words of r. */
    __asm__ __volatile__ (
        /* r5 = 0: constant operand used to extract the final borrow */
        "MOV	r5, #0x0\n\t"
        /* words 0-1: SUBS starts the borrow chain (no incoming borrow) */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SUBS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 2-3: SBCS continues the chain using the borrow left in the
         * carry flag by the previous pair; all later pairs do the same */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 4-5 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 6-7 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 8-9 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 10-11 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 12-13 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 14-15 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 16-17 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 18-19 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 20-21 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 22-23 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 24-25 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 26-27 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 28-29 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 30-31 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 32-33 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 34-35 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 36-37 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 38-39 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 40-41 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 42-43 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 44-45 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 46-47 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 48-49 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 50-51 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 52-53 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 54-55 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 56-57 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 58-59 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 60-61 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 62-63 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 64-65 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 66-67 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 68-69 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 70-71 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 72-73 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 74-75 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 76-77 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 78-79 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 80-81 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 82-83 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 84-85 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 86-87 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 88-89 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 90-91 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 92-93 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* words 94-95 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* r = 0 - 0 - borrow: 0 when there was no borrow out of the top
         * word, all bits set when there was. This overwrites the (advanced)
         * result pointer, which is no longer needed. */
        "SBC	%[r], r5, r5\n\t"
        /* All operands are read-write: a, b and r are changed by the asm. */
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        /* NOTE(review): r4 is listed as clobbered but is not used by the
         * instructions above. */
        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
    );
    /* r now holds the borrow mask rather than an address. */
    return (word32)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_NO_UMAAL
#ifndef WOLFSSL_SP_SMALL
/* Reduce the number back to 3072 bits using Montgomery reduction.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
#else
SP_NOINLINE static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        "LDR	lr, [%[m]]\n\t"
        /* i = 0 */
        "MOV	r11, #0x0\n\t"
        "MOV	r3, #0x0\n\t"
        "LDR	r4, [%[a]]\n\t"
        "LDR	r5, [%[a], #4]\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_mont_reduce_96_word:\n\t"
#else
    "L_sp_3072_mont_reduce_96_word_%=:\n\t"
#endif
        /* mu = a[i] * mp */
        "MUL	r10, %[mp], r4\n\t"
        /* a[i+0] += m[0] * mu */
        "MOV	r7, #0x0\n\t"
        "UMLAL	r4, r7, r10, lr\n\t"
        /* a[i+1] += m[1] * mu */
        "LDR	r9, [%[m], #4]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r5, r6, r10, r9\n\t"
        "MOV	r4, r5\n\t"
        "ADDS	r4, r4, r7\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+2] += m[2] * mu */
        "LDR	r9, [%[m], #8]\n\t"
        "LDR	r5, [%[a], #8]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r5, r7, r10, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+3] += m[3] * mu */
        "LDR	r9, [%[m], #12]\n\t"
        "LDR	r12, [%[a], #12]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #12]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+4] += m[4] * mu */
        "LDR	r9, [%[m], #16]\n\t"
        "LDR	r12, [%[a], #16]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #16]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+5] += m[5] * mu */
        "LDR	r9, [%[m], #20]\n\t"
        "LDR	r12, [%[a], #20]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #20]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+6] += m[6] * mu */
        "LDR	r9, [%[m], #24]\n\t"
        "LDR	r12, [%[a], #24]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #24]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+7] += m[7] * mu */
        "LDR	r9, [%[m], #28]\n\t"
        "LDR	r12, [%[a], #28]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #28]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+8] += m[8] * mu */
        "LDR	r9, [%[m], #32]\n\t"
        "LDR	r12, [%[a], #32]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #32]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+9] += m[9] * mu */
        "LDR	r9, [%[m], #36]\n\t"
        "LDR	r12, [%[a], #36]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #36]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+10] += m[10] * mu */
        "LDR	r9, [%[m], #40]\n\t"
        "LDR	r12, [%[a], #40]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #40]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+11] += m[11] * mu */
        "LDR	r9, [%[m], #44]\n\t"
        "LDR	r12, [%[a], #44]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #44]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+12] += m[12] * mu */
        "LDR	r9, [%[m], #48]\n\t"
        "LDR	r12, [%[a], #48]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #48]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+13] += m[13] * mu */
        "LDR	r9, [%[m], #52]\n\t"
        "LDR	r12, [%[a], #52]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #52]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+14] += m[14] * mu */
        "LDR	r9, [%[m], #56]\n\t"
        "LDR	r12, [%[a], #56]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #56]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+15] += m[15] * mu */
        "LDR	r9, [%[m], #60]\n\t"
        "LDR	r12, [%[a], #60]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #60]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+16] += m[16] * mu */
        "LDR	r9, [%[m], #64]\n\t"
        "LDR	r12, [%[a], #64]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #64]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+17] += m[17] * mu */
        "LDR	r9, [%[m], #68]\n\t"
        "LDR	r12, [%[a], #68]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #68]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+18] += m[18] * mu */
        "LDR	r9, [%[m], #72]\n\t"
        "LDR	r12, [%[a], #72]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #72]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+19] += m[19] * mu */
        "LDR	r9, [%[m], #76]\n\t"
        "LDR	r12, [%[a], #76]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #76]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+20] += m[20] * mu */
        "LDR	r9, [%[m], #80]\n\t"
        "LDR	r12, [%[a], #80]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #80]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+21] += m[21] * mu */
        "LDR	r9, [%[m], #84]\n\t"
        "LDR	r12, [%[a], #84]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #84]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+22] += m[22] * mu */
        "LDR	r9, [%[m], #88]\n\t"
        "LDR	r12, [%[a], #88]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #88]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+23] += m[23] * mu */
        "LDR	r9, [%[m], #92]\n\t"
        "LDR	r12, [%[a], #92]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #92]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+24] += m[24] * mu */
        "LDR	r9, [%[m], #96]\n\t"
        "LDR	r12, [%[a], #96]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #96]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+25] += m[25] * mu */
        "LDR	r9, [%[m], #100]\n\t"
        "LDR	r12, [%[a], #100]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #100]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+26] += m[26] * mu */
        "LDR	r9, [%[m], #104]\n\t"
        "LDR	r12, [%[a], #104]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #104]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+27] += m[27] * mu */
        "LDR	r9, [%[m], #108]\n\t"
        "LDR	r12, [%[a], #108]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #108]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+28] += m[28] * mu */
        "LDR	r9, [%[m], #112]\n\t"
        "LDR	r12, [%[a], #112]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #112]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+29] += m[29] * mu */
        "LDR	r9, [%[m], #116]\n\t"
        "LDR	r12, [%[a], #116]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #116]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+30] += m[30] * mu */
        "LDR	r9, [%[m], #120]\n\t"
        "LDR	r12, [%[a], #120]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #120]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+31] += m[31] * mu */
        "LDR	r9, [%[m], #124]\n\t"
        "LDR	r12, [%[a], #124]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #124]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+32] += m[32] * mu */
        "LDR	r9, [%[m], #128]\n\t"
        "LDR	r12, [%[a], #128]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #128]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+33] += m[33] * mu */
        "LDR	r9, [%[m], #132]\n\t"
        "LDR	r12, [%[a], #132]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #132]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+34] += m[34] * mu */
        "LDR	r9, [%[m], #136]\n\t"
        "LDR	r12, [%[a], #136]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #136]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+35] += m[35] * mu */
        "LDR	r9, [%[m], #140]\n\t"
        "LDR	r12, [%[a], #140]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #140]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+36] += m[36] * mu */
        "LDR	r9, [%[m], #144]\n\t"
        "LDR	r12, [%[a], #144]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #144]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+37] += m[37] * mu */
        "LDR	r9, [%[m], #148]\n\t"
        "LDR	r12, [%[a], #148]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #148]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+38] += m[38] * mu */
        "LDR	r9, [%[m], #152]\n\t"
        "LDR	r12, [%[a], #152]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #152]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+39] += m[39] * mu */
        "LDR	r9, [%[m], #156]\n\t"
        "LDR	r12, [%[a], #156]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #156]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+40] += m[40] * mu */
        "LDR	r9, [%[m], #160]\n\t"
        "LDR	r12, [%[a], #160]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #160]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+41] += m[41] * mu */
        "LDR	r9, [%[m], #164]\n\t"
        "LDR	r12, [%[a], #164]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #164]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+42] += m[42] * mu */
        "LDR	r9, [%[m], #168]\n\t"
        "LDR	r12, [%[a], #168]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #168]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+43] += m[43] * mu */
        "LDR	r9, [%[m], #172]\n\t"
        "LDR	r12, [%[a], #172]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #172]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+44] += m[44] * mu */
        "LDR	r9, [%[m], #176]\n\t"
        "LDR	r12, [%[a], #176]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #176]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+45] += m[45] * mu */
        "LDR	r9, [%[m], #180]\n\t"
        "LDR	r12, [%[a], #180]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #180]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+46] += m[46] * mu */
        "LDR	r9, [%[m], #184]\n\t"
        "LDR	r12, [%[a], #184]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #184]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+47] += m[47] * mu */
        "LDR	r9, [%[m], #188]\n\t"
        "LDR	r12, [%[a], #188]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #188]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+48] += m[48] * mu */
        "LDR	r9, [%[m], #192]\n\t"
        "LDR	r12, [%[a], #192]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #192]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+49] += m[49] * mu */
        "LDR	r9, [%[m], #196]\n\t"
        "LDR	r12, [%[a], #196]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #196]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+50] += m[50] * mu */
        "LDR	r9, [%[m], #200]\n\t"
        "LDR	r12, [%[a], #200]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #200]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+51] += m[51] * mu */
        "LDR	r9, [%[m], #204]\n\t"
        "LDR	r12, [%[a], #204]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #204]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+52] += m[52] * mu */
        "LDR	r9, [%[m], #208]\n\t"
        "LDR	r12, [%[a], #208]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #208]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+53] += m[53] * mu */
        "LDR	r9, [%[m], #212]\n\t"
        "LDR	r12, [%[a], #212]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #212]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+54] += m[54] * mu */
        "LDR	r9, [%[m], #216]\n\t"
        "LDR	r12, [%[a], #216]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #216]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+55] += m[55] * mu */
        "LDR	r9, [%[m], #220]\n\t"
        "LDR	r12, [%[a], #220]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #220]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+56] += m[56] * mu */
        "LDR	r9, [%[m], #224]\n\t"
        "LDR	r12, [%[a], #224]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #224]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+57] += m[57] * mu */
        "LDR	r9, [%[m], #228]\n\t"
        "LDR	r12, [%[a], #228]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #228]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+58] += m[58] * mu */
        "LDR	r9, [%[m], #232]\n\t"
        "LDR	r12, [%[a], #232]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #232]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+59] += m[59] * mu */
        "LDR	r9, [%[m], #236]\n\t"
        "LDR	r12, [%[a], #236]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #236]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+60] += m[60] * mu */
        "LDR	r9, [%[m], #240]\n\t"
        "LDR	r12, [%[a], #240]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #240]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+61] += m[61] * mu */
        "LDR	r9, [%[m], #244]\n\t"
        "LDR	r12, [%[a], #244]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #244]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+62] += m[62] * mu */
        "LDR	r9, [%[m], #248]\n\t"
        "LDR	r12, [%[a], #248]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #248]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+63] += m[63] * mu */
        "LDR	r9, [%[m], #252]\n\t"
        "LDR	r12, [%[a], #252]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #252]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+64] += m[64] * mu */
        "LDR	r9, [%[m], #256]\n\t"
        "LDR	r12, [%[a], #256]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #256]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+65] += m[65] * mu */
        "LDR	r9, [%[m], #260]\n\t"
        "LDR	r12, [%[a], #260]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #260]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+66] += m[66] * mu */
        "LDR	r9, [%[m], #264]\n\t"
        "LDR	r12, [%[a], #264]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #264]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+67] += m[67] * mu */
        "LDR	r9, [%[m], #268]\n\t"
        "LDR	r12, [%[a], #268]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #268]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+68] += m[68] * mu */
        "LDR	r9, [%[m], #272]\n\t"
        "LDR	r12, [%[a], #272]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #272]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+69] += m[69] * mu */
        "LDR	r9, [%[m], #276]\n\t"
        "LDR	r12, [%[a], #276]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #276]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+70] += m[70] * mu */
        "LDR	r9, [%[m], #280]\n\t"
        "LDR	r12, [%[a], #280]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #280]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+71] += m[71] * mu */
        "LDR	r9, [%[m], #284]\n\t"
        "LDR	r12, [%[a], #284]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #284]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+72] += m[72] * mu */
        "LDR	r9, [%[m], #288]\n\t"
        "LDR	r12, [%[a], #288]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #288]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+73] += m[73] * mu */
        "LDR	r9, [%[m], #292]\n\t"
        "LDR	r12, [%[a], #292]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #292]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+74] += m[74] * mu */
        "LDR	r9, [%[m], #296]\n\t"
        "LDR	r12, [%[a], #296]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #296]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+75] += m[75] * mu */
        "LDR	r9, [%[m], #300]\n\t"
        "LDR	r12, [%[a], #300]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #300]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+76] += m[76] * mu */
        "LDR	r9, [%[m], #304]\n\t"
        "LDR	r12, [%[a], #304]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #304]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+77] += m[77] * mu */
        "LDR	r9, [%[m], #308]\n\t"
        "LDR	r12, [%[a], #308]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #308]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+78] += m[78] * mu */
        "LDR	r9, [%[m], #312]\n\t"
        "LDR	r12, [%[a], #312]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #312]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+79] += m[79] * mu */
        "LDR	r9, [%[m], #316]\n\t"
        "LDR	r12, [%[a], #316]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #316]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+80] += m[80] * mu */
        "LDR	r9, [%[m], #320]\n\t"
        "LDR	r12, [%[a], #320]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #320]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+81] += m[81] * mu */
        "LDR	r9, [%[m], #324]\n\t"
        "LDR	r12, [%[a], #324]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #324]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+82] += m[82] * mu */
        "LDR	r9, [%[m], #328]\n\t"
        "LDR	r12, [%[a], #328]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #328]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+83] += m[83] * mu */
        "LDR	r9, [%[m], #332]\n\t"
        "LDR	r12, [%[a], #332]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #332]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+84] += m[84] * mu */
        "LDR	r9, [%[m], #336]\n\t"
        "LDR	r12, [%[a], #336]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #336]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+85] += m[85] * mu */
        "LDR	r9, [%[m], #340]\n\t"
        "LDR	r12, [%[a], #340]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #340]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+86] += m[86] * mu */
        "LDR	r9, [%[m], #344]\n\t"
        "LDR	r12, [%[a], #344]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #344]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+87] += m[87] * mu */
        "LDR	r9, [%[m], #348]\n\t"
        "LDR	r12, [%[a], #348]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #348]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+88] += m[88] * mu */
        "LDR	r9, [%[m], #352]\n\t"
        "LDR	r12, [%[a], #352]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #352]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+89] += m[89] * mu */
        "LDR	r9, [%[m], #356]\n\t"
        "LDR	r12, [%[a], #356]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #356]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+90] += m[90] * mu */
        "LDR	r9, [%[m], #360]\n\t"
        "LDR	r12, [%[a], #360]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #360]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+91] += m[91] * mu */
        "LDR	r9, [%[m], #364]\n\t"
        "LDR	r12, [%[a], #364]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #364]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+92] += m[92] * mu */
        "LDR	r9, [%[m], #368]\n\t"
        "LDR	r12, [%[a], #368]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #368]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+93] += m[93] * mu */
        "LDR	r9, [%[m], #372]\n\t"
        "LDR	r12, [%[a], #372]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #372]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+94] += m[94] * mu */
        "LDR	r9, [%[m], #376]\n\t"
        "LDR	r12, [%[a], #376]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #376]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+95] += m[95] * mu */
        "LDR	r9, [%[m], #380]\n\t"
        "LDR	r12, [%[a], #380]\n\t"
        "UMULL	r8, r9, r10, r9\n\t"
        "ADDS	r7, r7, r8\n\t"
        "ADCS	r6, r9, r3\n\t"
        "MOV	r3, #0x0\n\t"
        "ADC	r3, r3, r3\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #380]\n\t"
        "LDR	r12, [%[a], #384]\n\t"
        "ADCS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #384]\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* i += 1 */
        "ADD	r11, r11, #0x4\n\t"
        "ADD	%[a], %[a], #0x4\n\t"
        "CMP	r11, #0x180\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_3072_mont_reduce_96_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.W	L_sp_3072_mont_reduce_96_word\n\t"
#else
        "BLT.W	L_sp_3072_mont_reduce_96_word_%=\n\t"
#endif
        /* Loop Done */
        "STR	r4, [%[a]]\n\t"
        "STR	r5, [%[a], #4]\n\t"
        "MOV	%[mp], r3\n\t"
        : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
    );
    sp_3072_cond_sub_96(a - 96, a, m, (sp_digit)0 - mp);
}

#else
/* Reduce the number back to 3072 bits using Montgomery reduction.
 *
 * Compact (looping) variant built on UMLAL. For each of the 96 low words of
 * a, mu = a[i] * mp (low word of the product) is computed and mu * m is added
 * into a starting at word i, four words per pass of the inner loop. The carry
 * out of each row is added into a[i+96]; any overflow of that word is kept in
 * r3 and folded into the following row. After the loop the final carry is
 * turned into a mask and handed, together with the upper 96 words, to
 * sp_3072_cond_sub_96(), which writes to the start of the buffer.
 *
 * a   A single precision number to reduce in place. Words 0..191 are
 *     accessed.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
#else
SP_NOINLINE static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0-r2 so the assembly operands are predictable. */
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    /* Register use inside the assembly:
     *   r3  = ca, carry from the previous row into a[i+96]
     *   r4  = running carry within the current row
     *   r5  = high word scratch for UMLAL
     *   r7  = m[j], r10 = a[i+j] scratch
     *   r8  = mu
     *   r9  = i as a byte offset, r12 = j as a byte offset
     *   r11 = m[0]; loaded once and not referenced again below
     */
    __asm__ __volatile__ (
        "LDR	r11, [%[m]]\n\t"
        /* i = 0 */
        "MOV	r9, #0x0\n\t"
        /* ca = 0 */
        "MOV	r3, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_mont_reduce_96_word:\n\t"
#else
    "L_sp_3072_mont_reduce_96_word_%=:\n\t"
#endif
        /* mu = a[i] * mp */
        "LDR	r10, [%[a]]\n\t"
        "MUL	r8, %[mp], r10\n\t"
        /* j = 0 */
        "MOV	r12, #0x0\n\t"
        "MOV	r4, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_mont_reduce_96_mul:\n\t"
#else
    "L_sp_3072_mont_reduce_96_mul_%=:\n\t"
#endif
        /* a[i+j+0] += m[j+0] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "MOV	r5, #0x0\n\t"
        "UMLAL	r10, r5, r8, r7\n\t"
        "ADDS	r10, r10, r4\n\t"
        "STR	r10, [%[a], r12]\n\t"
        "ADC	r4, r5, #0x0\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+1] += m[j+1] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "MOV	r5, #0x0\n\t"
        "UMLAL	r10, r5, r8, r7\n\t"
        "ADDS	r10, r10, r4\n\t"
        "STR	r10, [%[a], r12]\n\t"
        "ADC	r4, r5, #0x0\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+2] += m[j+2] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "MOV	r5, #0x0\n\t"
        "UMLAL	r10, r5, r8, r7\n\t"
        "ADDS	r10, r10, r4\n\t"
        "STR	r10, [%[a], r12]\n\t"
        "ADC	r4, r5, #0x0\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+3] += m[j+3] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "MOV	r5, #0x0\n\t"
        "UMLAL	r10, r5, r8, r7\n\t"
        "ADDS	r10, r10, r4\n\t"
        "STR	r10, [%[a], r12]\n\t"
        "ADC	r4, r5, #0x0\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* 0x180 = 96 words * 4 bytes */
        "CMP	r12, #0x180\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_3072_mont_reduce_96_mul_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_3072_mont_reduce_96_mul\n\t"
#else
        "BLT.N	L_sp_3072_mont_reduce_96_mul_%=\n\t"
#endif
        /* a[i+96] += row carry (r4) + ca (r3); new ca = carries out */
        "LDR	r10, [%[a], #384]\n\t"
        "ADDS	r4, r4, r3\n\t"
        "MOV	r3, #0x0\n\t"
        /* r3 = carry out of (r4 + ca) */
        "ADC	r3, r3, #0x0\n\t"
        "ADDS	r10, r10, r4\n\t"
        /* r3 += carry out of (a[i+96] + r4). Only the flag is added here:
         * using r3 as the second operand would count the carry recorded
         * above twice and yield 2 when r4 + ca wrapped. */
        "ADC	r3, r3, #0x0\n\t"
        "STR	r10, [%[a], #384]\n\t"
        /* i += 1 */
        "ADD	r9, r9, #0x4\n\t"
        "ADD	%[a], %[a], #0x4\n\t"
        "CMP	r9, #0x180\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_3072_mont_reduce_96_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_3072_mont_reduce_96_word\n\t"
#else
        "BLT.N	L_sp_3072_mont_reduce_96_word_%=\n\t"
#endif
        /* Loop Done */
        /* Return the final carry through the mp operand. */
        "MOV	%[mp], r3\n\t"
        : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
    );
    /* a was advanced by one word per row, so it now points 96 words past the
     * start of the buffer and (a - 96) is the original pointer. The mask
     * (0 - carry) is zero when there was no final carry and has all bits set
     * when the carry is 1. */
    sp_3072_cond_sub_96(a - 96, a, m, (sp_digit)0 - mp);
}

#endif /* !WOLFSSL_SP_SMALL */
#else
#ifndef WOLFSSL_SP_SMALL
/* Reduce the number back to 3072 bits using Montgomery reduction.
 *
 * Fully unrolled variant built on UMAAL (RdHi:RdLo = Rn * Rm + RdLo + RdHi).
 * One pass of the loop handles one word i of a: mu = a[i] * mp (low word of
 * the product) is computed and mu * m is added into a[i..i+95]. The row carry
 * is then added into a[i+96] together with the carry kept from the previous
 * row. The five lowest live words a[i..i+4] are held in r6-r10 across passes
 * and are only written back to memory after the loop. Finally the carry is
 * turned into a mask and handed, together with the upper 96 words, to
 * sp_3072_cond_sub_96(), which writes to the start of the buffer.
 *
 * a   A single precision number to reduce in place. Words 0..191 are
 *     accessed.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
#else
SP_NOINLINE static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0-r2 so the assembly operands are predictable. */
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    /* Register use inside the assembly:
     *   r3      = running carry within the current row
     *   r4      = i as a byte offset (loop ends at 0x180 = 96 * 4)
     *   r5      = carry from the previous row into a[i+96]
     *   r6-r10  = cached a[i..i+4]
     *   r11     = scratch for a[i+j], j >= 6
     *   r12     = m[j] scratch
     *   lr      = mu (reused as scratch at the end of each row)
     */
    __asm__ __volatile__ (
        /* i = 0 */
        "MOV	r4, #0x0\n\t"
        /* carry between rows = 0 */
        "MOV	r5, #0x0\n\t"
        /* Preload a[0..4] into the register window. */
        "LDR	r6, [%[a]]\n\t"
        "LDR	r7, [%[a], #4]\n\t"
        "LDR	r8, [%[a], #8]\n\t"
        "LDR	r9, [%[a], #12]\n\t"
        "LDR	r10, [%[a], #16]\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_mont_reduce_96_word:\n\t"
#else
    "L_sp_3072_mont_reduce_96_word_%=:\n\t"
#endif
        /* mu = a[i] * mp */
        "MUL	lr, %[mp], r6\n\t"
        /* a[i+0] += m[0] * mu */
        /* Only the carry in r3 is kept; r6 is overwritten just below. */
        "LDR	r12, [%[m]]\n\t"
        "MOV	r3, #0x0\n\t"
        "UMAAL	r6, r3, lr, r12\n\t"
        /* a[i+1] += m[1] * mu */
        /* Each result is written one register down so that r6-r10 line up
         * with a[i+1..i+5] for the next pass. */
        "LDR	r12, [%[m], #4]\n\t"
        "MOV	r6, r7\n\t"
        "UMAAL	r6, r3, lr, r12\n\t"
        /* a[i+2] += m[2] * mu */
        "LDR	r12, [%[m], #8]\n\t"
        "MOV	r7, r8\n\t"
        "UMAAL	r7, r3, lr, r12\n\t"
        /* a[i+3] += m[3] * mu */
        "LDR	r12, [%[m], #12]\n\t"
        "MOV	r8, r9\n\t"
        "UMAAL	r8, r3, lr, r12\n\t"
        /* a[i+4] += m[4] * mu */
        "LDR	r12, [%[m], #16]\n\t"
        "MOV	r9, r10\n\t"
        "UMAAL	r9, r3, lr, r12\n\t"
        /* a[i+5] += m[5] * mu */
        /* Result stays in r10; it is not stored back in this pass. */
        "LDR	r12, [%[m], #20]\n\t"
        "LDR	r10, [%[a], #20]\n\t"
        "UMAAL	r10, r3, lr, r12\n\t"
        /* a[i+6] += m[6] * mu */
        /* From here on every word is loaded, updated and stored via r11. */
        "LDR	r12, [%[m], #24]\n\t"
        "LDR	r11, [%[a], #24]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #24]\n\t"
        /* a[i+7] += m[7] * mu */
        "LDR	r12, [%[m], #28]\n\t"
        "LDR	r11, [%[a], #28]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #28]\n\t"
        /* a[i+8] += m[8] * mu */
        "LDR	r12, [%[m], #32]\n\t"
        "LDR	r11, [%[a], #32]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #32]\n\t"
        /* a[i+9] += m[9] * mu */
        "LDR	r12, [%[m], #36]\n\t"
        "LDR	r11, [%[a], #36]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #36]\n\t"
        /* a[i+10] += m[10] * mu */
        "LDR	r12, [%[m], #40]\n\t"
        "LDR	r11, [%[a], #40]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #40]\n\t"
        /* a[i+11] += m[11] * mu */
        "LDR	r12, [%[m], #44]\n\t"
        "LDR	r11, [%[a], #44]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #44]\n\t"
        /* a[i+12] += m[12] * mu */
        "LDR	r12, [%[m], #48]\n\t"
        "LDR	r11, [%[a], #48]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #48]\n\t"
        /* a[i+13] += m[13] * mu */
        "LDR	r12, [%[m], #52]\n\t"
        "LDR	r11, [%[a], #52]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #52]\n\t"
        /* a[i+14] += m[14] * mu */
        "LDR	r12, [%[m], #56]\n\t"
        "LDR	r11, [%[a], #56]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #56]\n\t"
        /* a[i+15] += m[15] * mu */
        "LDR	r12, [%[m], #60]\n\t"
        "LDR	r11, [%[a], #60]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #60]\n\t"
        /* a[i+16] += m[16] * mu */
        "LDR	r12, [%[m], #64]\n\t"
        "LDR	r11, [%[a], #64]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #64]\n\t"
        /* a[i+17] += m[17] * mu */
        "LDR	r12, [%[m], #68]\n\t"
        "LDR	r11, [%[a], #68]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #68]\n\t"
        /* a[i+18] += m[18] * mu */
        "LDR	r12, [%[m], #72]\n\t"
        "LDR	r11, [%[a], #72]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #72]\n\t"
        /* a[i+19] += m[19] * mu */
        "LDR	r12, [%[m], #76]\n\t"
        "LDR	r11, [%[a], #76]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #76]\n\t"
        /* a[i+20] += m[20] * mu */
        "LDR	r12, [%[m], #80]\n\t"
        "LDR	r11, [%[a], #80]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #80]\n\t"
        /* a[i+21] += m[21] * mu */
        "LDR	r12, [%[m], #84]\n\t"
        "LDR	r11, [%[a], #84]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #84]\n\t"
        /* a[i+22] += m[22] * mu */
        "LDR	r12, [%[m], #88]\n\t"
        "LDR	r11, [%[a], #88]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #88]\n\t"
        /* a[i+23] += m[23] * mu */
        "LDR	r12, [%[m], #92]\n\t"
        "LDR	r11, [%[a], #92]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #92]\n\t"
        /* a[i+24] += m[24] * mu */
        "LDR	r12, [%[m], #96]\n\t"
        "LDR	r11, [%[a], #96]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #96]\n\t"
        /* a[i+25] += m[25] * mu */
        "LDR	r12, [%[m], #100]\n\t"
        "LDR	r11, [%[a], #100]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #100]\n\t"
        /* a[i+26] += m[26] * mu */
        "LDR	r12, [%[m], #104]\n\t"
        "LDR	r11, [%[a], #104]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #104]\n\t"
        /* a[i+27] += m[27] * mu */
        "LDR	r12, [%[m], #108]\n\t"
        "LDR	r11, [%[a], #108]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #108]\n\t"
        /* a[i+28] += m[28] * mu */
        "LDR	r12, [%[m], #112]\n\t"
        "LDR	r11, [%[a], #112]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #112]\n\t"
        /* a[i+29] += m[29] * mu */
        "LDR	r12, [%[m], #116]\n\t"
        "LDR	r11, [%[a], #116]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #116]\n\t"
        /* a[i+30] += m[30] * mu */
        "LDR	r12, [%[m], #120]\n\t"
        "LDR	r11, [%[a], #120]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #120]\n\t"
        /* a[i+31] += m[31] * mu */
        "LDR	r12, [%[m], #124]\n\t"
        "LDR	r11, [%[a], #124]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #124]\n\t"
        /* a[i+32] += m[32] * mu */
        "LDR	r12, [%[m], #128]\n\t"
        "LDR	r11, [%[a], #128]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #128]\n\t"
        /* a[i+33] += m[33] * mu */
        "LDR	r12, [%[m], #132]\n\t"
        "LDR	r11, [%[a], #132]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #132]\n\t"
        /* a[i+34] += m[34] * mu */
        "LDR	r12, [%[m], #136]\n\t"
        "LDR	r11, [%[a], #136]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #136]\n\t"
        /* a[i+35] += m[35] * mu */
        "LDR	r12, [%[m], #140]\n\t"
        "LDR	r11, [%[a], #140]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #140]\n\t"
        /* a[i+36] += m[36] * mu */
        "LDR	r12, [%[m], #144]\n\t"
        "LDR	r11, [%[a], #144]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #144]\n\t"
        /* a[i+37] += m[37] * mu */
        "LDR	r12, [%[m], #148]\n\t"
        "LDR	r11, [%[a], #148]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #148]\n\t"
        /* a[i+38] += m[38] * mu */
        "LDR	r12, [%[m], #152]\n\t"
        "LDR	r11, [%[a], #152]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #152]\n\t"
        /* a[i+39] += m[39] * mu */
        "LDR	r12, [%[m], #156]\n\t"
        "LDR	r11, [%[a], #156]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #156]\n\t"
        /* a[i+40] += m[40] * mu */
        "LDR	r12, [%[m], #160]\n\t"
        "LDR	r11, [%[a], #160]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #160]\n\t"
        /* a[i+41] += m[41] * mu */
        "LDR	r12, [%[m], #164]\n\t"
        "LDR	r11, [%[a], #164]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #164]\n\t"
        /* a[i+42] += m[42] * mu */
        "LDR	r12, [%[m], #168]\n\t"
        "LDR	r11, [%[a], #168]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #168]\n\t"
        /* a[i+43] += m[43] * mu */
        "LDR	r12, [%[m], #172]\n\t"
        "LDR	r11, [%[a], #172]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #172]\n\t"
        /* a[i+44] += m[44] * mu */
        "LDR	r12, [%[m], #176]\n\t"
        "LDR	r11, [%[a], #176]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #176]\n\t"
        /* a[i+45] += m[45] * mu */
        "LDR	r12, [%[m], #180]\n\t"
        "LDR	r11, [%[a], #180]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #180]\n\t"
        /* a[i+46] += m[46] * mu */
        "LDR	r12, [%[m], #184]\n\t"
        "LDR	r11, [%[a], #184]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #184]\n\t"
        /* a[i+47] += m[47] * mu */
        "LDR	r12, [%[m], #188]\n\t"
        "LDR	r11, [%[a], #188]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #188]\n\t"
        /* a[i+48] += m[48] * mu */
        "LDR	r12, [%[m], #192]\n\t"
        "LDR	r11, [%[a], #192]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #192]\n\t"
        /* a[i+49] += m[49] * mu */
        "LDR	r12, [%[m], #196]\n\t"
        "LDR	r11, [%[a], #196]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #196]\n\t"
        /* a[i+50] += m[50] * mu */
        "LDR	r12, [%[m], #200]\n\t"
        "LDR	r11, [%[a], #200]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #200]\n\t"
        /* a[i+51] += m[51] * mu */
        "LDR	r12, [%[m], #204]\n\t"
        "LDR	r11, [%[a], #204]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #204]\n\t"
        /* a[i+52] += m[52] * mu */
        "LDR	r12, [%[m], #208]\n\t"
        "LDR	r11, [%[a], #208]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #208]\n\t"
        /* a[i+53] += m[53] * mu */
        "LDR	r12, [%[m], #212]\n\t"
        "LDR	r11, [%[a], #212]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #212]\n\t"
        /* a[i+54] += m[54] * mu */
        "LDR	r12, [%[m], #216]\n\t"
        "LDR	r11, [%[a], #216]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #216]\n\t"
        /* a[i+55] += m[55] * mu */
        "LDR	r12, [%[m], #220]\n\t"
        "LDR	r11, [%[a], #220]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #220]\n\t"
        /* a[i+56] += m[56] * mu */
        "LDR	r12, [%[m], #224]\n\t"
        "LDR	r11, [%[a], #224]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #224]\n\t"
        /* a[i+57] += m[57] * mu */
        "LDR	r12, [%[m], #228]\n\t"
        "LDR	r11, [%[a], #228]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #228]\n\t"
        /* a[i+58] += m[58] * mu */
        "LDR	r12, [%[m], #232]\n\t"
        "LDR	r11, [%[a], #232]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #232]\n\t"
        /* a[i+59] += m[59] * mu */
        "LDR	r12, [%[m], #236]\n\t"
        "LDR	r11, [%[a], #236]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #236]\n\t"
        /* a[i+60] += m[60] * mu */
        "LDR	r12, [%[m], #240]\n\t"
        "LDR	r11, [%[a], #240]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #240]\n\t"
        /* a[i+61] += m[61] * mu */
        "LDR	r12, [%[m], #244]\n\t"
        "LDR	r11, [%[a], #244]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #244]\n\t"
        /* a[i+62] += m[62] * mu */
        "LDR	r12, [%[m], #248]\n\t"
        "LDR	r11, [%[a], #248]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #248]\n\t"
        /* a[i+63] += m[63] * mu */
        "LDR	r12, [%[m], #252]\n\t"
        "LDR	r11, [%[a], #252]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #252]\n\t"
        /* a[i+64] += m[64] * mu */
        "LDR	r12, [%[m], #256]\n\t"
        "LDR	r11, [%[a], #256]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #256]\n\t"
        /* a[i+65] += m[65] * mu */
        "LDR	r12, [%[m], #260]\n\t"
        "LDR	r11, [%[a], #260]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #260]\n\t"
        /* a[i+66] += m[66] * mu */
        "LDR	r12, [%[m], #264]\n\t"
        "LDR	r11, [%[a], #264]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #264]\n\t"
        /* a[i+67] += m[67] * mu */
        "LDR	r12, [%[m], #268]\n\t"
        "LDR	r11, [%[a], #268]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #268]\n\t"
        /* a[i+68] += m[68] * mu */
        "LDR	r12, [%[m], #272]\n\t"
        "LDR	r11, [%[a], #272]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #272]\n\t"
        /* a[i+69] += m[69] * mu */
        "LDR	r12, [%[m], #276]\n\t"
        "LDR	r11, [%[a], #276]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #276]\n\t"
        /* a[i+70] += m[70] * mu */
        "LDR	r12, [%[m], #280]\n\t"
        "LDR	r11, [%[a], #280]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #280]\n\t"
        /* a[i+71] += m[71] * mu */
        "LDR	r12, [%[m], #284]\n\t"
        "LDR	r11, [%[a], #284]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #284]\n\t"
        /* a[i+72] += m[72] * mu */
        "LDR	r12, [%[m], #288]\n\t"
        "LDR	r11, [%[a], #288]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #288]\n\t"
        /* a[i+73] += m[73] * mu */
        "LDR	r12, [%[m], #292]\n\t"
        "LDR	r11, [%[a], #292]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #292]\n\t"
        /* a[i+74] += m[74] * mu */
        "LDR	r12, [%[m], #296]\n\t"
        "LDR	r11, [%[a], #296]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #296]\n\t"
        /* a[i+75] += m[75] * mu */
        "LDR	r12, [%[m], #300]\n\t"
        "LDR	r11, [%[a], #300]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #300]\n\t"
        /* a[i+76] += m[76] * mu */
        "LDR	r12, [%[m], #304]\n\t"
        "LDR	r11, [%[a], #304]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #304]\n\t"
        /* a[i+77] += m[77] * mu */
        "LDR	r12, [%[m], #308]\n\t"
        "LDR	r11, [%[a], #308]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #308]\n\t"
        /* a[i+78] += m[78] * mu */
        "LDR	r12, [%[m], #312]\n\t"
        "LDR	r11, [%[a], #312]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #312]\n\t"
        /* a[i+79] += m[79] * mu */
        "LDR	r12, [%[m], #316]\n\t"
        "LDR	r11, [%[a], #316]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #316]\n\t"
        /* a[i+80] += m[80] * mu */
        "LDR	r12, [%[m], #320]\n\t"
        "LDR	r11, [%[a], #320]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #320]\n\t"
        /* a[i+81] += m[81] * mu */
        "LDR	r12, [%[m], #324]\n\t"
        "LDR	r11, [%[a], #324]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #324]\n\t"
        /* a[i+82] += m[82] * mu */
        "LDR	r12, [%[m], #328]\n\t"
        "LDR	r11, [%[a], #328]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #328]\n\t"
        /* a[i+83] += m[83] * mu */
        "LDR	r12, [%[m], #332]\n\t"
        "LDR	r11, [%[a], #332]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #332]\n\t"
        /* a[i+84] += m[84] * mu */
        "LDR	r12, [%[m], #336]\n\t"
        "LDR	r11, [%[a], #336]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #336]\n\t"
        /* a[i+85] += m[85] * mu */
        "LDR	r12, [%[m], #340]\n\t"
        "LDR	r11, [%[a], #340]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #340]\n\t"
        /* a[i+86] += m[86] * mu */
        "LDR	r12, [%[m], #344]\n\t"
        "LDR	r11, [%[a], #344]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #344]\n\t"
        /* a[i+87] += m[87] * mu */
        "LDR	r12, [%[m], #348]\n\t"
        "LDR	r11, [%[a], #348]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #348]\n\t"
        /* a[i+88] += m[88] * mu */
        "LDR	r12, [%[m], #352]\n\t"
        "LDR	r11, [%[a], #352]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #352]\n\t"
        /* a[i+89] += m[89] * mu */
        "LDR	r12, [%[m], #356]\n\t"
        "LDR	r11, [%[a], #356]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #356]\n\t"
        /* a[i+90] += m[90] * mu */
        "LDR	r12, [%[m], #360]\n\t"
        "LDR	r11, [%[a], #360]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #360]\n\t"
        /* a[i+91] += m[91] * mu */
        "LDR	r12, [%[m], #364]\n\t"
        "LDR	r11, [%[a], #364]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #364]\n\t"
        /* a[i+92] += m[92] * mu */
        "LDR	r12, [%[m], #368]\n\t"
        "LDR	r11, [%[a], #368]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #368]\n\t"
        /* a[i+93] += m[93] * mu */
        "LDR	r12, [%[m], #372]\n\t"
        "LDR	r11, [%[a], #372]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #372]\n\t"
        /* a[i+94] += m[94] * mu */
        "LDR	r12, [%[m], #376]\n\t"
        "LDR	r11, [%[a], #376]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #376]\n\t"
        /* a[i+95] += m[95] * mu */
        "LDR	r12, [%[m], #380]\n\t"
        "LDR	r11, [%[a], #380]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        /* mu is no longer needed: lr is reused to hold a[i+96]. */
        "LDR	lr, [%[a], #384]\n\t"
        "MOV	r12, #0x0\n\t"
        /* With a zero product this is lr:r3 = r3 + lr, i.e. row carry plus
         * a[i+96] with the overflow bit left in lr. */
        "UMAAL	r3, lr, r12, r12\n\t"
        "STR	r11, [%[a], #380]\n\t"
        /* Add the carry kept from the previous row; r5 = new carry out. */
        "ADDS	r3, r3, r5\n\t"
        "ADC	r5, lr, #0x0\n\t"
        "STR	r3, [%[a], #384]\n\t"
        /* i += 1 */
        "ADD	r4, r4, #0x4\n\t"
        "ADD	%[a], %[a], #0x4\n\t"
        /* 0x180 = 96 words * 4 bytes */
        "CMP	r4, #0x180\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_3072_mont_reduce_96_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.W	L_sp_3072_mont_reduce_96_word\n\t"
#else
        "BLT.W	L_sp_3072_mont_reduce_96_word_%=\n\t"
#endif
        /* Loop Done */
        /* Write the register window back; a now points 96 words past the
         * start, so these are words 96..100 of the original buffer. */
        "STR	r6, [%[a]]\n\t"
        "STR	r7, [%[a], #4]\n\t"
        "STR	r8, [%[a], #8]\n\t"
        "STR	r9, [%[a], #12]\n\t"
        "STR	r10, [%[a], #16]\n\t"
        /* Return the final carry through the mp operand. */
        "MOV	%[mp], r5\n\t"
        : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
    );
    /* (a - 96) is the original pointer. The mask (0 - carry) is zero when
     * there was no final carry and has all bits set when the carry is 1.
     * NOTE(review): presumably sp_3072_cond_sub_96 subtracts (m & mask) from
     * the upper half and stores the result in the lower half - confirm
     * against its definition. */
    sp_3072_cond_sub_96(a - 96, a, m, (sp_digit)0 - mp);
}

#else
/* Reduce the number back to 3072 bits using Montgomery reduction.
 *
 * Compact (looping) variant built on UMAAL (RdHi:RdLo = Rn * Rm + RdLo +
 * RdHi). For each of the 96 low words of a, mu = a[i] * mp (low word of the
 * product) is computed and mu * m is added into a starting at word i, four
 * words per pass of the inner loop. The carry out of each row is added into
 * a[i+96]; any overflow of that word is kept in r3 and folded into the
 * following row. After the loop the final carry is turned into a mask and
 * handed, together with the upper 96 words, to sp_3072_cond_sub_96(), which
 * writes to the start of the buffer.
 *
 * a   A single precision number to reduce in place. Words 0..191 are
 *     accessed.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
#else
SP_NOINLINE static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0-r2 so the assembly operands are predictable. */
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    /* Register use inside the assembly:
     *   r3  = ca, carry from the previous row into a[i+96]
     *   r4  = running carry within the current row
     *   r7  = m[j], r10 = a[i+j] scratch
     *   r8  = mu
     *   r9  = i as a byte offset, r12 = j as a byte offset
     *   r11 = m[0]; loaded once and not referenced again below
     */
    __asm__ __volatile__ (
        "LDR	r11, [%[m]]\n\t"
        /* i = 0 */
        "MOV	r9, #0x0\n\t"
        /* ca = 0 */
        "MOV	r3, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_mont_reduce_96_word:\n\t"
#else
    "L_sp_3072_mont_reduce_96_word_%=:\n\t"
#endif
        /* mu = a[i] * mp */
        "LDR	r10, [%[a]]\n\t"
        "MUL	r8, %[mp], r10\n\t"
        /* j = 0 */
        "MOV	r12, #0x0\n\t"
        "MOV	r4, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_mont_reduce_96_mul:\n\t"
#else
    "L_sp_3072_mont_reduce_96_mul_%=:\n\t"
#endif
        /* a[i+j+0] += m[j+0] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "UMAAL	r10, r4, r8, r7\n\t"
        "STR	r10, [%[a], r12]\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+1] += m[j+1] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "UMAAL	r10, r4, r8, r7\n\t"
        "STR	r10, [%[a], r12]\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+2] += m[j+2] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "UMAAL	r10, r4, r8, r7\n\t"
        "STR	r10, [%[a], r12]\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+3] += m[j+3] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "UMAAL	r10, r4, r8, r7\n\t"
        "STR	r10, [%[a], r12]\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* 0x180 = 96 words * 4 bytes */
        "CMP	r12, #0x180\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_3072_mont_reduce_96_mul_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_3072_mont_reduce_96_mul\n\t"
#else
        "BLT.N	L_sp_3072_mont_reduce_96_mul_%=\n\t"
#endif
        /* a[i+96] += row carry (r4) + ca (r3); new ca = carries out */
        "LDR	r10, [%[a], #384]\n\t"
        "ADDS	r4, r4, r3\n\t"
        "MOV	r3, #0x0\n\t"
        /* r3 = carry out of (r4 + ca) */
        "ADC	r3, r3, #0x0\n\t"
        "ADDS	r10, r10, r4\n\t"
        /* r3 += carry out of (a[i+96] + r4). Only the flag is added here:
         * using r3 as the second operand would count the carry recorded
         * above twice and yield 2 when r4 + ca wrapped. */
        "ADC	r3, r3, #0x0\n\t"
        "STR	r10, [%[a], #384]\n\t"
        /* i += 1 */
        "ADD	r9, r9, #0x4\n\t"
        "ADD	%[a], %[a], #0x4\n\t"
        "CMP	r9, #0x180\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_3072_mont_reduce_96_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_3072_mont_reduce_96_word\n\t"
#else
        "BLT.N	L_sp_3072_mont_reduce_96_word_%=\n\t"
#endif
        /* Loop Done */
        /* Return the final carry through the mp operand. */
        "MOV	%[mp], r3\n\t"
        : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
    );
    /* a was advanced by one word per row, so it now points 96 words past the
     * start of the buffer and (a - 96) is the original pointer. The mask
     * (0 - carry) is zero when there was no final carry and has all bits set
     * when the carry is 1. */
    sp_3072_cond_sub_96(a - 96, a, m, (sp_digit)0 - mp);
}

#endif /* !WOLFSSL_SP_SMALL */
#endif
/* Montgomery multiplication of two 96-digit numbers. (r = a * b mod m)
 *
 * The plain product of a and b is written to r first and is then
 * Montgomery-reduced in place, so r is used as the working buffer for both
 * steps.
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus.
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_3072_mont_mul_96(
        sp_digit* r,
        const sp_digit* a,
        const sp_digit* b,
        const sp_digit* m,
        sp_digit mp)
{
    /* Step 1: r = a * b. */
    sp_3072_mul_96(r, a, b);

    /* Step 2: reduce r with respect to m using the multiplier mp. */
    sp_3072_mont_reduce_96(r, m, mp);
}

/* Montgomery squaring of a 96-digit number. (r = a * a mod m)
 *
 * The plain square of a is written to r first and is then Montgomery-reduced
 * in place, so r is used as the working buffer for both steps.
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus.
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_3072_mont_sqr_96(
        sp_digit* r,
        const sp_digit* a,
        const sp_digit* m,
        sp_digit mp)
{
    /* Step 1: r = a * a. */
    sp_3072_sqr_96(r, a);

    /* Step 2: reduce r with respect to m using the multiplier mp. */
    sp_3072_mont_reduce_96(r, m, mp);
}

#ifdef WOLFSSL_SP_SMALL
/* Sub b from a into r. (r = a - b)
 *
 * Loop form: each pass subtracts four 32-bit digits and the loop ends once
 * all 0x180 bytes (96 digits) of a have been consumed.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 when there is no borrow out of the top digit and all ones
 * (0xffffffff) when there is a borrow.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_3072_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to the fixed registers r0-r2 used by the assembly. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r11 = borrow carried between passes (0 = none, -1 = borrow) */
        "MOV	r11, #0x0\n\t"
        /* r12 = address one past the last digit of a (loop end marker) */
        "ADD	r12, %[a], #0x180\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_sub_96_word:\n\t"
#else
    "L_sp_3072_sub_96_word_%=:\n\t"
#endif
        /* 0 - r11 restores the carry flag: set when r11 is 0 (no borrow),
         * clear when r11 is -1 (borrow pending). */
        "RSBS	r11, r11, #0x0\n\t"
        /* Load four digits of each operand, advancing both pointers. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        /* Subtract with borrow, least significant digit first. */
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* r3 - r3 - borrow: capture the borrow as 0 or -1 before CMP
         * overwrites the flags. */
        "SBC	r11, r3, r3\n\t"
        "CMP	%[a], r12\n\t"
#if defined(__GNUC__)
        "BNE	L_sp_3072_sub_96_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BNE.N	L_sp_3072_sub_96_word\n\t"
#else
        "BNE.N	L_sp_3072_sub_96_word_%=\n\t"
#endif
        /* Hand the final borrow back through the register that held r. */
        "MOV	%[r], r11\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
    );
    /* r no longer holds a pointer here; it carries the borrow value. */
    return (word32)(size_t)r;
}

#else
/* Sub b from a into r. (r = a - b)
 *
 * Fully unrolled form: 24 groups of four 32-bit digits (96 digits in total)
 * are subtracted as one continuous borrow chain.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 when there is no borrow out of the top digit and all ones
 * (0xffffffff) when there is a borrow.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_3072_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to the fixed registers r0-r2 used by the assembly. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* digits 0-3: SUBS starts the borrow chain */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SUBS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* digits 4-7: LDM/STM leave the flags alone, so SBCS continues the
         * chain from the previous group (same for every group below) */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* digits 8-11 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* digits 12-15 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* digits 16-19 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* digits 20-23 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* digits 24-27 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* digits 28-31 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* digits 32-35 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* digits 36-39 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* digits 40-43 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* digits 44-47 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* digits 48-51 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* digits 52-55 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* digits 56-59 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* digits 60-63 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* digits 64-67 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* digits 68-71 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* digits 72-75 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* digits 76-79 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* digits 80-83 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* digits 84-87 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* digits 88-91 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* digits 92-95 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* r6 - r6 - borrow: 0 when no borrow, -1 when the chain borrowed.
         * The value replaces the pointer held in r. */
        "SBC	%[r], r6, r6\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    /* r no longer holds a pointer here; it carries the borrow value. */
    return (word32)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_USE_UDIV
/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
 *
 * Variant that uses the hardware UDIV instruction. The quotient is built up
 * in r6 from several partial estimates, each obtained by dividing by
 * (div >> 16) + 1 and followed by subtracting the matching multiple of div
 * from the running remainder held in d1:d0.
 *
 * d1   The high order half of the number to divide.
 * d0   The low order half of the number to divide.
 * div  The divisor.
 * returns the result of the division.
 *
 * Note that this is an approximate div. It may give an answer 1 larger.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
#else
SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, sp_digit div)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to the fixed registers r0-r2 used by the assembly. */
    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r8 = top 16 bits of div; r5 = r8 + 1 is the trial divisor */
        "LSR	r8, %[div], #16\n\t"
        "ADD	r5, r8, #0x1\n\t"
        /* r6 = (d1 / r5) << 16: first estimate, placed in the upper half */
        "UDIV	r6, %[d1], r5\n\t"
        /* r7 = div << 16; with r8 it forms the 64-bit value div * 2^16 */
        "LSL	r7, %[div], #16\n\t"
        "LSL	r6, r6, #16\n\t"
        /* d1:d0 -= div * r6 */
        "UMULL	r3, r4, %[div], r6\n\t"
        "SUBS	%[d0], %[d0], r3\n\t"
        "SBC	%[d1], %[d1], r4\n\t"
        /* r9 = 1 when d1 >= r5, else 0 (SBC of a register with itself gives
         * 0 or -1 whatever r9 held before); r10 = -r9 is the matching mask */
        "SUBS	r3, %[d1], r5\n\t"
        "SBC	r9, r9, r9\n\t"
        "ADD	r9, r9, #0x1\n\t"
        "RSB	r10, r9, #0x0\n\t"
        "LSL	r9, r9, #16\n\t"
        /* When d1 >= r5: d1:d0 -= div * 2^16 and the quotient gains 1 << 16.
         * Done with masks rather than a branch. */
        "AND	r7, r7, r10\n\t"
        "AND	r8, r8, r10\n\t"
        "SUBS	%[d0], %[d0], r7\n\t"
        "ADD	r6, r6, r9\n\t"
        "SBC	%[d1], %[d1], r8\n\t"
        /* r3 = bits 16..47 of the remainder d1:d0, divided by r5 */
        "LSL	r4, %[d1], #16\n\t"
        "LSR	r3, %[d0], #16\n\t"
        "ORR	r3, r3, r4\n\t"
        "UDIV	r3, r3, r5\n\t"
        /* quotient += r3; d1:d0 -= div * r3 */
        "ADD	r6, r6, r3\n\t"
        "UMULL	r3, r4, %[div], r3\n\t"
        "SUBS	%[d0], %[d0], r3\n\t"
        "SBC	%[d1], %[d1], r4\n\t"
        /* Same estimate again on the reduced remainder */
        "LSL	r4, %[d1], #16\n\t"
        "LSR	r3, %[d0], #16\n\t"
        "ORR	r3, r3, r4\n\t"
        "UDIV	r3, r3, r5\n\t"
        /* quotient += r3; only the low word of the remainder is updated */
        "ADD	r6, r6, r3\n\t"
        "MUL	r3, %[div], r3\n\t"
        "SUB	%[d0], %[d0], r3\n\t"
        /* Final step: divide the low remainder word by the full divisor */
        "UDIV	r3, %[d0], %[div]\n\t"
        /* Result (accumulated quotient + last step) is returned via d1 */
        "ADD	%[d1], r6, r3\n\t"
        : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    return (word32)(size_t)d1;
}

#else
/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
 *
 * Variant that does not use UDIV. A first quotient estimate is produced one
 * bit at a time by shift-and-conditional-subtract against (div >> 1) + 1,
 * then corrected by multiplying back with UMULL and adding the high word of
 * the difference to the estimate.
 *
 * d1   The high order half of the number to divide.
 * d0   The low order half of the number to divide.
 * div  The divisor.
 * returns the result of the division.
 *
 * Note that this is an approximate div. It may give an answer 1 larger.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
#else
SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, sp_digit div)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to the fixed registers r0-r2 used by the assembly. */
    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r5 = (div >> 1) + 1: value compared against in the bit loop */
        "LSR	r5, %[div], #1\n\t"
        "ADD	r5, r5, #0x1\n\t"
        /* Working copy of the dividend in r7:r6; d1/d0 stay intact for the
         * correction steps after the loop */
        "MOV	r6, %[d0]\n\t"
        "MOV	r7, %[d1]\n\t"
        /* Do top 32 */
        /* r8 = all ones when r7 > r5, else 0 */
        "SUBS	r8, r5, r7\n\t"
        "SBC	r8, r8, r8\n\t"
        /* r3 (quotient estimate) = 1 when r7 > r5, else 0 */
        "MOV	r3, #0x0\n\t"
        "SUB	r3, r3, r8\n\t"
        /* Subtract r5 from r7 only when the mask is set */
        "AND	r8, r8, r5\n\t"
        "SUBS	r7, r7, r8\n\t"
        /* Next 30 bits */
        /* r4 counts 29 down to 0; the loop exits when it goes negative */
        "MOV	r4, #0x1d\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_div_3072_word_96_bit:\n\t"
#else
    "L_div_3072_word_96_bit_%=:\n\t"
#endif
        /* Shift r7:r6 left by one bit */
        "LSLS	r6, r6, #1\n\t"
        "ADC	r7, r7, r7\n\t"
        /* r8 = all ones when r7 > r5, else 0 */
        "SUBS	r8, r5, r7\n\t"
        "SBC	r8, r8, r8\n\t"
        /* r3 = (r3 << 1) + new quotient bit */
        "ADD	r3, r3, r3\n\t"
        "SUB	r3, r3, r8\n\t"
        /* Masked subtract of r5 from r7 (no branch on the compare result) */
        "AND	r8, r8, r5\n\t"
        "SUBS	r7, r7, r8\n\t"
        "SUBS	r4, r4, #0x1\n\t"
#if defined(__GNUC__)
        "BPL	L_div_3072_word_96_bit_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BPL.N	L_div_3072_word_96_bit\n\t"
#else
        "BPL.N	L_div_3072_word_96_bit_%=\n\t"
#endif
        /* r3 = (r3 << 1) + 1 */
        "ADD	r3, r3, r3\n\t"
        "ADD	r3, r3, #0x1\n\t"
        /* Correction 1: r10:r9 = d1:d0 - r3 * div; add the high word of the
         * difference to the estimate */
        "UMULL	r6, r7, r3, %[div]\n\t"
        "SUBS	r9, %[d0], r6\n\t"
        "SBC	r10, %[d1], r7\n\t"
        "ADD	r3, r3, r10\n\t"
        /* Correction 2: same again with the updated estimate */
        "UMULL	r6, r7, r3, %[div]\n\t"
        "SUBS	r9, %[d0], r6\n\t"
        "SBC	r10, %[d1], r7\n\t"
        "ADD	r3, r3, r10\n\t"
        /* Correction 3 */
        "UMULL	r6, r7, r3, %[div]\n\t"
        "SUBS	r9, %[d0], r6\n\t"
        "SBC	r10, %[d1], r7\n\t"
        "ADD	r3, r3, r10\n\t"
        /* r8 = all ones when the low remainder word r9 > div, else 0;
         * subtracting it adds one to the result in that case. The result is
         * returned via d1. */
        "SUBS	r8, %[div], r9\n\t"
        "SBC	r8, r8, r8\n\t"
        "SUB	%[d1], r3, r8\n\t"
        : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    return (word32)(size_t)d1;
}

#endif
/* Divide a by d and put the remainder into r. (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * This version takes data-dependent branches (comparisons, early loop exits
 * and conditional add-backs), so its running time depends on the values.
 *
 * a  Number to be divided (2 * 96 digits are read).
 * d  Number to divide with (96 digits).
 * m  Multiplier result (unused).
 * r  Remainder from the division (96 digits are written).
 * returns MP_OKAY indicating success.
 */
static WC_INLINE int sp_3072_div_96_cond(const sp_digit* a, const sp_digit* d,
        sp_digit* m, sp_digit* r)
{
    sp_digit num[2 * 96];
    sp_digit prod[96 + 1];
    sp_digit top;
    sp_digit q;
    sp_digit* win;
    int i;

    (void)m;

    /* Most significant digit of the divisor drives the quotient estimates. */
    top = d[95];
    XMEMCPY(num, a, sizeof(*num) * 2 * 96);

    /* Compare the upper half of the working copy with d, from the top digit
     * down, stopping at the first digit that differs (or at digit 0). */
    i = 95;
    while ((i > 0) && (num[i + 96] == d[i])) {
        i--;
    }
    /* Upper half is not smaller than d: take d off once before starting. */
    if (num[i + 96] >= d[i]) {
        sp_3072_sub_in_place_96(&num[96], d);
    }

    for (i = 95; i >= 0; i--) {
        /* win[0..95] is the current window; win[96] is the digit above it. */
        win = &num[i];

        /* Estimate the next quotient digit. */
        if (win[96] != top) {
            q = div_3072_word_96(win[96], win[95], top);
        }
        else {
            q = SP_DIGIT_MAX;
        }

        /* Subtract q * d from the window, folding the result of the in-place
         * subtract and the top product digit into the digit above. */
        sp_3072_mul_d_96(prod, d, q);
        win[96] += sp_3072_sub_in_place_96(win, prod);
        win[96] -= prod[96];

        /* A non-zero digit above the window is corrected by adding d back,
         * at most twice. */
        if (win[96] != 0) {
            win[96] += sp_3072_add_96(win, win, d);
            if (win[96] != 0) {
                win[96] += sp_3072_add_96(win, win, d);
            }
        }
    }

    /* Final comparison of the low 96 digits against d. */
    i = 95;
    while ((i > 0) && (num[i] == d[i])) {
        i--;
    }
    if (num[i] < d[i]) {
        /* Already below d: copy out as is. */
        XMEMCPY(r, num, sizeof(*num) * 96);
    }
    else {
        /* Not below d: one last subtraction gives the remainder. */
        sp_3072_sub_96(r, num, d);
    }

    return MP_OKAY;
}

/* Reduce a modulo m into r. (r = a mod m)
 *
 * Thin wrapper around sp_3072_div_96_cond() that asks only for the
 * remainder.
 *
 * r  A single precision number that is the reduced result.
 * a  A single precision number that is to be reduced.
 * m  A single precision number that is the modulus to reduce with.
 * returns MP_OKAY indicating success.
 */
static WC_INLINE int sp_3072_mod_96_cond(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    int ret;

    /* No quotient buffer is supplied; the divide routine ignores it. */
    ret = sp_3072_div_96_cond(a, m, NULL, r);

    return ret;
}

#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
#if defined(WOLFSSL_HAVE_SP_DH) || !defined(WOLFSSL_RSA_PUBLIC_ONLY)
/* AND the mask m into each of the 96 digits of a and store the result in r.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * m  Mask to AND against each digit.
 */
static void sp_3072_mask_96(sp_digit* r, const sp_digit* a, sp_digit m)
{
    int i = 0;

#ifdef WOLFSSL_SP_SMALL
    /* Compact build: one digit per pass. */
    while (i < 96) {
        r[i] = a[i] & m;
        i++;
    }
#else
    /* Default build: eight digits per pass (96 = 12 * 8). */
    while (i < 96) {
        sp_digit* out = r + i;
        const sp_digit* in = a + i;

        out[0] = in[0] & m;
        out[1] = in[1] & m;
        out[2] = in[2] & m;
        out[3] = in[3] & m;
        out[4] = in[4] & m;
        out[5] = in[5] & m;
        out[6] = in[6] & m;
        out[7] = in[7] & m;

        i += 8;
    }
#endif
}

/* Compare a with b in constant time.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_int32 sp_3072_cmp_96(const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_int32 sp_3072_cmp_96(const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register const sp_digit* a __asm__ ("r0") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        "MOV	r2, #0xffffffff\n\t"
        "MOV	r8, #0x1\n\t"
        "MOV	r7, #0x0\n\t"
        "MOV	r3, #0xffffffff\n\t"
#ifdef WOLFSSL_SP_SMALL
        "MOV	r6, #0x17c\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_cmp_96_words:\n\t"
#else
    "L_sp_3072_cmp_96_words_%=:\n\t"
#endif
        "LDR	r4, [%[a], r6]\n\t"
        "LDR	r5, [%[b], r6]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "SUBS	r6, r6, #0x4\n\t"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "bcs	L_sp_3072_cmp_96_words\n\t"
#else
        "bcs	L_sp_3072_cmp_96_words_%=\n\t"
#endif
        "EOR	r2, r2, r3\n\t"
#else
        "LDR	r4, [%[a], #380]\n\t"
        "LDR	r5, [%[b], #380]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #376]\n\t"
        "LDR	r5, [%[b], #376]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #372]\n\t"
        "LDR	r5, [%[b], #372]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #368]\n\t"
        "LDR	r5, [%[b], #368]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #364]\n\t"
        "LDR	r5, [%[b], #364]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #360]\n\t"
        "LDR	r5, [%[b], #360]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #356]\n\t"
        "LDR	r5, [%[b], #356]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #352]\n\t"
        "LDR	r5, [%[b], #352]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #348]\n\t"
        "LDR	r5, [%[b], #348]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #344]\n\t"
        "LDR	r5, [%[b], #344]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #340]\n\t"
        "LDR	r5, [%[b], #340]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #336]\n\t"
        "LDR	r5, [%[b], #336]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #332]\n\t"
        "LDR	r5, [%[b], #332]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #328]\n\t"
        "LDR	r5, [%[b], #328]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #324]\n\t"
        "LDR	r5, [%[b], #324]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #320]\n\t"
        "LDR	r5, [%[b], #320]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #316]\n\t"
        "LDR	r5, [%[b], #316]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #312]\n\t"
        "LDR	r5, [%[b], #312]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #308]\n\t"
        "LDR	r5, [%[b], #308]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #304]\n\t"
        "LDR	r5, [%[b], #304]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #300]\n\t"
        "LDR	r5, [%[b], #300]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #296]\n\t"
        "LDR	r5, [%[b], #296]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #292]\n\t"
        "LDR	r5, [%[b], #292]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #288]\n\t"
        "LDR	r5, [%[b], #288]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #284]\n\t"
        "LDR	r5, [%[b], #284]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #280]\n\t"
        "LDR	r5, [%[b], #280]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #276]\n\t"
        "LDR	r5, [%[b], #276]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #272]\n\t"
        "LDR	r5, [%[b], #272]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #268]\n\t"
        "LDR	r5, [%[b], #268]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #264]\n\t"
        "LDR	r5, [%[b], #264]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #260]\n\t"
        "LDR	r5, [%[b], #260]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #256]\n\t"
        "LDR	r5, [%[b], #256]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #252]\n\t"
        "LDR	r5, [%[b], #252]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #248]\n\t"
        "LDR	r5, [%[b], #248]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #244]\n\t"
        "LDR	r5, [%[b], #244]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #240]\n\t"
        "LDR	r5, [%[b], #240]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #236]\n\t"
        "LDR	r5, [%[b], #236]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #232]\n\t"
        "LDR	r5, [%[b], #232]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #228]\n\t"
        "LDR	r5, [%[b], #228]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #224]\n\t"
        "LDR	r5, [%[b], #224]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #220]\n\t"
        "LDR	r5, [%[b], #220]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #216]\n\t"
        "LDR	r5, [%[b], #216]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #212]\n\t"
        "LDR	r5, [%[b], #212]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #208]\n\t"
        "LDR	r5, [%[b], #208]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #204]\n\t"
        "LDR	r5, [%[b], #204]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #200]\n\t"
        "LDR	r5, [%[b], #200]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #196]\n\t"
        "LDR	r5, [%[b], #196]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #192]\n\t"
        "LDR	r5, [%[b], #192]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #188]\n\t"
        "LDR	r5, [%[b], #188]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #184]\n\t"
        "LDR	r5, [%[b], #184]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #180]\n\t"
        "LDR	r5, [%[b], #180]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #176]\n\t"
        "LDR	r5, [%[b], #176]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #172]\n\t"
        "LDR	r5, [%[b], #172]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #168]\n\t"
        "LDR	r5, [%[b], #168]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #164]\n\t"
        "LDR	r5, [%[b], #164]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #160]\n\t"
        "LDR	r5, [%[b], #160]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #156]\n\t"
        "LDR	r5, [%[b], #156]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #152]\n\t"
        "LDR	r5, [%[b], #152]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #148]\n\t"
        "LDR	r5, [%[b], #148]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #144]\n\t"
        "LDR	r5, [%[b], #144]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #140]\n\t"
        "LDR	r5, [%[b], #140]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #136]\n\t"
        "LDR	r5, [%[b], #136]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #132]\n\t"
        "LDR	r5, [%[b], #132]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #128]\n\t"
        "LDR	r5, [%[b], #128]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #124]\n\t"
        "LDR	r5, [%[b], #124]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #120]\n\t"
        "LDR	r5, [%[b], #120]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #116]\n\t"
        "LDR	r5, [%[b], #116]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #112]\n\t"
        "LDR	r5, [%[b], #112]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #108]\n\t"
        "LDR	r5, [%[b], #108]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #104]\n\t"
        "LDR	r5, [%[b], #104]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #100]\n\t"
        "LDR	r5, [%[b], #100]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #96]\n\t"
        "LDR	r5, [%[b], #96]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #92]\n\t"
        "LDR	r5, [%[b], #92]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #88]\n\t"
        "LDR	r5, [%[b], #88]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #84]\n\t"
        "LDR	r5, [%[b], #84]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #80]\n\t"
        "LDR	r5, [%[b], #80]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #76]\n\t"
        "LDR	r5, [%[b], #76]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #72]\n\t"
        "LDR	r5, [%[b], #72]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #68]\n\t"
        "LDR	r5, [%[b], #68]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #64]\n\t"
        "LDR	r5, [%[b], #64]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #60]\n\t"
        "LDR	r5, [%[b], #60]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #56]\n\t"
        "LDR	r5, [%[b], #56]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #52]\n\t"
        "LDR	r5, [%[b], #52]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #48]\n\t"
        "LDR	r5, [%[b], #48]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #44]\n\t"
        "LDR	r5, [%[b], #44]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #40]\n\t"
        "LDR	r5, [%[b], #40]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #36]\n\t"
        "LDR	r5, [%[b], #36]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #32]\n\t"
        "LDR	r5, [%[b], #32]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #28]\n\t"
        "LDR	r5, [%[b], #28]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #24]\n\t"
        "LDR	r5, [%[b], #24]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #20]\n\t"
        "LDR	r5, [%[b], #20]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #16]\n\t"
        "LDR	r5, [%[b], #16]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #12]\n\t"
        "LDR	r5, [%[b], #12]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #8]\n\t"
        "LDR	r5, [%[b], #8]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #4]\n\t"
        "LDR	r5, [%[b], #4]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a]]\n\t"
        "LDR	r5, [%[b]]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "EOR	r2, r2, r3\n\t"
#endif /*WOLFSSL_SP_SMALL */
        "MOV	%[a], r2\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "cc"
    );
    return (word32)(size_t)a;
}

/* Divide a by d and write only the remainder to r (m*d + r = a).
 * The quotient is not produced; m is accepted for interface symmetry and
 * ignored.
 *
 * a  Number to be divided. 2 * 96 digits are copied from it.
 * d  Number to divide with. Its top digit, d[95], is used as the word
 *    divisor when estimating each quotient digit.
 * m  Multiplier result. Unused.
 * r  Remainder from the division.
 * returns MP_OKAY indicating success.
 */
static WC_INLINE int sp_3072_div_96(const sp_digit* a, const sp_digit* d,
        sp_digit* m, sp_digit* r)
{
    sp_digit rem[2 * 96];
    sp_digit prod[96 + 1];
    sp_digit top_d;
    sp_digit q;
    int i;

    (void)m;

    top_d = d[95];
    XMEMCPY(rem, a, sizeof(*rem) * 2 * 96);

    /* Bring the upper half below d before the digit-by-digit pass. */
    q = sp_3072_cmp_96(&rem[96], d) >= 0;
    sp_3072_cond_sub_96(&rem[96], &rem[96], d, (sp_digit)0 - q);

    /* Walk the window down one digit at a time: i = 95, 94, ..., 0. */
    i = 96;
    while (i-- > 0) {
        sp_digit* lo = &rem[i];
        sp_digit* top = &rem[96 + i];
        /* All ones when the top digit equals the top digit of d. */
        volatile sp_digit mask = (sp_digit)0 - (*top == top_d);
        sp_digit hi = *top + mask;

        /* Estimate the quotient digit; forced to all ones when masked. */
        q = div_3072_word_96(hi, top[-1], top_d);
        q |= mask;

        /* Remove q * d from the current window. */
        sp_3072_mul_d_96(prod, d, q);
        *top += sp_3072_sub_in_place_96(lo, prod);
        *top -= prod[96];

        /* Two correction passes: add back d masked by the top digit. */
        sp_3072_mask_96(prod, d, *top);
        *top += sp_3072_add_96(lo, lo, prod);
        sp_3072_mask_96(prod, d, *top);
        *top += sp_3072_add_96(lo, lo, prod);
    }

    /* One last conditional subtraction, result written to r. */
    q = sp_3072_cmp_96(rem, d) >= 0;
    sp_3072_cond_sub_96(r, rem, d, (sp_digit)0 - q);

    return MP_OKAY;
}

/* Reduce a modulo m into r. (r = a mod m)
 *
 * Thin wrapper around sp_3072_div_96() that passes NULL for the multiplier
 * result, which that function does not compute.
 *
 * r  A single precision number that is the reduced result.
 * a  A single precision number that is to be reduced. sp_3072_div_96()
 *    copies 2 * 96 digits from it, so that many digits must be readable.
 * m  A single precision number that is the modulus to reduce with.
 * returns MP_OKAY indicating success.
 */
static WC_INLINE int sp_3072_mod_96(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    return sp_3072_div_96(a, m, NULL, r);
}

#endif /* WOLFSSL_HAVE_SP_DH || !WOLFSSL_RSA_PUBLIC_ONLY */
#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \
                                                     defined(WOLFSSL_HAVE_SP_DH)
#ifdef WOLFSSL_SP_SMALL
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * The exponent is consumed from the most significant bit downwards in 3-bit
 * windows. Each window selects one of eight precomputed Montgomery form
 * values: t[0] is the Montgomery normalizer and t[1]..t[7] are the powers of
 * the base.
 *
 * r        A single precision number that is the result of the operation.
 *          Used as working space: 2 * 96 digits are written.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When not 0, a is reduced modulo m before being converted to
 *          Montgomery form.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_3072_mod_exp_96(sp_digit* r, const sp_digit* a, const sp_digit* e,
        int bits, const sp_digit* m, int reduceA)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[8 * 192];
#endif
    sp_digit* t[8];
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (8 * 192), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* norm shares storage with t[0]. */
        norm = td;
        for (i=0; i<8; i++) {
            t[i] = td + i * 192;
        }

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_96(norm, m);

        /* Convert the base to Montgomery form: place it in the upper 96
         * digits of t[1] above a zeroed lower half and reduce modulo m. */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 96U);
        if (reduceA != 0) {
            err = sp_3072_mod_96(t[1] + 96, a, m);
            if (err == MP_OKAY) {
                err = sp_3072_mod_96(t[1], t[1], m);
            }
        }
        else {
            XMEMCPY(t[1] + 96, a, sizeof(sp_digit) * 96);
            err = sp_3072_mod_96(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Fill the window table: t[k] is built from lower entries. */
        sp_3072_mont_sqr_96(t[ 2], t[ 1], m, mp);
        sp_3072_mont_mul_96(t[ 3], t[ 2], t[ 1], m, mp);
        sp_3072_mont_sqr_96(t[ 4], t[ 2], m, mp);
        sp_3072_mont_mul_96(t[ 5], t[ 3], t[ 2], m, mp);
        sp_3072_mont_sqr_96(t[ 6], t[ 3], m, mp);
        sp_3072_mont_mul_96(t[ 7], t[ 4], t[ 3], m, mp);

        /* Start at the top exponent word. c is the number of bits of n
         * still to be consumed after the first (possibly short) window of
         * bits % 3 bits; a full window is taken when bits % 3 is 0. */
        i = (bits - 1) / 32;
        n = e[i--];
        c = bits & 31;
        if (c == 0) {
            c = 32;
        }
        c -= bits % 3;
        if (c == 32) {
            c = 29;
        }
        if (c < 0) {
            /* Number of bits in top word is less than number needed.
             * Take the missing c bits from the top of the next 32-bit
             * word; 32 - c bits of that word then remain. */
            c = -c;
            y = (byte)(n << c);
            n = e[i--];
            y |= (byte)(n >> (32 - c));
            n <<= c;
            c = 32 - c;
        }
        else if (c == 0) {
            /* All bits in top word used. */
            y = (byte)n;
        }
        else {
            y = (byte)(n >> c);
            n <<= 32 - c;
        }
        XMEMCPY(r, t[y], sizeof(sp_digit) * 96);
        for (; i>=0 || c>=3; ) {
            if (c == 0) {
                /* Current word exhausted: load the next one. */
                n = e[i--];
                y = (byte)(n >> 29);
                n <<= 3;
                c = 29;
            }
            else if (c < 3) {
                /* Window straddles two words. */
                y = (byte)(n >> 29);
                n = e[i--];
                c = 3 - c;
                y |= (byte)(n >> (32 - c));
                n <<= c;
                c = 32 - c;
            }
            else {
                y = (byte)((n >> 29) & 0x7);
                n <<= 3;
                c -= 3;
            }

            /* Three squarings per 3-bit window, then multiply in t[y]. */
            sp_3072_mont_sqr_96(r, r, m, mp);
            sp_3072_mont_sqr_96(r, r, m, mp);
            sp_3072_mont_sqr_96(r, r, m, mp);

            sp_3072_mont_mul_96(r, r, t[y], m, mp);
        }

        /* Convert out of Montgomery form. */
        XMEMSET(&r[96], 0, sizeof(sp_digit) * 96U);
        sp_3072_mont_reduce_96(r, m, mp);

        mask = (sp_digit)0 - (sp_3072_cmp_96(r, m) >= 0);
        sp_3072_cond_sub_96(r, r, m, mask);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#else
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * The exponent is consumed from the most significant bit downwards in 4-bit
 * windows. Each window selects one of sixteen precomputed Montgomery form
 * values: t[0] is the Montgomery normalizer and t[1]..t[15] are the powers
 * of the base.
 *
 * r        A single precision number that is the result of the operation.
 *          Used as working space: 2 * 96 digits are written.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When not 0, a is reduced modulo m before being converted to
 *          Montgomery form.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_3072_mod_exp_96(sp_digit* r, const sp_digit* a, const sp_digit* e,
        int bits, const sp_digit* m, int reduceA)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[16 * 192];
#endif
    sp_digit* t[16];
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (16 * 192), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* norm shares storage with t[0]. */
        norm = td;
        for (i=0; i<16; i++) {
            t[i] = td + i * 192;
        }

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_96(norm, m);

        /* Convert the base to Montgomery form: place it in the upper 96
         * digits of t[1] above a zeroed lower half and reduce modulo m. */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 96U);
        if (reduceA != 0) {
            err = sp_3072_mod_96(t[1] + 96, a, m);
            if (err == MP_OKAY) {
                err = sp_3072_mod_96(t[1], t[1], m);
            }
        }
        else {
            XMEMCPY(t[1] + 96, a, sizeof(sp_digit) * 96);
            err = sp_3072_mod_96(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Fill the window table: t[k] is built from lower entries. */
        sp_3072_mont_sqr_96(t[ 2], t[ 1], m, mp);
        sp_3072_mont_mul_96(t[ 3], t[ 2], t[ 1], m, mp);
        sp_3072_mont_sqr_96(t[ 4], t[ 2], m, mp);
        sp_3072_mont_mul_96(t[ 5], t[ 3], t[ 2], m, mp);
        sp_3072_mont_sqr_96(t[ 6], t[ 3], m, mp);
        sp_3072_mont_mul_96(t[ 7], t[ 4], t[ 3], m, mp);
        sp_3072_mont_sqr_96(t[ 8], t[ 4], m, mp);
        sp_3072_mont_mul_96(t[ 9], t[ 5], t[ 4], m, mp);
        sp_3072_mont_sqr_96(t[10], t[ 5], m, mp);
        sp_3072_mont_mul_96(t[11], t[ 6], t[ 5], m, mp);
        sp_3072_mont_sqr_96(t[12], t[ 6], m, mp);
        sp_3072_mont_mul_96(t[13], t[ 7], t[ 6], m, mp);
        sp_3072_mont_sqr_96(t[14], t[ 7], m, mp);
        sp_3072_mont_mul_96(t[15], t[ 8], t[ 7], m, mp);

        /* Start at the top exponent word. c is the number of bits of n
         * still to be consumed after the first (possibly short) window of
         * bits % 4 bits; a full window is taken when bits % 4 is 0. */
        i = (bits - 1) / 32;
        n = e[i--];
        c = bits & 31;
        if (c == 0) {
            c = 32;
        }
        c -= bits % 4;
        if (c == 32) {
            c = 28;
        }
        if (c < 0) {
            /* Number of bits in top word is less than number needed.
             * Not expected to be reached for positive bits with a 4-bit
             * window (4 divides the 32-bit word size); the constants are
             * kept correct for 32-bit words regardless. */
            c = -c;
            y = (byte)(n << c);
            n = e[i--];
            y |= (byte)(n >> (32 - c));
            n <<= c;
            c = 32 - c;
        }
        else if (c == 0) {
            /* All bits in top word used. */
            y = (byte)n;
        }
        else {
            y = (byte)(n >> c);
            n <<= 32 - c;
        }
        XMEMCPY(r, t[y], sizeof(sp_digit) * 96);
        for (; i>=0 || c>=4; ) {
            if (c == 0) {
                /* Current word exhausted: load the next one. */
                n = e[i--];
                y = (byte)(n >> 28);
                n <<= 4;
                c = 28;
            }
            else if (c < 4) {
                /* Window straddles two words. */
                y = (byte)(n >> 28);
                n = e[i--];
                c = 4 - c;
                y |= (byte)(n >> (32 - c));
                n <<= c;
                c = 32 - c;
            }
            else {
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c -= 4;
            }

            /* Four squarings per 4-bit window, then multiply in t[y]. */
            sp_3072_mont_sqr_96(r, r, m, mp);
            sp_3072_mont_sqr_96(r, r, m, mp);
            sp_3072_mont_sqr_96(r, r, m, mp);
            sp_3072_mont_sqr_96(r, r, m, mp);

            sp_3072_mont_mul_96(r, r, t[y], m, mp);
        }

        /* Convert out of Montgomery form. */
        XMEMSET(&r[96], 0, sizeof(sp_digit) * 96U);
        sp_3072_mont_reduce_96(r, m, mp);

        mask = (sp_digit)0 - (sp_3072_cmp_96(r, m) >= 0);
        sp_3072_cond_sub_96(r, r, m, mask);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#endif /* WOLFSSL_SP_SMALL */
#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */

#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */
#ifdef WOLFSSL_HAVE_SP_RSA
/* RSA public key operation.
 *
 * Exponents 0x10001 and 0x3 have dedicated paths; any other exponent of up
 * to 32 bits goes through a bit-by-bit Montgomery square and multiply.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base. At most 384.
 * em      Public exponent. At most 32 bits and not zero.
 * mm      Modulus. Exactly 3072 bits and odd.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 384 bytes long.
 * outLen  On entry the size of out; set to 384 on success.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * the exponent or input is too long or the modulus is not 3072 bits, MP_VAL
 * when the modulus is even, MP_EXPTMOD_E when the exponent is zero and
 * MEMORY_E when dynamic memory allocation fails.
 */
int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em,
    const mp_int* mm, byte* out, word32* outLen)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* a = NULL;
#else
    sp_digit a[96 * 5];
#endif
    sp_digit* ah = NULL;
    sp_digit* r = NULL;
    sp_digit* m = NULL;
    sp_digit e[1] = {0};
    sp_digit mp = 1;
    int i;
    int err = MP_OKAY;

    /* Parameter checks, first failure wins. */
    if (*outLen < 384) {
        err = MP_TO_E;
    }
    if ((err == MP_OKAY) && ((mp_count_bits(em) > 32) || (inLen > 384) ||
                                           (mp_count_bits(mm) != 3072))) {
        err = MP_READ_E;
    }
    if ((err == MP_OKAY) && (mp_iseven(mm))) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 96 * 5, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (a == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Layout of a: [0,96) low half, [96,192) base, [192,384) result,
         * [384,480) modulus. */
        ah = a + 96;
        r = a + 96 * 2;
        m = r + 96 * 2;

        sp_3072_from_bin(ah, 96, in, inLen);

        /* Gather the exponent into a single digit. */
        e[0] = em->dp[0];
#if DIGIT_BIT < 32
        if (em->used > 1) {
            e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT;
        }
#endif
        if (e[0] == 0) {
            err = MP_EXPTMOD_E;
        }
    }

    if (err == MP_OKAY) {
        sp_3072_from_mp(m, 96, mm);

        if (e[0] == 0x10001) {
            sp_3072_mont_setup(m, &mp);

            /* Convert to Montgomery form: zero the low half below the base
             * and reduce, giving r = a.R mod m. */
            XMEMSET(a, 0, sizeof(sp_digit) * 96);
            err = sp_3072_mod_96_cond(r, a, m);

            if (err == MP_OKAY) {
                /* r = a ^ 0x10000 => sixteen squarings. */
                for (i = 0; i < 16; i++) {
                    sp_3072_mont_sqr_96(r, r, m, mp);
                }
                /* Multiplying by the plain base also leaves Montgomery
                 * form: mont_red(r.R * a) = r.a mod m. */
                sp_3072_mont_mul_96(r, r, ah, m, mp);

                /* Find the highest differing digit; subtract when r >= m. */
                i = 95;
                while ((i > 0) && (r[i] == m[i])) {
                    i--;
                }
                if (r[i] >= m[i]) {
                    sp_3072_sub_in_place_96(r, m);
                }
            }
        }
        else if (e[0] == 0x3) {
            /* r = a^2 mod m, then r = a * r mod m. */
            sp_3072_sqr_96(r, ah);
            err = sp_3072_mod_96_cond(r, r, m);
            if (err == MP_OKAY) {
                sp_3072_mul_96(r, ah, r);
                err = sp_3072_mod_96_cond(r, r, m);
            }
        }
        else {
            sp_3072_mont_setup(m, &mp);

            /* Convert to Montgomery form, in place in a. */
            XMEMSET(a, 0, sizeof(sp_digit) * 96);
            err = sp_3072_mod_96_cond(a, a, m);

            if (err == MP_OKAY) {
                /* Locate the top set bit of the exponent. */
                i = 31;
                while ((i >= 0) && ((e[0] >> i) == 0)) {
                    i--;
                }

                /* Top bit is covered by the initial copy; process the
                 * remaining bits from high to low. */
                XMEMCPY(r, a, sizeof(sp_digit) * 96);
                for (i = i - 1; i >= 0; i--) {
                    sp_3072_mont_sqr_96(r, r, m, mp);
                    if (((e[0] >> i) & 1) != 0) {
                        sp_3072_mont_mul_96(r, r, a, m, mp);
                    }
                }

                /* Convert out of Montgomery form. */
                XMEMSET(&r[96], 0, sizeof(sp_digit) * 96);
                sp_3072_mont_reduce_96(r, m, mp);

                /* Find the highest differing digit; subtract when r >= m. */
                i = 95;
                while ((i > 0) && (r[i] == m[i])) {
                    i--;
                }
                if (r[i] >= m[i]) {
                    sp_3072_sub_in_place_96(r, m);
                }
            }
        }
    }

    if (err == MP_OKAY) {
        sp_3072_to_bin_96(r, out);
        *outLen = 384;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    XFREE(a, NULL, DYNAMIC_TYPE_RSA);
#endif

    return err;
}

#ifndef WOLFSSL_RSA_PUBLIC_ONLY
#ifdef WOLFSSL_SP_SMALL
/* Conditionally add a and b using the mask m.
 * m is -1 to add and 0 when not.
 *
 * Loop form: 48 32-bit words are processed, one per iteration. Each word of
 * b is ANDed with m before being added, with carry, to the matching word of
 * a.
 *
 * r  A single precision number representing conditional add result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply.
 * returns the carry out of the last word added (0 or 1).
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_3072_cond_add_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
#else
static sp_digit sp_3072_cond_add_48(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0-r3 for use as the asm operands. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r5 = carry saved between iterations, r8 = constant zero,
         * r4 = byte offset into r, a and b. */
        "MOV	r5, #0x0\n\t"
        "MOV	r8, #0x0\n\t"
        "MOV	r4, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_3072_cond_add_48_words:\n\t"
#else
    "L_sp_3072_cond_add_48_words_%=:\n\t"
#endif
        /* Restore the carry flag from r5: adding 0xffffffff carries out
         * only when r5 is 1. */
        "ADDS	r5, r5, #0xffffffff\n\t"
        "LDR	r6, [%[a], r4]\n\t"
        "LDR	r7, [%[b], r4]\n\t"
        /* Mask the word of b: unchanged when m is -1, zero when m is 0. */
        "AND	r7, r7, %[m]\n\t"
        "ADCS	r6, r6, r7\n\t"
        /* r5 = 0 + 0 + carry: save the carry as the CMP below overwrites
         * the flags. */
        "ADC	r5, r8, r8\n\t"
        "STR	r6, [%[r], r4]\n\t"
        "ADD	r4, r4, #0x4\n\t"
        /* 0xc0 = 48 words * 4 bytes. */
        "CMP	r4, #0xc0\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_3072_cond_add_48_words_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_3072_cond_add_48_words\n\t"
#else
        "BLT.N	L_sp_3072_cond_add_48_words_%=\n\t"
#endif
        /* Hand the final carry back through the r operand. */
        "MOV	%[r], r5\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        : "memory", "r4", "r5", "r6", "r7", "r8", "cc"
    );
    return (word32)(size_t)r;
}

#else
/* Conditionally add a and b using the mask m.
 * m is -1 to add and 0 when not.
 *
 * Unrolled variant: 24 identical groups each load two words of a and b, mask
 * the b words with m and add them with carry. The carry travels from group to
 * group in the C flag, so no flag-setting instruction may be placed between
 * the groups.
 *
 * r  A single precision number representing conditional add result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply.
 * returns the carry out of the most significant word (0 or 1).
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_3072_cond_add_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
#else
static sp_digit sp_3072_cond_add_48(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0-r3 so the asm below can name them directly. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r10 = constant zero, used at the end to turn the C flag into 0/1. */
        "MOV	r10, #0x0\n\t"
        /* Words 0-1: the only group that starts the chain with ADDS (no carry
         * in). LDM/STM with '!' advance a, b and r by 8 bytes each time. */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADDS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 2-47: 23 more groups, each continuing the carry with ADCS. */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* r = 0 + 0 + C: the final carry replaces the (advanced) r pointer. */
        "ADC	%[r], r10, r10\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        /* r4 and r5 are declared clobbered although this variant never
         * touches them. */
        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    /* r no longer holds a pointer here - it carries the carry value. */
    return (word32)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
/* RSA private key operation.
 *
 * Two builds exist:
 *  - SP_RSA_PRIVATE_EXP_D or RSA_LOW_MEM: a single exponentiation
 *    out = in ^ dm mod mm. The CRT parameters are ignored.
 *  - otherwise: CRT. Two half-size exponentiations (mod pm with dpm, mod qm
 *    with dqm) are recombined using qim. dm is ignored; mm is only validated.
 *
 * All working storage, including the copies of the private values, is
 * zeroized before returning.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base.
 * dm      Private exponent.
 * pm      First prime.
 * qm      Second prime.
 * dpm     First prime's CRT exponent.
 * dqm     Second prime's CRT exponent.
 * qim     Inverse of second prime mod p.
 * mm      Modulus.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 384 bytes long.
 * outLen  Number of bytes in result.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long or the modulus is not 3072 bits, MP_VAL when the
 * modulus (or, in the CRT build, a prime) is even and MEMORY_E when dynamic
 * memory allocation fails.
 */
int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm,
    const mp_int* pm, const mp_int* qm, const mp_int* dpm, const mp_int* dqm,
    const mp_int* qim, const mp_int* mm, byte* out, word32* outLen)
{
#if defined(SP_RSA_PRIVATE_EXP_D) || defined(RSA_LOW_MEM)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* d = NULL;
#else
    sp_digit  d[96 * 4];
#endif
    sp_digit* a = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;

    (void)pm;
    (void)qm;
    (void)dpm;
    (void)dqm;
    (void)qim;

    if (*outLen < 384U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (mp_count_bits(dm) > 3072) {
           err = MP_READ_E;
        }
        else if (inLen > 384) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 3072) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 96 * 4, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (d == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Buffer layout (in words): d[0..95] | a[0..191] | m[0..95].
         * The result is written over the base. */
        a = d + 96;
        m = a + 192;
        r = a;

        sp_3072_from_bin(a, 96, in, inLen);
        sp_3072_from_mp(d, 96, dm);
        sp_3072_from_mp(m, 96, mm);
        err = sp_3072_mod_exp_96(r, a, d, 3072, m, 0);
    }

    if (err == MP_OKAY) {
        sp_3072_to_bin_96(r, out);
        *outLen = 384;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (d != NULL)
#endif
    {
        /* d holds a copy of the private exponent and a/r hold the input and
         * the result, so wipe the whole working buffer rather than only the
         * first 96 words of a. */
        ForceZero(d, sizeof(sp_digit) * 96 * 4);
#ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(d, NULL, DYNAMIC_TYPE_RSA);
#endif
    }

    return err;
#else
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* a = NULL;
#else
    sp_digit a[48 * 11];
#endif
    sp_digit* p = NULL;
    sp_digit* q = NULL;
    sp_digit* dp = NULL;
    sp_digit* tmpa = NULL;
    sp_digit* tmpb = NULL;
    sp_digit* r = NULL;
    sp_digit* qi = NULL;
    sp_digit* dq = NULL;
    sp_digit c;
    int err = MP_OKAY;

    (void)dm;
    (void)mm;

    if (*outLen < 384) {
        err = MP_TO_E;
    }
    else if (inLen > 384 || mp_count_bits(mm) != 3072) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mm)) {
        err = MP_VAL;
    }
    else if (mp_iseven(pm)) {
        err = MP_VAL;
    }
    else if (mp_iseven(qm)) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 48 * 11, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (a == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Buffer layout (in words):
         *   a/r[0..191] | p[0..47] | q[0..47] | dp/dq/qi[0..47] |
         *   tmpa[0..95] | tmpb[0..95]
         * dp, dq and qi share one slot because each is needed only after
         * the previous one is finished with. */
        p = a + 96 * 2;
        q = p + 48;
        qi = dq = dp = q + 48;
        tmpa = qi + 48;
        tmpb = tmpa + 96;
        r = a;

        sp_3072_from_bin(a, 96, in, inLen);
        sp_3072_from_mp(p, 48, pm);
        sp_3072_from_mp(q, 48, qm);
        sp_3072_from_mp(dp, 48, dpm);

        /* tmpa = a ^ dp mod p */
        err = sp_3072_mod_exp_48(tmpa, a, dp, 1536, p, 1);
    }
    if (err == MP_OKAY) {
        /* tmpb = a ^ dq mod q */
        sp_3072_from_mp(dq, 48, dqm);
        err = sp_3072_mod_exp_48(tmpb, a, dq, 1536, q, 1);
    }

    if (err == MP_OKAY) {
        /* tmpa = (tmpa - tmpb) mod p. The value returned by the subtraction
         * is used as the mask for adding p back; presumably it is 0 or all
         * ones (borrow) - confirm against sp_3072_sub_in_place_48. p is
         * added a second time only while the mask is still set. */
        c = sp_3072_sub_in_place_48(tmpa, tmpb);
        c += sp_3072_cond_add_48(tmpa, tmpa, p, c);
        sp_3072_cond_add_48(tmpa, tmpa, p, c);

        /* tmpa = tmpa * qi mod p */
        sp_3072_from_mp(qi, 48, qim);
        sp_3072_mul_48(tmpa, tmpa, qi);
        err = sp_3072_mod_48(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        /* r = tmpb + q * tmpa; tmpb is widened to 96 words with zeros. */
        sp_3072_mul_48(tmpa, q, tmpa);
        XMEMSET(&tmpb[48], 0, sizeof(sp_digit) * 48);
        sp_3072_add_96(r, tmpb, tmpa);

        sp_3072_to_bin_96(r, out);
        *outLen = 384;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (a != NULL)
#endif
    {
        /* Everything in the buffer is derived from private key material. */
        ForceZero(a, sizeof(sp_digit) * 48 * 11);
    #ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(a, NULL, DYNAMIC_TYPE_RSA);
    #endif
    }
#endif /* SP_RSA_PRIVATE_EXP_D || RSA_LOW_MEM */
    return err;
}
#endif /* WOLFSSL_RSA_PUBLIC_ONLY */
#endif /* WOLFSSL_HAVE_SP_RSA */
#if defined(WOLFSSL_HAVE_SP_DH) || (defined(WOLFSSL_HAVE_SP_RSA) && \
                                              !defined(WOLFSSL_RSA_PUBLIC_ONLY))
/* Convert an array of sp_digit to an mp_int.
 *
 * The 96 32-bit words of a are repacked into DIGIT_BIT-sized digits of r.
 * One of three paths is compiled depending on how DIGIT_BIT compares with
 * the 32-bit word size; r is clamped afterwards so leading zero digits are
 * not counted.
 *
 * a  A single precision integer.
 * r  A multi-precision integer.
 * returns MP_OKAY on success, otherwise the error returned by mp_grow().
 */
static int sp_3072_to_mp(const sp_digit* a, mp_int* r)
{
    int err;

    err = mp_grow(r, (3072 + DIGIT_BIT - 1) / DIGIT_BIT);
    if (err == MP_OKAY) { /*lint !e774 case where err is always MP_OKAY*/
#if DIGIT_BIT == 32
        /* Same digit size: straight copy. */
        XMEMCPY(r->dp, a, sizeof(sp_digit) * 96);
        r->used = 96;
        mp_clamp(r);
#elif DIGIT_BIT < 32
        /* Each 32-bit word is spread over several smaller digits.
         * j indexes the digit being filled, s is the bit position within it
         * (on entry to an iteration) or within the word (inside it). */
        int i;
        int j = 0;
        int s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 96; i++) {
            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 32) {
                s += DIGIT_BIT;
                r->dp[j++] &= ((sp_digit)1 << DIGIT_BIT) - 1;
                if (s == SP_WORD_SIZE) {
                    r->dp[j] = 0;
                }
                else {
                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 32 - s;
        }
        r->used = (3072 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#else
        /* Several 32-bit words are packed into each wider digit.
         * All shifts and the digit mask are done in mp_digit width: the
         * digit is wider than 32 bits here, so shifting a 32-bit sp_digit by
         * DIGIT_BIT (mask) or by 32 (carry when DIGIT_BIT is 64) would be
         * undefined.
         * NOTE(review): when DIGIT_BIT divides 3072 (e.g. 64) the last pass
         * stores a zero to r->dp[used]; confirm mp_grow() leaves room for
         * that digit. */
        int i;
        int j = 0;
        int s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 96; i++) {
            r->dp[j] |= ((mp_digit)a[i]) << s;
            if (s + 32 >= DIGIT_BIT) {
    #if DIGIT_BIT != 32 && DIGIT_BIT != 64
                r->dp[j] &= ((mp_digit)1 << DIGIT_BIT) - 1;
    #endif
                /* Bits of a[i] that did not fit start the next digit. */
                s = DIGIT_BIT - s;
                r->dp[++j] = ((mp_digit)a[i]) >> s;
                s = 32 - s;
            }
            else {
                s += 32;
            }
        }
        r->used = (3072 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#endif
    }

    return err;
}

/* Perform the modular exponentiation for Diffie-Hellman.
 *
 * Computes res = base ^ exp mod mod. The stack copies of the exponent and of
 * the base/result are zeroized before returning.
 *
 * base  Base. MP integer.
 * exp   Exponent. MP integer.
 * mod   Modulus. MP integer.
 * res   Result. MP integer.
 * returns 0 on success, MP_READ_E if base or exp has more than 3072 bits or
 * mod is not exactly 3072 bits, MP_VAL if mod is even, and otherwise the
 * error returned by the exponentiation or by the conversion into res.
 */
int sp_ModExp_3072(const mp_int* base, const mp_int* exp, const mp_int* mod,
    mp_int* res)
{
    int err = MP_OKAY;
    sp_digit b[192];
    sp_digit e[96];
    sp_digit m[96];
    sp_digit* r = b;    /* the result is written over the base */
    int expBits = mp_count_bits(exp);

    if (mp_count_bits(base) > 3072) {
        err = MP_READ_E;
    }
    else if (expBits > 3072) {
        err = MP_READ_E;
    }
    else if (mp_count_bits(mod) != 3072) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mod)) {
        err = MP_VAL;
    }

    if (err == MP_OKAY) {
        sp_3072_from_mp(b, 96, base);
        sp_3072_from_mp(e, 96, exp);
        sp_3072_from_mp(m, 96, mod);

        err = sp_3072_mod_exp_96(r, b, e, expBits, m, 0);
    }

    if (err == MP_OKAY) {
        err = sp_3072_to_mp(r, res);
    }

    /* ForceZero rather than XMEMSET: these arrays are dead after this point,
     * so a plain memset may be removed by the optimizer. e is the (private)
     * exponent; b/r held the result, which may itself be secret. */
    ForceZero(e, sizeof(e));
    ForceZero(b, sizeof(b));

    return err;
}

#ifdef WOLFSSL_HAVE_SP_DH

#ifdef HAVE_FFDHE_3072
/* Shift the 96-word number a left by n bits, producing 97 words in r.
 *
 * Fully unrolled. Words are handled from the most significant (a[95]) down
 * to a[0]; r[96] receives the bits shifted out of the top of a[95].
 *
 * r  Result. Written at byte offsets 0..384 (97 words).
 * a  Number to shift. Read at byte offsets 0..380 (96 words).
 * n  Number of bits to shift by.
 *    NOTE(review): the "31 - n" computation below only gives a meaningful
 *    result for n in 0..31 - confirm that callers never pass more.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_3072_lshift_96(sp_digit* r_p, const sp_digit* a_p, byte n_p)
#else
static void sp_3072_lshift_96(sp_digit* r, const sp_digit* a, byte n)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0-r2 so the asm below can name them directly. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register byte n __asm__ ("r2") = (byte)n_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r7 = 31 - n. Every "bits carried into the next word up" value is
         * formed as (w >> 1) >> r7, i.e. w >> (32 - n) done in two steps. */
        "RSB	r7, %[n], #0x1f\n\t"
        /* Top word: r[96] = a[95] >> (32 - n); r5 keeps a[95] << n until the
         * bits carried up from a[94] have been ORed in. */
        "LDR	r5, [%[a], #380]\n\t"
        "LSR	r6, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r6, r6, r7\n\t"
        /* Steady state, repeated for a[94] down to a[0], with r4/r5/r6
         * rotating as "previous", "current" and "next" word and r3 as
         * scratch:
         *   load the next lower word, store the word completed last time,
         *   r3 = next >> (32 - n), next <<= n, current |= r3. */
        "LDR	r4, [%[a], #376]\n\t"
        "STR	r6, [%[r], #384]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #372]\n\t"
        "STR	r5, [%[r], #380]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #368]\n\t"
        "STR	r4, [%[r], #376]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #364]\n\t"
        "STR	r6, [%[r], #372]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #360]\n\t"
        "STR	r5, [%[r], #368]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #356]\n\t"
        "STR	r4, [%[r], #364]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #352]\n\t"
        "STR	r6, [%[r], #360]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #348]\n\t"
        "STR	r5, [%[r], #356]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #344]\n\t"
        "STR	r4, [%[r], #352]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #340]\n\t"
        "STR	r6, [%[r], #348]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #336]\n\t"
        "STR	r5, [%[r], #344]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #332]\n\t"
        "STR	r4, [%[r], #340]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #328]\n\t"
        "STR	r6, [%[r], #336]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #324]\n\t"
        "STR	r5, [%[r], #332]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #320]\n\t"
        "STR	r4, [%[r], #328]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #316]\n\t"
        "STR	r6, [%[r], #324]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #312]\n\t"
        "STR	r5, [%[r], #320]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #308]\n\t"
        "STR	r4, [%[r], #316]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #304]\n\t"
        "STR	r6, [%[r], #312]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #300]\n\t"
        "STR	r5, [%[r], #308]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #296]\n\t"
        "STR	r4, [%[r], #304]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #292]\n\t"
        "STR	r6, [%[r], #300]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #288]\n\t"
        "STR	r5, [%[r], #296]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #284]\n\t"
        "STR	r4, [%[r], #292]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #280]\n\t"
        "STR	r6, [%[r], #288]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #276]\n\t"
        "STR	r5, [%[r], #284]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #272]\n\t"
        "STR	r4, [%[r], #280]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #268]\n\t"
        "STR	r6, [%[r], #276]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #264]\n\t"
        "STR	r5, [%[r], #272]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #260]\n\t"
        "STR	r4, [%[r], #268]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #256]\n\t"
        "STR	r6, [%[r], #264]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #252]\n\t"
        "STR	r5, [%[r], #260]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #248]\n\t"
        "STR	r4, [%[r], #256]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #244]\n\t"
        "STR	r6, [%[r], #252]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #240]\n\t"
        "STR	r5, [%[r], #248]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #236]\n\t"
        "STR	r4, [%[r], #244]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #232]\n\t"
        "STR	r6, [%[r], #240]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #228]\n\t"
        "STR	r5, [%[r], #236]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #224]\n\t"
        "STR	r4, [%[r], #232]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #220]\n\t"
        "STR	r6, [%[r], #228]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #216]\n\t"
        "STR	r5, [%[r], #224]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #212]\n\t"
        "STR	r4, [%[r], #220]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #208]\n\t"
        "STR	r6, [%[r], #216]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #204]\n\t"
        "STR	r5, [%[r], #212]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #200]\n\t"
        "STR	r4, [%[r], #208]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #196]\n\t"
        "STR	r6, [%[r], #204]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #192]\n\t"
        "STR	r5, [%[r], #200]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #188]\n\t"
        "STR	r4, [%[r], #196]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #184]\n\t"
        "STR	r6, [%[r], #192]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #180]\n\t"
        "STR	r5, [%[r], #188]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #176]\n\t"
        "STR	r4, [%[r], #184]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #172]\n\t"
        "STR	r6, [%[r], #180]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #168]\n\t"
        "STR	r5, [%[r], #176]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #164]\n\t"
        "STR	r4, [%[r], #172]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #160]\n\t"
        "STR	r6, [%[r], #168]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #156]\n\t"
        "STR	r5, [%[r], #164]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #152]\n\t"
        "STR	r4, [%[r], #160]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #148]\n\t"
        "STR	r6, [%[r], #156]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #144]\n\t"
        "STR	r5, [%[r], #152]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #140]\n\t"
        "STR	r4, [%[r], #148]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #136]\n\t"
        "STR	r6, [%[r], #144]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #132]\n\t"
        "STR	r5, [%[r], #140]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #128]\n\t"
        "STR	r4, [%[r], #136]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #124]\n\t"
        "STR	r6, [%[r], #132]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #120]\n\t"
        "STR	r5, [%[r], #128]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #116]\n\t"
        "STR	r4, [%[r], #124]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #112]\n\t"
        "STR	r6, [%[r], #120]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #108]\n\t"
        "STR	r5, [%[r], #116]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #104]\n\t"
        "STR	r4, [%[r], #112]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #100]\n\t"
        "STR	r6, [%[r], #108]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #96]\n\t"
        "STR	r5, [%[r], #104]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #92]\n\t"
        "STR	r4, [%[r], #100]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #88]\n\t"
        "STR	r6, [%[r], #96]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #84]\n\t"
        "STR	r5, [%[r], #92]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #80]\n\t"
        "STR	r4, [%[r], #88]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #76]\n\t"
        "STR	r6, [%[r], #84]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #72]\n\t"
        "STR	r5, [%[r], #80]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #68]\n\t"
        "STR	r4, [%[r], #76]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #64]\n\t"
        "STR	r6, [%[r], #72]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #60]\n\t"
        "STR	r5, [%[r], #68]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #56]\n\t"
        "STR	r4, [%[r], #64]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #52]\n\t"
        "STR	r6, [%[r], #60]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #48]\n\t"
        "STR	r5, [%[r], #56]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #44]\n\t"
        "STR	r4, [%[r], #52]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #40]\n\t"
        "STR	r6, [%[r], #48]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #36]\n\t"
        "STR	r5, [%[r], #44]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #32]\n\t"
        "STR	r4, [%[r], #40]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #28]\n\t"
        "STR	r6, [%[r], #36]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #24]\n\t"
        "STR	r5, [%[r], #32]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #20]\n\t"
        "STR	r4, [%[r], #28]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #16]\n\t"
        "STR	r6, [%[r], #24]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #12]\n\t"
        "STR	r5, [%[r], #20]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #8]\n\t"
        "STR	r4, [%[r], #16]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #4]\n\t"
        "STR	r6, [%[r], #12]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a]]\n\t"
        "STR	r5, [%[r], #8]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        /* Lowest two words: r[0] = a[0] << n has nothing carried into it. */
        "STR	r6, [%[r]]\n\t"
        "STR	r4, [%[r], #4]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [n] "+r" (n)
        :
        : "memory", "r4", "r5", "r6", "r3", "r7", "cc"
    );
}

/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m)
 *
 * The exponent is consumed from its most significant end in windows of up to
 * five bits. For each window the accumulator is Montgomery-squared five times
 * and then shifted left by the window value; the word shifted out into r[96]
 * is folded back in by adding r[96] * norm, followed by a conditional
 * subtraction of m.
 *
 * r     A single precision number that is the result of the operation.
 *       Words r[96..191] are written as well, so r must have room for
 *       192 words.
 * e     A single precision number that is the exponent.
 * bits  The number of bits in the exponent.
 * m     A single precision number that is the modulus.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is zero.
 */
static int sp_3072_mod_exp_2_96(sp_digit* r, const sp_digit* e, int bits,
        const sp_digit* m)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[289];
#endif
    sp_digit* norm = NULL;
    sp_digit* tmp = NULL;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit o;
    sp_digit mask;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 289, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        norm = td;
        tmp = td + 192;

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_96(norm, m);

        /* Start at the top exponent word; c is the number of exponent bits
         * in that word. */
        i = (bits - 1) / 32;
        n = e[i--];
        c = bits & 31;
        if (c == 0) {
            c = 32;
        }
        /* First window takes bits % 5 bits so the remainder is a multiple
         * of five. */
        c -= bits % 5;
        if (c == 32) {
            c = 27;
        }
        if (c < 0) {
            /* Number of bits in top word is less than number needed. */
            c = -c;
            y = (byte)(n << c);
            n = e[i--];
            /* Words are 32 bits wide: take the top c bits of the next word.
             * (A shift count of 64 - c would exceed the word width.) */
            y |= (byte)(n >> (32 - c));
            n <<= c;
            c = 32 - c;
        }
        else if (c == 0) {
            /* All bits in top word used. */
            y = (byte)n;
        }
        else {
            y = (byte)(n >> c);
            n <<= 32 - c;
        }
        sp_3072_lshift_96(r, norm, y);
        for (; i>=0 || c>=5; ) {
            if (c == 0) {
                /* Current word exhausted: window is the top 5 bits of the
                 * next word. */
                n = e[i--];
                y = (byte)(n >> 27);
                n <<= 5;
                c = 27;
            }
            else if (c < 5) {
                /* Window straddles two exponent words. */
                y = (byte)(n >> 27);
                n = e[i--];
                c = 5 - c;
                y |= (byte)(n >> (32 - c));
                n <<= c;
                c = 32 - c;
            }
            else {
                y = (byte)((n >> 27) & 0x1f);
                n <<= 5;
                c -= 5;
            }

            sp_3072_mont_sqr_96(r, r, m, mp);
            sp_3072_mont_sqr_96(r, r, m, mp);
            sp_3072_mont_sqr_96(r, r, m, mp);
            sp_3072_mont_sqr_96(r, r, m, mp);
            sp_3072_mont_sqr_96(r, r, m, mp);

            /* Multiply by 2^y, then fold the overflow word r[96] back in. */
            sp_3072_lshift_96(r, r, y);
            sp_3072_mul_d_96(tmp, norm, r[96]);
            r[96] = 0;
            o = sp_3072_add_96(r, r, tmp);
            sp_3072_cond_sub_96(r, r, m, (sp_digit)0 - o);
        }

        XMEMSET(&r[96], 0, sizeof(sp_digit) * 96U);
        sp_3072_mont_reduce_96(r, m, mp);

        /* Final conditional subtraction when r >= m. */
        mask = (sp_digit)0 - (sp_3072_cmp_96(r, m) >= 0);
        sp_3072_cond_sub_96(r, r, m, mask);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#endif /* HAVE_FFDHE_3072 */

/* Diffie-Hellman modular exponentiation with a 3072-bit modulus.
 *
 * base     Base.
 * exp      Array of bytes that is the exponent.
 * expLen   Length of data, in bytes, in exponent. At most 384.
 * mod      Modulus. Must be exactly 3072 bits long and odd.
 * out      Buffer to hold big-endian bytes of exponentiation result.
 *          Must be at least 384 bytes long.
 * outLen   Length, in bytes, of exponentiation result once leading zero
 *          bytes have been removed.
 * returns 0 on success, MP_READ_E if an operand has an unsupported size,
 * MP_VAL if the modulus is even, or the error returned by the
 * exponentiation routine.
 */
int sp_DhExp_3072(const mp_int* base, const byte* exp, word32 expLen,
    const mp_int* mod, byte* out, word32* outLen)
{
    sp_digit b[192];
    sp_digit e[96];
    sp_digit m[96];
    sp_digit* r = b;
    word32 lead;
    int err = MP_OKAY;

    /* Size checks are evaluated left to right and stop at the first hit. */
    if ((mp_count_bits(base) > 3072) || (expLen > 384) ||
            (mp_count_bits(mod) != 3072)) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mod)) {
        err = MP_VAL;
    }

    if (err == MP_OKAY) {
        sp_3072_from_mp(b, 96, base);
        sp_3072_from_bin(e, 96, exp, expLen);
        sp_3072_from_mp(m, 96, mod);

    #ifdef HAVE_FFDHE_3072
        /* Base of 2 and a modulus whose top word is all ones: use the
         * dedicated base-2 routine. */
        if ((base->used == 1) && (base->dp[0] == 2) &&
                (m[95] == (sp_digit)-1)) {
            err = sp_3072_mod_exp_2_96(r, e, expLen * 8, m);
        }
        else
    #endif
        {
            err = sp_3072_mod_exp_96(r, b, e, expLen * 8, m, 0);
        }
    }

    if (err == MP_OKAY) {
        sp_3072_to_bin_96(r, out);
        *outLen = 384;

        /* Count leading zero bytes and slide the result to the front. */
        lead = 0;
        while ((lead < 384) && (out[lead] == 0)) {
            lead++;
        }
        *outLen -= lead;
        XMEMMOVE(out, out + lead, *outLen);
    }

    /* Clear the local copy of the exponent. */
    XMEMSET(e, 0, sizeof(e));

    return err;
}
#endif /* WOLFSSL_HAVE_SP_DH */

/* Modular exponentiation with a 1536-bit modulus. (res = base^exp mod mod)
 *
 * base  Base. MP integer of at most 1536 bits.
 * exp   Exponent. MP integer of at most 1536 bits.
 * mod   Modulus. MP integer of exactly 1536 bits; must be odd.
 * res   Result. MP integer.
 * returns 0 on success, MP_READ_E if an operand has an unsupported size,
 * MP_VAL if the modulus is even, or the error returned by the
 * exponentiation or conversion routine.
 */
int sp_ModExp_1536(const mp_int* base, const mp_int* exp, const mp_int* mod,
    mp_int* res)
{
    sp_digit b[96];
    sp_digit e[48];
    sp_digit m[48];
    sp_digit* r = b;
    int expBits = mp_count_bits(exp);
    int err = MP_OKAY;

    /* Size checks are evaluated left to right and stop at the first hit. */
    if ((mp_count_bits(base) > 1536) || (expBits > 1536) ||
            (mp_count_bits(mod) != 1536)) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mod)) {
        err = MP_VAL;
    }

    if (err == MP_OKAY) {
        sp_3072_from_mp(b, 48, base);
        sp_3072_from_mp(e, 48, exp);
        sp_3072_from_mp(m, 48, mod);

        err = sp_3072_mod_exp_48(r, b, e, expBits, m, 0);

        if (err == MP_OKAY) {
            /* Zero the upper half of the buffer before converting. */
            XMEMSET(r + 48, 0, sizeof(*r) * 48U);
            err = sp_3072_to_mp(r, res);
            res->used = mod->used;
            mp_clamp(res);
        }
    }

    /* Clear the local copy of the exponent. */
    XMEMSET(e, 0, sizeof(e));

    return err;
}

#endif /* WOLFSSL_HAVE_SP_DH | (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) */

#endif /* !WOLFSSL_SP_NO_3072 */

#ifdef WOLFSSL_SP_4096
/* Read big endian unsigned byte array into r.
 *
 * Bytes are packed four to a word, starting from the end of the array (the
 * least significant byte) so that r[0] is the least significant word.
 *
 * r     A single precision integer.
 * size  Number of words in r. Words not filled from a are set to zero.
 * a     Byte array, most significant byte first.
 * n     Number of bytes in array to read.
 */
static void sp_4096_from_bin(sp_digit* r, int size, const byte* a, int n)
{
    int i;
    int j = 0;
    int k;
    sp_digit top;

    /* Whole words: four bytes at a time from the least significant end. */
    for (i = n - 1; i >= 3; i -= 4) {
        r[j]  = ((sp_digit)a[i - 0] <<  0) |
                ((sp_digit)a[i - 1] <<  8) |
                ((sp_digit)a[i - 2] << 16) |
                ((sp_digit)a[i - 3] << 24);
        j++;
    }

    /* One to three leading bytes a[0..i] remain; a[0] is the most
     * significant. Accumulating with shifts keeps the byte order correct
     * regardless of the host's endianness. */
    if (i >= 0) {
        top = 0;
        for (k = 0; k <= i; k++) {
            top = (top << 8) | (sp_digit)a[k];
        }
        r[j] = top;
        j++;
    }

    /* Zero the remaining words. */
    for (; j < size; j++) {
        r[j] = 0;
    }
}

/* Convert an mp_int to an array of sp_digit.
 *
 * One of three implementations is selected at compile time, depending on
 * whether DIGIT_BIT is equal to, greater than or less than 32. In every
 * variant the words of r that are not filled from a are set to zero.
 *
 * r  A single precision integer.
 * size  Maximum number of words of r to fill.
 * a  A multi-precision integer.
 */
static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a)
{
#if DIGIT_BIT == 32
    int i;
    /* j starts at -used and counts up; with a 32-bit sp_digit its top bit
     * stays set while i < a->used. */
    sp_digit j = (sp_digit)0 - (sp_digit)a->used;
    int o = 0;

    /* Always runs for size iterations with no branch on a->used. */
    for (i = 0; i < size; i++) {
        /* All ones while word i is within a->used, otherwise zero. */
        sp_digit mask = (sp_digit)0 - (j >> 31);
        r[i] = a->dp[o] & mask;
        j++;
        /* Advance the read index only while more used digits remain. */
        o += (int)(j >> 31);
    }
#elif DIGIT_BIT > 32
    unsigned int i;
    int j = 0;
    /* s is the bit offset used for the shifts of the current digit. */
    word32 s = 0;

    r[0] = 0;
    for (i = 0; i < (unsigned int)a->used && j < size; i++) {
        r[j] |= ((sp_digit)a->dp[i] << s);
        r[j] &= 0xffffffff;
        s = 32U - s;
        if (j + 1 >= size) {
            break;
        }
        /* lint allow cast of mismatch word32 and mp_digit */
        r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
        /* Emit further whole 32-bit words held in this digit. */
        while ((s + 32U) <= (word32)DIGIT_BIT) {
            s += 32U;
            r[j] &= 0xffffffff;
            if (j + 1 >= size) {
                break;
            }
            if (s < (word32)DIGIT_BIT) {
                /* lint allow cast of mismatch word32 and mp_digit */
                r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
            }
            else {
                r[++j] = (sp_digit)0;
            }
        }
        s = (word32)DIGIT_BIT - s;
    }

    /* Zero the words above the last one written. */
    for (j++; j < size; j++) {
        r[j] = 0;
    }
#else
    unsigned int i;
    int j = 0;
    /* s is the bit position within r[j] at which the next digit is placed. */
    int s = 0;

    r[0] = 0;
    for (i = 0; i < (unsigned int)a->used && j < size; i++) {
        r[j] |= ((sp_digit)a->dp[i]) << s;
        if (s + DIGIT_BIT >= 32) {
            /* Word r[j] is full; any leftover bits of this digit start the
             * next word. */
            r[j] &= 0xffffffff;
            if (j + 1 >= size) {
                break;
            }
            s = 32 - s;
            if (s == DIGIT_BIT) {
                r[++j] = 0;
                s = 0;
            }
            else {
                r[++j] = a->dp[i] >> s;
                s = DIGIT_BIT - s;
            }
        }
        else {
            s += DIGIT_BIT;
        }
    }

    /* Zero the words above the last one written. */
    for (j++; j < size; j++) {
        r[j] = 0;
    }
#endif
}

/* Serialize r as a big-endian byte string.
 * Always writes exactly 512 bytes (128 words of four bytes each).
 *
 * r  A single precision integer; r[0] is the least significant word.
 * a  Byte array that receives the output, most significant byte first.
 */
static void sp_4096_to_bin_128(sp_digit* r, byte* a)
{
    byte* p = a;
    int w;

    /* Walk the words from most to least significant. */
    for (w = 128; w-- > 0; ) {
        sp_digit v = r[w];

        *p++ = (byte)(v >> 24);
        *p++ = (byte)(v >> 16);
        *p++ = (byte)(v >> 8);
        *p++ = (byte)(v >> 0);
    }
}

#if (defined(WOLFSSL_HAVE_SP_RSA) && (!defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(WOLFSSL_SP_SMALL))) || defined(WOLFSSL_HAVE_SP_DH)
/* Normalize the values in each word to 32 bits.
 * The macro expands to nothing in this implementation.
 *
 * a  Array of sp_digit to normalize.
 */
#define sp_4096_norm_128(a)

#endif /* (WOLFSSL_HAVE_SP_RSA && (!WOLFSSL_RSA_PUBLIC_ONLY || !WOLFSSL_SP_SMALL)) || WOLFSSL_HAVE_SP_DH */
/* Normalize the values in each word to 32 bits.
 * The macro expands to nothing in this implementation. This definition is
 * unconditional and identical to the conditional one directly above.
 *
 * a  Array of sp_digit to normalize.
 */
#define sp_4096_norm_128(a)

#ifndef WOLFSSL_SP_SMALL
/* Sub b from a into a. (a -= b)
 *
 * Fully unrolled: 32 groups of four words (128 words in total) are loaded
 * from a and b, subtracted with the borrow carried from group to group in
 * the carry flag, and stored back to a.
 *
 * a  A single precision integer and result.
 * b  A single precision integer.
 * returns  0 when there is no borrow out of the top word, otherwise a word
 *          with all bits set.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_4096_sub_in_place_128(sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_4096_sub_in_place_128(sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0 and r1 for the assembly below. */
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Words 0-31. The first SUBS starts the borrow chain; every later
         * subtraction is SBCS so the borrow continues. a is loaded without
         * write-back and advanced by the STM. */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SUBS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 32-63. */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 64-95. */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 96-127. */
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* r9 - r9 - borrow: turns the final borrow into 0 or all ones, left
         * in the register that held a. */
        "SBC	%[a], r9, r9\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
    );
    /* The register that held the pointer now holds the borrow mask. */
    return (word32)(size_t)a;
}

/* Add b to a into r. (r = a + b)
 *
 * Fully unrolled: 32 groups of four words (128 words in total) are loaded
 * from a and b, added with the carry passed from group to group in the
 * carry flag, and stored to r.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns  the carry out of the top word: 0 or 1.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_4096_add_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_4096_add_128(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0, r1 and r2 for the assembly below. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Words 0-31. The first ADDS starts the carry chain; every later
         * addition is ADCS so the carry continues. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADDS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 32-63. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 64-95. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 96-127. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "ADCS	r3, r3, r7\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Capture the final carry flag as 0 or 1 in the register that held
         * r. MOV without the S suffix leaves the flags untouched. */
        "MOV	%[r], #0x0\n\t"
        "ADC	%[r], %[r], #0x0\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    /* The register that held the pointer now holds the carry. */
    return (word32)(size_t)r;
}

/* Multiply a and b into r. (r = a * b)
 *
 * Built from three 64-word multiplications (Karatsuba-style): low halves,
 * high halves, and the sums of the halves of each operand.
 *
 * r  A single precision integer. Receives the product; words up to r[255]
 *    are written.
 * a  A single precision integer (128 words).
 * b  A single precision integer (128 words).
 */
SP_NOINLINE static void sp_4096_mul_128(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit* lo = r;          /* low-half product, r[0..127]    */
    sp_digit* hi = r + 128;    /* high-half product, r[128..255] */
    sp_digit mid[128];         /* product of the half sums       */
    sp_digit sumA[64];         /* a low half + a high half       */
    sp_digit sumB[64];         /* b low half + b high half       */
    sp_digit carryA;
    sp_digit carryB;
    sp_digit top;              /* accumulated carries and borrows */

    carryA = sp_2048_add_64(sumA, a, a + 64);
    carryB = sp_2048_add_64(sumB, b, b + 64);
    top = carryA & carryB;

    sp_2048_mul_64(hi, a + 64, b + 64);
    sp_2048_mul_64(lo, a, b);
    sp_2048_mul_64(mid, sumA, sumB);

    /* Remove the low and high products from the middle term. */
    top += sp_4096_sub_in_place_128(mid, lo);
    top += sp_4096_sub_in_place_128(mid, hi);
    /* Account for the carries out of the half sums: add each sum, masked
     * by the other operand's carry, into the upper half of the middle term. */
    sp_2048_mask_64(sumA, sumA, 0 - carryB);
    top += sp_2048_add_64(mid + 64, mid + 64, sumA);
    sp_2048_mask_64(sumB, sumB, 0 - carryA);
    top += sp_2048_add_64(mid + 64, mid + 64, sumB);

    /* Add the middle term at word offset 64, then propagate the
     * accumulated carry into the top 64 words. */
    top += sp_4096_add_128(r + 64, r + 64, mid);
    XMEMSET(sumA + 1, 0, sizeof(sp_digit) * (64 - 1));
    sumA[0] = top;
    (void)sp_2048_add_64(r + 192, r + 192, sumA);
}

/* Square a and put result in r. (r = a * a)
 *
 * Built from three 64-word squarings: the low half, the high half, and the
 * absolute difference of the two halves.
 *
 * r  A single precision integer. Receives the square; words up to r[255]
 *    are written.
 * a  A single precision integer (128 words).
 */
SP_NOINLINE static void sp_4096_sqr_128(sp_digit* r, const sp_digit* a)
{
    sp_digit* lo = r;          /* square of low half, r[0..127]    */
    sp_digit* hi = r + 128;    /* square of high half, r[128..255] */
    sp_digit mid[128];         /* square of the half difference    */
    sp_digit* diff = mid;      /* difference shares mid's storage  */
    sp_digit zeros[64];
    sp_digit carry;
    sp_digit neg;
    sp_digit* minuend;
    sp_digit* subtrahend;

    XMEMSET(zeros, 0, sizeof(sp_digit) * 64);

    /* diff = low - high; neg is the borrow returned by the subtraction. */
    neg = sp_2048_sub_64(diff, a, a + 64);
    /* Select the operands without branching: (diff - 0) when neg is zero,
     * (0 - diff) when neg is all ones, so diff ends up non-negative. */
    minuend    = (sp_digit*)(((sp_digit)zeros &   neg ) |
                             ((sp_digit)diff  & (~neg)));
    subtrahend = (sp_digit*)(((sp_digit)zeros & (~neg)) |
                             ((sp_digit)diff  &   neg ));
    (void)sp_2048_sub_64(diff, minuend, subtrahend);

    sp_2048_sqr_64(hi, a + 64);
    sp_2048_sqr_64(lo, a);
    sp_2048_sqr_64(mid, diff);

    /* mid -= hi; mid -= lo; then subtract mid from the result at word
     * offset 64, tracking the borrows in carry. */
    carry = 0;
    carry -= sp_4096_sub_in_place_128(mid, hi);
    carry -= sp_4096_sub_in_place_128(mid, lo);
    carry += sp_4096_sub_in_place_128(r + 64, mid);
    /* Propagate the accumulated carry into the top 64 words. */
    zeros[0] = carry;
    (void)sp_2048_add_64(r + 192, r + 192, zeros);
}

#endif /* !WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Add b to a into r. (r = a + b)
 *
 * Loop version: each iteration adds four words and the loop ends when the
 * a pointer has advanced by 0x200 bytes (128 words). The carry is kept in
 * r3 between iterations.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns  the carry out of the top word: 0 or 1.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_4096_add_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_4096_add_128(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0, r1 and r2 for the assembly below. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r3 = saved carry (initially 0), r12 = end address of a. */
        "MOV	r3, #0x0\n\t"
        "ADD	r12, %[a], #0x200\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_4096_add_128_word:\n\t"
#else
    "L_sp_4096_add_128_word_%=:\n\t"
#endif
        /* Restore the carry flag from r3: 1 + 0xffffffff sets C, 0 clears. */
        "ADDS	r3, r3, #0xffffffff\n\t"
        "LDM	%[a]!, {r4, r5, r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9, r10, r11}\n\t"
        "ADCS	r4, r4, r8\n\t"
        "ADCS	r5, r5, r9\n\t"
        "ADCS	r6, r6, r10\n\t"
        "ADCS	r7, r7, r11\n\t"
        "STM	%[r]!, {r4, r5, r6, r7}\n\t"
        /* Save the carry flag into r3 as 0 or 1 before CMP changes flags. */
        "MOV	r4, #0x0\n\t"
        "ADC	r3, r4, #0x0\n\t"
        "CMP	%[a], r12\n\t"
#if defined(__GNUC__)
        "BNE	L_sp_4096_add_128_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BNE.N	L_sp_4096_add_128_word\n\t"
#else
        "BNE.N	L_sp_4096_add_128_word_%=\n\t"
#endif
        /* Return the final carry in the register that held r. */
        "MOV	%[r], r3\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "cc"
    );
    /* The register that held the pointer now holds the carry. */
    return (word32)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Sub b from a into a. (a -= b)
 *
 * Loop version: each iteration subtracts four words and the loop ends when
 * the a pointer has advanced by 0x200 bytes (128 words). The borrow is kept
 * in r10 between iterations as 0 or all ones.
 *
 * a  A single precision integer and result.
 * b  A single precision integer.
 * returns  0 when there is no borrow out of the top word, otherwise a word
 *          with all bits set.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_4096_sub_in_place_128(sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_4096_sub_in_place_128(sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0 and r1 for the assembly below. */
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r10 = saved borrow (initially 0), r11 = end address of a. */
        "MOV	r10, #0x0\n\t"
        "ADD	r11, %[a], #0x200\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_4096_sub_in_pkace_128_word:\n\t"
#else
    "L_sp_4096_sub_in_pkace_128_word_%=:\n\t"
#endif
        /* Restore the carry flag from r10: 0 - 0 leaves C set (no borrow),
         * 0 - 0xffffffff clears C (borrow pending). */
        "RSBS	r10, r10, #0x0\n\t"
        "LDM	%[a], {r2, r3, r4, r5}\n\t"
        "LDM	%[b]!, {r6, r7, r8, r9}\n\t"
        "SBCS	r2, r2, r6\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "STM	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Save the borrow into r10 as 0 or all ones before CMP changes
         * the flags. */
        "SBC	r10, r10, r10\n\t"
        "CMP	%[a], r11\n\t"
#if defined(__GNUC__)
        "BNE	L_sp_4096_sub_in_pkace_128_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BNE.N	L_sp_4096_sub_in_pkace_128_word\n\t"
#else
        "BNE.N	L_sp_4096_sub_in_pkace_128_word_%=\n\t"
#endif
        /* Return the final borrow mask in the register that held a. */
        "MOV	%[a], r10\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc"
    );
    /* The register that held the pointer now holds the borrow mask. */
    return (word32)(size_t)a;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * Each of the 256 result words is computed in turn by summing every product
 * a[i] * b[j] whose indices add up to that word's index. The result is built
 * in a 0x400 byte buffer on the stack and copied to r only after all of a and
 * b have been read.
 *
 * r  A single precision integer. Receives 256 words.
 * a  A single precision integer. 128 words are read.
 * b  A single precision integer. 128 words are read.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_4096_mul_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static void sp_4096_mul_128(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to the registers named in the declarations. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Reserve 0x400 bytes (256 words) of stack for the result. */
        "SUB	sp, sp, #0x400\n\t"
        /* Result word 0 is the low half of a[0] * b[0]; the high half starts
         * the accumulator in r6. */
        "LDR	lr, [%[a]]\n\t"
        "LDR	r11, [%[b]]\n\t"
        "UMULL	r8, r6, lr, r11\n\t"
        "STR	r8, [sp]\n\t"
        /* r8:r7:r6 is the three word accumulator for the current result
         * word. */
        "MOV	r7, #0x0\n\t"
        "MOV	r8, #0x0\n\t"
        /* r5 = byte offset of the result word being computed. */
        "MOV	r5, #0x4\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_4096_mul_128_outer:\n\t"
#else
    "L_sp_4096_mul_128_outer_%=:\n\t"
#endif
        /* r3 = r5 - 0x1fc, or 0 when that would be negative; r4 = r5 - r3.
         * These are the lowest and highest operand offsets that contribute
         * to this result word (0x1fc is the offset of the last word). */
        "SUBS	r3, r5, #0x1fc\n\t"
        "IT	cc\n\t"
        "MOVCC	r3, #0x0\n\t"
        "SUB	r4, r5, r3\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_4096_mul_128_inner:\n\t"
#else
    "L_sp_4096_mul_128_inner_%=:\n\t"
#endif
        /* Accumulate a[r3] * b[r4]. */
        "LDR	lr, [%[a], r3]\n\t"
        "LDR	r11, [%[b], r4]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        /* Accumulate the mirrored product a[r4] * b[r3]. */
        "LDR	lr, [%[a], r4]\n\t"
        "LDR	r11, [%[b], r3]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        /* Move the two offsets towards each other. */
        "ADD	r3, r3, #0x4\n\t"
        "SUB	r4, r4, #0x4\n\t"
        "CMP	r3, r4\n\t"
        /* Offsets have crossed: every product for this word has been
         * added. */
#if defined(__GNUC__)
        "BGT	L_sp_4096_mul_128_inner_done_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_4096_mul_128_inner_done\n\t"
#else
        "BGT.N	L_sp_4096_mul_128_inner_done_%=\n\t"
#endif
        /* Offsets have not met yet: another pair of products to add. */
#if defined(__GNUC__)
        "BLT	L_sp_4096_mul_128_inner_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_4096_mul_128_inner\n\t"
#else
        "BLT.N	L_sp_4096_mul_128_inner_%=\n\t"
#endif
        /* Offsets are equal: add the single middle product a[r3] * b[r3]. */
        "LDR	lr, [%[a], r3]\n\t"
        "LDR	r11, [%[b], r3]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_4096_mul_128_inner_done:\n\t"
#else
    "L_sp_4096_mul_128_inner_done_%=:\n\t"
#endif
        /* Store the finished word and shift the accumulator down a word. */
        "STR	r6, [sp, r5]\n\t"
        "MOV	r6, r7\n\t"
        "MOV	r7, r8\n\t"
        "MOV	r8, #0x0\n\t"
        "ADD	r5, r5, #0x4\n\t"
        /* Loop for result words up to offset 0x3f4; the last two words are
         * produced below. */
        "CMP	r5, #0x3f4\n\t"
#if defined(__GNUC__)
        "BLE	L_sp_4096_mul_128_outer_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLE.N	L_sp_4096_mul_128_outer\n\t"
#else
        "BLE.N	L_sp_4096_mul_128_outer_%=\n\t"
#endif
        /* Top product a[127] * b[127] (offset 508) is added to r7:r6, giving
         * the final two result words. */
        "LDR	lr, [%[a], #508]\n\t"
        "LDR	r11, [%[b], #508]\n\t"
        "UMLAL	r6, r7, lr, r11\n\t"
        "STR	r6, [sp, r5]\n\t"
        "ADD	r5, r5, #0x4\n\t"
        "STR	r7, [sp, r5]\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_4096_mul_128_store:\n\t"
#else
    "L_sp_4096_mul_128_store_%=:\n\t"
#endif
        /* Copy the result from the stack to r, eight words (0x20 bytes) at a
         * time. r5 enters as 0x3fc, so this runs 32 times and the write-back
         * on sp releases the 0x400 bytes reserved at the top. */
        "LDM	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "STM	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "SUBS	r5, r5, #0x20\n\t"
#if defined(__GNUC__)
        "BGT	L_sp_4096_mul_128_store_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_4096_mul_128_store\n\t"
#else
        "BGT.N	L_sp_4096_mul_128_store_%=\n\t"
#endif
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
    );
}

/* Square a and put result in r. (r = a * a)
 *
 * Each of the 256 result words is computed in turn. A product of two
 * different words of a is computed once and added to the accumulator twice;
 * a product of a word with itself is added once. The result is built in a
 * 0x400 byte buffer on the stack and copied to r only after all of a has been
 * read.
 *
 * r  A single precision integer. Receives 256 words.
 * a  A single precision integer. 128 words are read.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_4096_sqr_128(sp_digit* r_p, const sp_digit* a_p)
#else
static void sp_4096_sqr_128(sp_digit* r, const sp_digit* a)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to the registers named in the declarations. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Reserve 0x400 bytes (256 words) of stack for the result. */
        "SUB	sp, sp, #0x400\n\t"
        /* Result word 0 is the low half of a[0] * a[0]; the high half starts
         * the accumulator in r6. */
        "LDR	lr, [%[a]]\n\t"
        "UMULL	r8, r6, lr, lr\n\t"
        "STR	r8, [sp]\n\t"
        /* r8:r7:r6 is the three word accumulator for the current result
         * word. */
        "MOV	r7, #0x0\n\t"
        "MOV	r8, #0x0\n\t"
        /* r5 = byte offset of the result word being computed. */
        "MOV	r5, #0x4\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_4096_sqr_128_outer:\n\t"
#else
    "L_sp_4096_sqr_128_outer_%=:\n\t"
#endif
        /* r3 = r5 - 0x1fc, or 0 when that would be negative; r4 = r5 - r3.
         * These are the lowest and highest offsets into a that contribute to
         * this result word (0x1fc is the offset of the last word). */
        "SUBS	r3, r5, #0x1fc\n\t"
        "IT	cc\n\t"
        "MOVCC	r3, #0x0\n\t"
        "SUB	r4, r5, r3\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_4096_sqr_128_inner:\n\t"
#else
    "L_sp_4096_sqr_128_inner_%=:\n\t"
#endif
        /* Compute a[r3] * a[r4] once ... */
        "LDR	lr, [%[a], r3]\n\t"
        "LDR	r11, [%[a], r4]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        /* ... and add it to the accumulator twice. */
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        /* Move the two offsets towards each other. */
        "ADD	r3, r3, #0x4\n\t"
        "SUB	r4, r4, #0x4\n\t"
        "CMP	r3, r4\n\t"
        /* Offsets have crossed: every product for this word has been
         * added. */
#if defined(__GNUC__)
        "BGT	L_sp_4096_sqr_128_inner_done_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_4096_sqr_128_inner_done\n\t"
#else
        "BGT.N	L_sp_4096_sqr_128_inner_done_%=\n\t"
#endif
        /* Offsets have not met yet: another product to add. */
#if defined(__GNUC__)
        "BLT	L_sp_4096_sqr_128_inner_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_4096_sqr_128_inner\n\t"
#else
        "BLT.N	L_sp_4096_sqr_128_inner_%=\n\t"
#endif
        /* Offsets are equal: add the square a[r3] * a[r3] once. */
        "LDR	lr, [%[a], r3]\n\t"
        "UMULL	r9, r10, lr, lr\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_4096_sqr_128_inner_done:\n\t"
#else
    "L_sp_4096_sqr_128_inner_done_%=:\n\t"
#endif
        /* Store the finished word and shift the accumulator down a word. */
        "STR	r6, [sp, r5]\n\t"
        "MOV	r6, r7\n\t"
        "MOV	r7, r8\n\t"
        "MOV	r8, #0x0\n\t"
        "ADD	r5, r5, #0x4\n\t"
        /* Loop for result words up to offset 0x3f4; the last two words are
         * produced below. */
        "CMP	r5, #0x3f4\n\t"
#if defined(__GNUC__)
        "BLE	L_sp_4096_sqr_128_outer_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLE.N	L_sp_4096_sqr_128_outer\n\t"
#else
        "BLE.N	L_sp_4096_sqr_128_outer_%=\n\t"
#endif
        /* Top square a[127] * a[127] (offset 508) is added to r7:r6, giving
         * the final two result words. */
        "LDR	lr, [%[a], #508]\n\t"
        "UMLAL	r6, r7, lr, lr\n\t"
        "STR	r6, [sp, r5]\n\t"
        "ADD	r5, r5, #0x4\n\t"
        "STR	r7, [sp, r5]\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_4096_sqr_128_store:\n\t"
#else
    "L_sp_4096_sqr_128_store_%=:\n\t"
#endif
        /* Copy the result from the stack to r, eight words (0x20 bytes) at a
         * time. r5 enters as 0x3fc, so this runs 32 times and the write-back
         * on sp releases the 0x400 bytes reserved at the top. */
        "LDM	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "STM	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "SUBS	r5, r5, #0x20\n\t"
#if defined(__GNUC__)
        "BGT	L_sp_4096_sqr_128_store_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_4096_sqr_128_store\n\t"
#else
        "BGT.N	L_sp_4096_sqr_128_store_%=\n\t"
#endif
        : [r] "+r" (r), [a] "+r" (a)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
    );
}

#endif /* WOLFSSL_SP_SMALL */
/* Work out the bottom digit of -1/a mod 2^n.
 *
 * Only the least significant word of a is used. An inverse that is correct
 * to 4 bits is formed directly and then refined three times, each refinement
 * doubling the number of correct low bits (8, 16, 32).
 *
 * a    A single precision number; a[0] is the only word read.
 * rho  Receives the bottom word of the negated inverse.
 */
static void sp_4096_mont_setup(const sp_digit* a, sp_digit* rho)
{
    const sp_digit low = a[0];
    sp_digit inv;
    int step;

    /* Starting value: inv * low == 1 mod 2**4. */
    inv = low + (((low + 2) & 4) << 1);

    /* After the three steps: inv * low == 1 mod 2**32. */
    for (step = 0; step < 3; step++) {
        inv *= 2 - low * inv;
    }

    /* Negate the inverse to give -1/a[0]. */
    *rho = (sp_digit)0 - inv;
}

#ifdef WOLFSSL_SP_SMALL
/* Multiply a by the single digit b into r. (r = a * b)
 *
 * Loops over the 128 words of a, one word per iteration, and writes 129
 * words to r: the 128 product words plus the final carry word at r[128].
 *
 * r  A single precision integer. Receives 129 words.
 * a  A single precision integer. 128 words are read.
 * b  A single precision digit.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_4096_mul_d_128(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
#else
static void sp_4096_mul_d_128(sp_digit* r, const sp_digit* a, sp_digit b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to the registers named in the declarations. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* A[0] * B */
        /* Low half goes straight to r[0]; the high half starts the
         * accumulator r5:r4:r3. */
        "LDR	r8, [%[a]]\n\t"
        "UMULL	r5, r3, %[b], r8\n\t"
        "MOV	r4, #0x0\n\t"
        "STR	r5, [%[r]]\n\t"
        "MOV	r5, #0x0\n\t"
        /* r9 = byte offset of the word being processed. */
        "MOV	r9, #0x4\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_4096_mul_d_128_word:\n\t"
#else
    "L_sp_4096_mul_d_128_word_%=:\n\t"
#endif
        /* A[i] * B */
        "LDR	r8, [%[a], r9]\n\t"
        "UMULL	r6, r7, %[b], r8\n\t"
        "ADDS	r3, r3, r6\n\t"
        "ADCS	r4, r4, r7\n\t"
        "ADC	r5, r5, #0x0\n\t"
        /* Store the finished word and shift the accumulator down a word. */
        "STR	r3, [%[r], r9]\n\t"
        "MOV	r3, r4\n\t"
        "MOV	r4, r5\n\t"
        "MOV	r5, #0x0\n\t"
        "ADD	r9, r9, #0x4\n\t"
        /* Stop once all 0x200 bytes (128 words) of a have been used. */
        "CMP	r9, #0x200\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_4096_mul_d_128_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_4096_mul_d_128_word\n\t"
#else
        "BLT.N	L_sp_4096_mul_d_128_word_%=\n\t"
#endif
        /* Remaining carry word becomes r[128]. */
        "STR	r3, [%[r], #512]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
    );
}

#else
/* Multiply a by the single digit b into r. (r = a * b)
 *
 * Fully unrolled: one load / multiply-accumulate / store step for each of the
 * 128 words of a. 129 words are written to r: the 128 product words plus the
 * final carry word.
 *
 * The roles of r3, r4 and r5 rotate from step to step. At each step one of
 * them holds the carry coming in and takes the low word of the sum, which is
 * stored; the next one, zeroed in the step before, takes the high word and
 * is the carry into the following step; the register just stored is then
 * zeroed for reuse.
 *
 * r  A single precision integer. Receives 129 words.
 * a  A single precision integer. 128 words are read.
 * b  A single precision digit.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_4096_mul_d_128(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p)
#else
static void sp_4096_mul_d_128(sp_digit* r, const sp_digit* a, sp_digit b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to the registers named in the declarations. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register sp_digit b __asm__ ("r2") = (sp_digit)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* A[0] * B */
        /* First step uses UMULL as there is no carry in yet. The LDM and STM
         * write-backs step a and r on by one word each time. */
        "LDM	%[a]!, {r8}\n\t"
        "UMULL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[1] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[2] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[3] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[4] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[5] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[6] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[7] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[8] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[9] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[10] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[11] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[12] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[13] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[14] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[15] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[16] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[17] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[18] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[19] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[20] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[21] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[22] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[23] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[24] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[25] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[26] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[27] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[28] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[29] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[30] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[31] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[32] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[33] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[34] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[35] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[36] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[37] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[38] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[39] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[40] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[41] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[42] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[43] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[44] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[45] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[46] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[47] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[48] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[49] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[50] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[51] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[52] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[53] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[54] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[55] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[56] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[57] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[58] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[59] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[60] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[61] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[62] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[63] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[64] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[65] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[66] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[67] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[68] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[69] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[70] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[71] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[72] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[73] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[74] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[75] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[76] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[77] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[78] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[79] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[80] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[81] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[82] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[83] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[84] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[85] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[86] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[87] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[88] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[89] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[90] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[91] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[92] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[93] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[94] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[95] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[96] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[97] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[98] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[99] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[100] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[101] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[102] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[103] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[104] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[105] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[106] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[107] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[108] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[109] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[110] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[111] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[112] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[113] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[114] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[115] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[116] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[117] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[118] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[119] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[120] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[121] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[122] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[123] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[124] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        "MOV	r3, #0x0\n\t"
        /* A[125] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r5, r3, %[b], r8\n\t"
        "STM	%[r]!, {r5}\n\t"
        "MOV	r4, #0x0\n\t"
        /* A[126] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r3, r4, %[b], r8\n\t"
        "STM	%[r]!, {r3}\n\t"
        "MOV	r5, #0x0\n\t"
        /* A[127] * B */
        "LDM	%[a]!, {r8}\n\t"
        "UMLAL	r4, r5, %[b], r8\n\t"
        "STM	%[r]!, {r4}\n\t"
        /* Final carry word becomes the 129th word of r. */
        "STR	r5, [%[r]]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "cc"
    );
}

#endif /* WOLFSSL_SP_SMALL */
#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
/* Compute r = 2^n mod m, where n is the number of bits to reduce by.
 * As m must be a full 4096 bits, one subtraction is all that is needed.
 *
 * r  A single precision number. Receives the 128 word result.
 * m  A single precision number. 128 words are read.
 */
static void sp_4096_mont_norm_128(sp_digit* r, const sp_digit* m)
{
    /* Begin with all 128 words of r set to zero. */
    XMEMSET(r, 0, 128 * sizeof(*r));

    /* r = 0 - m over 128 words; the borrow out is deliberately dropped. */
    (void)sp_4096_sub_in_place_128(r, m);
}

#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */
#ifdef WOLFSSL_SP_SMALL
/* Conditionally subtract b from a using the mask m.
 * m is -1 (all bits set) to subtract and 0 to leave the value unchanged
 * (r = a). Every word of b is ANDed with m before being subtracted, so the
 * same instructions run for either value of m.
 *
 * Loops over 0x200 bytes (128 words), one word per iteration.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 * returns 0 when there is no borrow out of the top word and all bits set (-1)
 * when there is.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_4096_cond_sub_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
#else
static sp_digit sp_4096_cond_sub_128(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to the registers named in the declarations. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r8 = constant zero, r4 = borrow from the previous word (0 or -1),
         * r5 = byte offset of the current word. */
        "MOV	r8, #0x0\n\t"
        "MOV	r4, #0x0\n\t"
        "MOV	r5, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_4096_cond_sub_128_words:\n\t"
#else
    "L_sp_4096_cond_sub_128_words_%=:\n\t"
#endif
        /* Put the saved borrow back into the carry flag: 0 - r4 leaves the
         * carry set when r4 is 0 (no borrow) and clear when r4 is -1. */
        "SUBS	r4, r8, r4\n\t"
        "LDR	r6, [%[a], r5]\n\t"
        "LDR	r7, [%[b], r5]\n\t"
        /* Mask the word of b: unchanged when m is -1, zero when m is 0. */
        "AND	r7, r7, %[m]\n\t"
        "SBCS	r6, r6, r7\n\t"
        /* Save the borrow before CMP overwrites the flags: r4 = 0 when the
         * carry is set, -1 when it is clear. */
        "SBC	r4, r8, r8\n\t"
        "STR	r6, [%[r], r5]\n\t"
        "ADD	r5, r5, #0x4\n\t"
        "CMP	r5, #0x200\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_4096_cond_sub_128_words_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_4096_cond_sub_128_words\n\t"
#else
        "BLT.N	L_sp_4096_cond_sub_128_words_%=\n\t"
#endif
        /* Hand the final borrow back in the register that held r. */
        "MOV	%[r], r4\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        : "memory", "r4", "r5", "r6", "r7", "r8", "cc"
    );
    /* r no longer holds a pointer: the last instruction replaced it with the
     * borrow. */
    return (word32)(size_t)r;
}

#else
/* Conditionally subtract b from a using the mask m.
 * m is -1 to subtract and 0 when not subtracting.
 *
 * Fully unrolled form: 64 groups of two 32-bit words (128 words in total).
 * Every word of b is ANDed with m before it is subtracted and there are no
 * branches. The carry flag carries the borrow from one group to the next.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 *
 * returns 0 when the subtraction did not borrow out of the top word and
 * all ones (-1) when it did.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_4096_cond_sub_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
#else
static sp_digit sp_4096_cond_sub_128(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0-r3 so the assembly operands use fixed
     * registers. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    /* NOTE(review): r4 appears in the clobber list below but none of the
     * instructions reference it. */
    __asm__ __volatile__ (
        /* r5 = constant zero, used only by the final SBC */
        "MOV	r5, #0x0\n\t"
        /* Words 0-1: SUBS starts the borrow chain; LDM/STM with '!' advance
         * the a, b and r pointers by two words each time */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SUBS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 2-3: from here on SBCS continues the borrow chain */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 4-5 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 6-7 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 8-9 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 10-11 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 12-13 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 14-15 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 16-17 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 18-19 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 20-21 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 22-23 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 24-25 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 26-27 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 28-29 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 30-31 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 32-33 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 34-35 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 36-37 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 38-39 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 40-41 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 42-43 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 44-45 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 46-47 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 48-49 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 50-51 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 52-53 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 54-55 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 56-57 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 58-59 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 60-61 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 62-63 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 64-65 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 66-67 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 68-69 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 70-71 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 72-73 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 74-75 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 76-77 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 78-79 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 80-81 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 82-83 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 84-85 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 86-87 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 88-89 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 90-91 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 92-93 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 94-95 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 96-97 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 98-99 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 100-101 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 102-103 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 104-105 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 106-107 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 108-109 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 110-111 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 112-113 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 114-115 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 116-117 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 118-119 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 120-121 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 122-123 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 124-125 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 126-127 */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "SBCS	r6, r6, r8\n\t"
        "SBCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* 0 - 0 - borrow: the r operand becomes 0 (no borrow) or -1 (borrow) */
        "SBC	%[r], r5, r5\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
    );
    /* r no longer holds a pointer here; it carries the borrow value. */
    return (word32)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_NO_UMAAL
#ifndef WOLFSSL_SP_SMALL
/* Reduce the number back to 4096 bits using Montgomery reduction.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
#else
SP_NOINLINE static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        "LDR	lr, [%[m]]\n\t"
        /* i = 0 */
        "MOV	r11, #0x0\n\t"
        "MOV	r3, #0x0\n\t"
        "LDR	r4, [%[a]]\n\t"
        "LDR	r5, [%[a], #4]\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_4096_mont_reduce_128_word:\n\t"
#else
    "L_sp_4096_mont_reduce_128_word_%=:\n\t"
#endif
        /* mu = a[i] * mp */
        "MUL	r10, %[mp], r4\n\t"
        /* a[i+0] += m[0] * mu */
        "MOV	r7, #0x0\n\t"
        "UMLAL	r4, r7, r10, lr\n\t"
        /* a[i+1] += m[1] * mu */
        "LDR	r9, [%[m], #4]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r5, r6, r10, r9\n\t"
        "MOV	r4, r5\n\t"
        "ADDS	r4, r4, r7\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+2] += m[2] * mu */
        "LDR	r9, [%[m], #8]\n\t"
        "LDR	r5, [%[a], #8]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r5, r7, r10, r9\n\t"
        "ADDS	r5, r5, r6\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+3] += m[3] * mu */
        "LDR	r9, [%[m], #12]\n\t"
        "LDR	r12, [%[a], #12]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #12]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+4] += m[4] * mu */
        "LDR	r9, [%[m], #16]\n\t"
        "LDR	r12, [%[a], #16]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #16]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+5] += m[5] * mu */
        "LDR	r9, [%[m], #20]\n\t"
        "LDR	r12, [%[a], #20]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #20]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+6] += m[6] * mu */
        "LDR	r9, [%[m], #24]\n\t"
        "LDR	r12, [%[a], #24]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #24]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+7] += m[7] * mu */
        "LDR	r9, [%[m], #28]\n\t"
        "LDR	r12, [%[a], #28]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #28]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+8] += m[8] * mu */
        "LDR	r9, [%[m], #32]\n\t"
        "LDR	r12, [%[a], #32]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #32]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+9] += m[9] * mu */
        "LDR	r9, [%[m], #36]\n\t"
        "LDR	r12, [%[a], #36]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #36]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+10] += m[10] * mu */
        "LDR	r9, [%[m], #40]\n\t"
        "LDR	r12, [%[a], #40]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #40]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+11] += m[11] * mu */
        "LDR	r9, [%[m], #44]\n\t"
        "LDR	r12, [%[a], #44]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #44]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+12] += m[12] * mu */
        "LDR	r9, [%[m], #48]\n\t"
        "LDR	r12, [%[a], #48]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #48]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+13] += m[13] * mu */
        "LDR	r9, [%[m], #52]\n\t"
        "LDR	r12, [%[a], #52]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #52]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+14] += m[14] * mu */
        "LDR	r9, [%[m], #56]\n\t"
        "LDR	r12, [%[a], #56]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #56]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+15] += m[15] * mu */
        "LDR	r9, [%[m], #60]\n\t"
        "LDR	r12, [%[a], #60]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #60]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+16] += m[16] * mu */
        "LDR	r9, [%[m], #64]\n\t"
        "LDR	r12, [%[a], #64]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #64]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+17] += m[17] * mu */
        "LDR	r9, [%[m], #68]\n\t"
        "LDR	r12, [%[a], #68]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #68]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+18] += m[18] * mu */
        "LDR	r9, [%[m], #72]\n\t"
        "LDR	r12, [%[a], #72]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #72]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+19] += m[19] * mu */
        "LDR	r9, [%[m], #76]\n\t"
        "LDR	r12, [%[a], #76]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #76]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+20] += m[20] * mu */
        "LDR	r9, [%[m], #80]\n\t"
        "LDR	r12, [%[a], #80]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #80]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+21] += m[21] * mu */
        "LDR	r9, [%[m], #84]\n\t"
        "LDR	r12, [%[a], #84]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #84]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+22] += m[22] * mu */
        "LDR	r9, [%[m], #88]\n\t"
        "LDR	r12, [%[a], #88]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #88]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+23] += m[23] * mu */
        "LDR	r9, [%[m], #92]\n\t"
        "LDR	r12, [%[a], #92]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #92]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+24] += m[24] * mu */
        "LDR	r9, [%[m], #96]\n\t"
        "LDR	r12, [%[a], #96]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #96]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+25] += m[25] * mu */
        "LDR	r9, [%[m], #100]\n\t"
        "LDR	r12, [%[a], #100]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #100]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+26] += m[26] * mu */
        "LDR	r9, [%[m], #104]\n\t"
        "LDR	r12, [%[a], #104]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #104]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+27] += m[27] * mu */
        "LDR	r9, [%[m], #108]\n\t"
        "LDR	r12, [%[a], #108]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #108]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+28] += m[28] * mu */
        "LDR	r9, [%[m], #112]\n\t"
        "LDR	r12, [%[a], #112]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #112]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+29] += m[29] * mu */
        "LDR	r9, [%[m], #116]\n\t"
        "LDR	r12, [%[a], #116]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #116]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+30] += m[30] * mu */
        "LDR	r9, [%[m], #120]\n\t"
        "LDR	r12, [%[a], #120]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #120]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+31] += m[31] * mu */
        "LDR	r9, [%[m], #124]\n\t"
        "LDR	r12, [%[a], #124]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #124]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+32] += m[32] * mu */
        "LDR	r9, [%[m], #128]\n\t"
        "LDR	r12, [%[a], #128]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #128]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+33] += m[33] * mu */
        "LDR	r9, [%[m], #132]\n\t"
        "LDR	r12, [%[a], #132]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #132]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+34] += m[34] * mu */
        "LDR	r9, [%[m], #136]\n\t"
        "LDR	r12, [%[a], #136]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #136]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+35] += m[35] * mu */
        "LDR	r9, [%[m], #140]\n\t"
        "LDR	r12, [%[a], #140]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #140]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+36] += m[36] * mu */
        "LDR	r9, [%[m], #144]\n\t"
        "LDR	r12, [%[a], #144]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #144]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+37] += m[37] * mu */
        "LDR	r9, [%[m], #148]\n\t"
        "LDR	r12, [%[a], #148]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #148]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+38] += m[38] * mu */
        "LDR	r9, [%[m], #152]\n\t"
        "LDR	r12, [%[a], #152]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #152]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+39] += m[39] * mu */
        "LDR	r9, [%[m], #156]\n\t"
        "LDR	r12, [%[a], #156]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #156]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+40] += m[40] * mu */
        "LDR	r9, [%[m], #160]\n\t"
        "LDR	r12, [%[a], #160]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #160]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+41] += m[41] * mu */
        "LDR	r9, [%[m], #164]\n\t"
        "LDR	r12, [%[a], #164]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #164]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+42] += m[42] * mu */
        "LDR	r9, [%[m], #168]\n\t"
        "LDR	r12, [%[a], #168]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #168]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+43] += m[43] * mu */
        "LDR	r9, [%[m], #172]\n\t"
        "LDR	r12, [%[a], #172]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #172]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+44] += m[44] * mu */
        "LDR	r9, [%[m], #176]\n\t"
        "LDR	r12, [%[a], #176]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #176]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+45] += m[45] * mu */
        "LDR	r9, [%[m], #180]\n\t"
        "LDR	r12, [%[a], #180]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #180]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+46] += m[46] * mu */
        "LDR	r9, [%[m], #184]\n\t"
        "LDR	r12, [%[a], #184]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #184]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+47] += m[47] * mu */
        "LDR	r9, [%[m], #188]\n\t"
        "LDR	r12, [%[a], #188]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #188]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+48] += m[48] * mu */
        "LDR	r9, [%[m], #192]\n\t"
        "LDR	r12, [%[a], #192]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #192]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+49] += m[49] * mu */
        "LDR	r9, [%[m], #196]\n\t"
        "LDR	r12, [%[a], #196]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #196]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+50] += m[50] * mu */
        "LDR	r9, [%[m], #200]\n\t"
        "LDR	r12, [%[a], #200]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #200]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+51] += m[51] * mu */
        "LDR	r9, [%[m], #204]\n\t"
        "LDR	r12, [%[a], #204]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #204]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+52] += m[52] * mu */
        "LDR	r9, [%[m], #208]\n\t"
        "LDR	r12, [%[a], #208]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #208]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+53] += m[53] * mu */
        "LDR	r9, [%[m], #212]\n\t"
        "LDR	r12, [%[a], #212]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #212]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+54] += m[54] * mu */
        "LDR	r9, [%[m], #216]\n\t"
        "LDR	r12, [%[a], #216]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #216]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+55] += m[55] * mu */
        "LDR	r9, [%[m], #220]\n\t"
        "LDR	r12, [%[a], #220]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #220]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+56] += m[56] * mu */
        "LDR	r9, [%[m], #224]\n\t"
        "LDR	r12, [%[a], #224]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #224]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+57] += m[57] * mu */
        "LDR	r9, [%[m], #228]\n\t"
        "LDR	r12, [%[a], #228]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #228]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+58] += m[58] * mu */
        "LDR	r9, [%[m], #232]\n\t"
        "LDR	r12, [%[a], #232]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #232]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+59] += m[59] * mu */
        "LDR	r9, [%[m], #236]\n\t"
        "LDR	r12, [%[a], #236]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #236]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+60] += m[60] * mu */
        "LDR	r9, [%[m], #240]\n\t"
        "LDR	r12, [%[a], #240]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #240]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+61] += m[61] * mu */
        "LDR	r9, [%[m], #244]\n\t"
        "LDR	r12, [%[a], #244]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #244]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+62] += m[62] * mu */
        "LDR	r9, [%[m], #248]\n\t"
        "LDR	r12, [%[a], #248]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #248]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+63] += m[63] * mu */
        "LDR	r9, [%[m], #252]\n\t"
        "LDR	r12, [%[a], #252]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #252]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+64] += m[64] * mu */
        "LDR	r9, [%[m], #256]\n\t"
        "LDR	r12, [%[a], #256]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #256]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+65] += m[65] * mu */
        "LDR	r9, [%[m], #260]\n\t"
        "LDR	r12, [%[a], #260]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #260]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+66] += m[66] * mu */
        "LDR	r9, [%[m], #264]\n\t"
        "LDR	r12, [%[a], #264]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #264]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+67] += m[67] * mu */
        "LDR	r9, [%[m], #268]\n\t"
        "LDR	r12, [%[a], #268]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #268]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+68] += m[68] * mu */
        "LDR	r9, [%[m], #272]\n\t"
        "LDR	r12, [%[a], #272]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #272]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+69] += m[69] * mu */
        "LDR	r9, [%[m], #276]\n\t"
        "LDR	r12, [%[a], #276]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #276]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+70] += m[70] * mu */
        "LDR	r9, [%[m], #280]\n\t"
        "LDR	r12, [%[a], #280]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #280]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+71] += m[71] * mu */
        "LDR	r9, [%[m], #284]\n\t"
        "LDR	r12, [%[a], #284]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #284]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+72] += m[72] * mu */
        "LDR	r9, [%[m], #288]\n\t"
        "LDR	r12, [%[a], #288]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #288]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+73] += m[73] * mu */
        "LDR	r9, [%[m], #292]\n\t"
        "LDR	r12, [%[a], #292]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #292]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+74] += m[74] * mu */
        "LDR	r9, [%[m], #296]\n\t"
        "LDR	r12, [%[a], #296]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #296]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+75] += m[75] * mu */
        "LDR	r9, [%[m], #300]\n\t"
        "LDR	r12, [%[a], #300]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #300]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+76] += m[76] * mu */
        "LDR	r9, [%[m], #304]\n\t"
        "LDR	r12, [%[a], #304]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #304]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+77] += m[77] * mu */
        "LDR	r9, [%[m], #308]\n\t"
        "LDR	r12, [%[a], #308]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #308]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+78] += m[78] * mu */
        "LDR	r9, [%[m], #312]\n\t"
        "LDR	r12, [%[a], #312]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #312]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+79] += m[79] * mu */
        "LDR	r9, [%[m], #316]\n\t"
        "LDR	r12, [%[a], #316]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #316]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+80] += m[80] * mu */
        "LDR	r9, [%[m], #320]\n\t"
        "LDR	r12, [%[a], #320]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #320]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+81] += m[81] * mu */
        "LDR	r9, [%[m], #324]\n\t"
        "LDR	r12, [%[a], #324]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #324]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+82] += m[82] * mu */
        "LDR	r9, [%[m], #328]\n\t"
        "LDR	r12, [%[a], #328]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #328]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+83] += m[83] * mu */
        "LDR	r9, [%[m], #332]\n\t"
        "LDR	r12, [%[a], #332]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #332]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+84] += m[84] * mu */
        "LDR	r9, [%[m], #336]\n\t"
        "LDR	r12, [%[a], #336]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #336]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+85] += m[85] * mu */
        "LDR	r9, [%[m], #340]\n\t"
        "LDR	r12, [%[a], #340]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #340]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+86] += m[86] * mu */
        "LDR	r9, [%[m], #344]\n\t"
        "LDR	r12, [%[a], #344]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #344]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+87] += m[87] * mu */
        "LDR	r9, [%[m], #348]\n\t"
        "LDR	r12, [%[a], #348]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #348]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+88] += m[88] * mu */
        "LDR	r9, [%[m], #352]\n\t"
        "LDR	r12, [%[a], #352]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #352]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+89] += m[89] * mu */
        "LDR	r9, [%[m], #356]\n\t"
        "LDR	r12, [%[a], #356]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #356]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+90] += m[90] * mu */
        "LDR	r9, [%[m], #360]\n\t"
        "LDR	r12, [%[a], #360]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #360]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+91] += m[91] * mu */
        "LDR	r9, [%[m], #364]\n\t"
        "LDR	r12, [%[a], #364]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #364]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+92] += m[92] * mu */
        "LDR	r9, [%[m], #368]\n\t"
        "LDR	r12, [%[a], #368]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #368]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+93] += m[93] * mu */
        "LDR	r9, [%[m], #372]\n\t"
        "LDR	r12, [%[a], #372]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #372]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+94] += m[94] * mu */
        "LDR	r9, [%[m], #376]\n\t"
        "LDR	r12, [%[a], #376]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #376]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+95] += m[95] * mu */
        "LDR	r9, [%[m], #380]\n\t"
        "LDR	r12, [%[a], #380]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #380]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+96] += m[96] * mu */
        "LDR	r9, [%[m], #384]\n\t"
        "LDR	r12, [%[a], #384]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #384]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+97] += m[97] * mu */
        "LDR	r9, [%[m], #388]\n\t"
        "LDR	r12, [%[a], #388]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #388]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+98] += m[98] * mu */
        "LDR	r9, [%[m], #392]\n\t"
        "LDR	r12, [%[a], #392]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #392]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+99] += m[99] * mu */
        "LDR	r9, [%[m], #396]\n\t"
        "LDR	r12, [%[a], #396]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #396]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+100] += m[100] * mu */
        "LDR	r9, [%[m], #400]\n\t"
        "LDR	r12, [%[a], #400]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #400]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+101] += m[101] * mu */
        "LDR	r9, [%[m], #404]\n\t"
        "LDR	r12, [%[a], #404]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #404]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+102] += m[102] * mu */
        "LDR	r9, [%[m], #408]\n\t"
        "LDR	r12, [%[a], #408]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #408]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+103] += m[103] * mu */
        "LDR	r9, [%[m], #412]\n\t"
        "LDR	r12, [%[a], #412]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #412]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+104] += m[104] * mu */
        "LDR	r9, [%[m], #416]\n\t"
        "LDR	r12, [%[a], #416]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #416]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+105] += m[105] * mu */
        "LDR	r9, [%[m], #420]\n\t"
        "LDR	r12, [%[a], #420]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #420]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+106] += m[106] * mu */
        "LDR	r9, [%[m], #424]\n\t"
        "LDR	r12, [%[a], #424]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #424]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+107] += m[107] * mu */
        "LDR	r9, [%[m], #428]\n\t"
        "LDR	r12, [%[a], #428]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #428]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+108] += m[108] * mu */
        "LDR	r9, [%[m], #432]\n\t"
        "LDR	r12, [%[a], #432]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #432]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+109] += m[109] * mu */
        "LDR	r9, [%[m], #436]\n\t"
        "LDR	r12, [%[a], #436]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #436]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+110] += m[110] * mu */
        "LDR	r9, [%[m], #440]\n\t"
        "LDR	r12, [%[a], #440]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #440]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+111] += m[111] * mu */
        "LDR	r9, [%[m], #444]\n\t"
        "LDR	r12, [%[a], #444]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #444]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+112] += m[112] * mu */
        "LDR	r9, [%[m], #448]\n\t"
        "LDR	r12, [%[a], #448]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #448]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+113] += m[113] * mu */
        "LDR	r9, [%[m], #452]\n\t"
        "LDR	r12, [%[a], #452]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #452]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+114] += m[114] * mu */
        "LDR	r9, [%[m], #456]\n\t"
        "LDR	r12, [%[a], #456]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #456]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+115] += m[115] * mu */
        "LDR	r9, [%[m], #460]\n\t"
        "LDR	r12, [%[a], #460]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #460]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+116] += m[116] * mu */
        "LDR	r9, [%[m], #464]\n\t"
        "LDR	r12, [%[a], #464]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #464]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+117] += m[117] * mu */
        "LDR	r9, [%[m], #468]\n\t"
        "LDR	r12, [%[a], #468]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #468]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+118] += m[118] * mu */
        "LDR	r9, [%[m], #472]\n\t"
        "LDR	r12, [%[a], #472]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #472]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+119] += m[119] * mu */
        "LDR	r9, [%[m], #476]\n\t"
        "LDR	r12, [%[a], #476]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #476]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+120] += m[120] * mu */
        "LDR	r9, [%[m], #480]\n\t"
        "LDR	r12, [%[a], #480]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #480]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+121] += m[121] * mu */
        "LDR	r9, [%[m], #484]\n\t"
        "LDR	r12, [%[a], #484]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #484]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+122] += m[122] * mu */
        "LDR	r9, [%[m], #488]\n\t"
        "LDR	r12, [%[a], #488]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #488]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+123] += m[123] * mu */
        "LDR	r9, [%[m], #492]\n\t"
        "LDR	r12, [%[a], #492]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #492]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+124] += m[124] * mu */
        "LDR	r9, [%[m], #496]\n\t"
        "LDR	r12, [%[a], #496]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #496]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+125] += m[125] * mu */
        "LDR	r9, [%[m], #500]\n\t"
        "LDR	r12, [%[a], #500]\n\t"
        "MOV	r6, #0x0\n\t"
        "UMLAL	r12, r6, r10, r9\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #500]\n\t"
        "ADC	r6, r6, #0x0\n\t"
        /* a[i+126] += m[126] * mu */
        "LDR	r9, [%[m], #504]\n\t"
        "LDR	r12, [%[a], #504]\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r12, r7, r10, r9\n\t"
        "ADDS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #504]\n\t"
        "ADC	r7, r7, #0x0\n\t"
        /* a[i+127] += m[127] * mu */
        "LDR	r9, [%[m], #508]\n\t"
        "LDR	r12, [%[a], #508]\n\t"
        "UMULL	r8, r9, r10, r9\n\t"
        "ADDS	r7, r7, r8\n\t"
        "ADCS	r6, r9, r3\n\t"
        "MOV	r3, #0x0\n\t"
        "ADC	r3, r3, r3\n\t"
        "ADDS	r12, r12, r7\n\t"
        "STR	r12, [%[a], #508]\n\t"
        "LDR	r12, [%[a], #512]\n\t"
        "ADCS	r12, r12, r6\n\t"
        "STR	r12, [%[a], #512]\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* i += 1 */
        "ADD	r11, r11, #0x4\n\t"
        "ADD	%[a], %[a], #0x4\n\t"
        "CMP	r11, #0x200\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_4096_mont_reduce_128_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.W	L_sp_4096_mont_reduce_128_word\n\t"
#else
        "BLT.W	L_sp_4096_mont_reduce_128_word_%=\n\t"
#endif
        /* Loop Done */
        "STR	r4, [%[a]]\n\t"
        "STR	r5, [%[a], #4]\n\t"
        "MOV	%[mp], r3\n\t"
        : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
    );
    sp_4096_cond_sub_128(a - 128, a, m, (sp_digit)0 - mp);
}

#else
/* Reduce the number back to 4096 bits using Montgomery reduction.
 *
 * Looped (small code size) variant. The outer loop visits 128 consecutive
 * 32-bit words of a (byte offset counter runs 0..0x200 in steps of 4). For
 * each word it computes mu = a[i] * mp and adds m * mu into a[i..i+127], four
 * words per pass of the inner loop, then folds the resulting carry into
 * a[i+128]. After the loop the final carry is turned into a mask and passed
 * to sp_4096_cond_sub_128() together with the upper half of a and m.
 *
 * Register use inside the assembly:
 *   r3  carry held over between outer iterations (ca)
 *   r4  running carry within the inner loop
 *   r5  high word of the current multiply-accumulate
 *   r7  current word of m
 *   r8  mu
 *   r9  outer loop byte offset (i * 4)
 *   r10 current word of a
 *   r12 inner loop byte offset (j * 4)
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
#else
SP_NOINLINE static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0-r2 so the assembly operands are fixed. */
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* m[0] is loaded into r11; no later instruction in this block reads
         * r11. */
        "LDR	r11, [%[m]]\n\t"
        /* i = 0 */
        "MOV	r9, #0x0\n\t"
        /* ca = 0 */
        "MOV	r3, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_4096_mont_reduce_128_word:\n\t"
#else
    "L_sp_4096_mont_reduce_128_word_%=:\n\t"
#endif
        /* mu = a[i] * mp */
        "LDR	r10, [%[a]]\n\t"
        "MUL	r8, %[mp], r10\n\t"
        /* j = 0 */
        "MOV	r12, #0x0\n\t"
        /* inner carry = 0 */
        "MOV	r4, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_4096_mont_reduce_128_mul:\n\t"
#else
    "L_sp_4096_mont_reduce_128_mul_%=:\n\t"
#endif
        /* a[i+j+0] += m[j+0] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "MOV	r5, #0x0\n\t"
        /* r5:r10 = a word + mu * m word */
        "UMLAL	r10, r5, r8, r7\n\t"
        /* add carry from previous word, propagate into next carry */
        "ADDS	r10, r10, r4\n\t"
        "STR	r10, [%[a], r12]\n\t"
        "ADC	r4, r5, #0x0\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+1] += m[j+1] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "MOV	r5, #0x0\n\t"
        "UMLAL	r10, r5, r8, r7\n\t"
        "ADDS	r10, r10, r4\n\t"
        "STR	r10, [%[a], r12]\n\t"
        "ADC	r4, r5, #0x0\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+2] += m[j+2] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "MOV	r5, #0x0\n\t"
        "UMLAL	r10, r5, r8, r7\n\t"
        "ADDS	r10, r10, r4\n\t"
        "STR	r10, [%[a], r12]\n\t"
        "ADC	r4, r5, #0x0\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+3] += m[j+3] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "MOV	r5, #0x0\n\t"
        "UMLAL	r10, r5, r8, r7\n\t"
        "ADDS	r10, r10, r4\n\t"
        "STR	r10, [%[a], r12]\n\t"
        "ADC	r4, r5, #0x0\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* loop until all 128 words (0x200 bytes) of m have been used */
        "CMP	r12, #0x200\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_4096_mont_reduce_128_mul_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_4096_mont_reduce_128_mul\n\t"
#else
        "BLT.N	L_sp_4096_mont_reduce_128_mul_%=\n\t"
#endif
        /* a[i+128] += inner carry (r4) + held-over carry (r3).
         * Both additions can carry out; the new held-over carry in r3 must
         * be the sum of those two carry bits. r3 is zeroed before the first
         * carry is captured so that "ADC r3, r3, r3" yields exactly that
         * carry, and the second carry is then added with "ADC r3, r3, #0".
         */
        "LDR	r10, [%[a], #512]\n\t"
        "ADDS	r4, r4, r3\n\t"
        "MOV	r3, #0x0\n\t"
        "ADC	r3, r3, r3\n\t"
        "ADDS	r10, r10, r4\n\t"
        "STR	r10, [%[a], #512]\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* i += 1 */
        "ADD	r9, r9, #0x4\n\t"
        "ADD	%[a], %[a], #0x4\n\t"
        "CMP	r9, #0x200\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_4096_mont_reduce_128_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_4096_mont_reduce_128_word\n\t"
#else
        "BLT.N	L_sp_4096_mont_reduce_128_word_%=\n\t"
#endif
        /* Loop Done */
        /* hand the final carry back to C through mp */
        "MOV	%[mp], r3\n\t"
        : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
    );
    /* a was advanced by 0x200 bytes inside the loop, so it now addresses the
     * upper half; a - 128 is the original start (assumes 4-byte sp_digit, as
     * the word loads above do). The mask is 0 - carry; presumably
     * sp_4096_cond_sub_128 subtracts m only where mask bits are set -
     * confirm against its definition. */
    sp_4096_cond_sub_128(a - 128, a, m, (sp_digit)0 - mp);
}

#endif /* !WOLFSSL_SP_SMALL */
#else
#ifndef WOLFSSL_SP_SMALL
/* Reduce the number back to 4096 bits using Montgomery reduction.
 *
 * Fully unrolled variant: every pass of the outer loop handles one word of a
 * with 128 UMAAL multiply-accumulates and no inner loop. The outer loop runs
 * 128 times, advancing the a pointer by one word per pass.
 *
 * a   A single precision number to reduce in place. Words a[0..255] are
 *     read and/or written by the assembly below.
 * m   The single precision number representing the modulus. Words m[0..127]
 *     are read.
 * mp  The digit representing the negative inverse of m mod 2^n.
 *
 * Register roles inside the assembly:
 *   r4       byte counter for the outer loop (0, 4, ..., 0x200)
 *   r5       carry out of the previous pass, folded into a[i+128]
 *   r6-r10   a[i+0..i+4] cached in registers; they slide down by one word
 *            per pass and are only written back after the loop
 *   lr       mu for the current pass (reused as scratch at the end of a pass)
 *   r3       running carry (high word) of the UMAAL chain
 *   r12      current word of m
 *   r11      current word of a for the words that are not cached
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
#else
SP_NOINLINE static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0-r2 so the assembly can use r3-r12 and lr freely. */
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* i = 0 */
        "MOV	r4, #0x0\n\t"
        /* carry from the previous pass = 0 */
        "MOV	r5, #0x0\n\t"
        /* Preload a[0..4] into the register cache. */
        "LDR	r6, [%[a]]\n\t"
        "LDR	r7, [%[a], #4]\n\t"
        "LDR	r8, [%[a], #8]\n\t"
        "LDR	r9, [%[a], #12]\n\t"
        "LDR	r10, [%[a], #16]\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_4096_mont_reduce_128_word:\n\t"
#else
    "L_sp_4096_mont_reduce_128_word_%=:\n\t"
#endif
        /* mu = a[i] * mp */
        "MUL	lr, %[mp], r6\n\t"
        /* a[i+0] += m[0] * mu */
        /* Only the carry in r3 is kept: the low word in r6 is overwritten by
         * the MOV that follows. */
        "LDR	r12, [%[m]]\n\t"
        "MOV	r3, #0x0\n\t"
        "UMAAL	r6, r3, lr, r12\n\t"
        /* a[i+1] += m[1] * mu */
        /* Cached words shift down one register so that r6 holds a[i+1], which
         * is a[i] of the next pass. */
        "LDR	r12, [%[m], #4]\n\t"
        "MOV	r6, r7\n\t"
        "UMAAL	r6, r3, lr, r12\n\t"
        /* a[i+2] += m[2] * mu */
        "LDR	r12, [%[m], #8]\n\t"
        "MOV	r7, r8\n\t"
        "UMAAL	r7, r3, lr, r12\n\t"
        /* a[i+3] += m[3] * mu */
        "LDR	r12, [%[m], #12]\n\t"
        "MOV	r8, r9\n\t"
        "UMAAL	r8, r3, lr, r12\n\t"
        /* a[i+4] += m[4] * mu */
        "LDR	r12, [%[m], #16]\n\t"
        "MOV	r9, r10\n\t"
        "UMAAL	r9, r3, lr, r12\n\t"
        /* a[i+5] += m[5] * mu */
        /* Loaded from memory into the last cache register; not stored back. */
        "LDR	r12, [%[m], #20]\n\t"
        "LDR	r10, [%[a], #20]\n\t"
        "UMAAL	r10, r3, lr, r12\n\t"
        /* a[i+6] += m[6] * mu */
        /* From here on each word is loaded, accumulated and stored via r11. */
        "LDR	r12, [%[m], #24]\n\t"
        "LDR	r11, [%[a], #24]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #24]\n\t"
        /* a[i+7] += m[7] * mu */
        "LDR	r12, [%[m], #28]\n\t"
        "LDR	r11, [%[a], #28]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #28]\n\t"
        /* a[i+8] += m[8] * mu */
        "LDR	r12, [%[m], #32]\n\t"
        "LDR	r11, [%[a], #32]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #32]\n\t"
        /* a[i+9] += m[9] * mu */
        "LDR	r12, [%[m], #36]\n\t"
        "LDR	r11, [%[a], #36]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #36]\n\t"
        /* a[i+10] += m[10] * mu */
        "LDR	r12, [%[m], #40]\n\t"
        "LDR	r11, [%[a], #40]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #40]\n\t"
        /* a[i+11] += m[11] * mu */
        "LDR	r12, [%[m], #44]\n\t"
        "LDR	r11, [%[a], #44]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #44]\n\t"
        /* a[i+12] += m[12] * mu */
        "LDR	r12, [%[m], #48]\n\t"
        "LDR	r11, [%[a], #48]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #48]\n\t"
        /* a[i+13] += m[13] * mu */
        "LDR	r12, [%[m], #52]\n\t"
        "LDR	r11, [%[a], #52]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #52]\n\t"
        /* a[i+14] += m[14] * mu */
        "LDR	r12, [%[m], #56]\n\t"
        "LDR	r11, [%[a], #56]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #56]\n\t"
        /* a[i+15] += m[15] * mu */
        "LDR	r12, [%[m], #60]\n\t"
        "LDR	r11, [%[a], #60]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #60]\n\t"
        /* a[i+16] += m[16] * mu */
        "LDR	r12, [%[m], #64]\n\t"
        "LDR	r11, [%[a], #64]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #64]\n\t"
        /* a[i+17] += m[17] * mu */
        "LDR	r12, [%[m], #68]\n\t"
        "LDR	r11, [%[a], #68]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #68]\n\t"
        /* a[i+18] += m[18] * mu */
        "LDR	r12, [%[m], #72]\n\t"
        "LDR	r11, [%[a], #72]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #72]\n\t"
        /* a[i+19] += m[19] * mu */
        "LDR	r12, [%[m], #76]\n\t"
        "LDR	r11, [%[a], #76]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #76]\n\t"
        /* a[i+20] += m[20] * mu */
        "LDR	r12, [%[m], #80]\n\t"
        "LDR	r11, [%[a], #80]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #80]\n\t"
        /* a[i+21] += m[21] * mu */
        "LDR	r12, [%[m], #84]\n\t"
        "LDR	r11, [%[a], #84]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #84]\n\t"
        /* a[i+22] += m[22] * mu */
        "LDR	r12, [%[m], #88]\n\t"
        "LDR	r11, [%[a], #88]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #88]\n\t"
        /* a[i+23] += m[23] * mu */
        "LDR	r12, [%[m], #92]\n\t"
        "LDR	r11, [%[a], #92]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #92]\n\t"
        /* a[i+24] += m[24] * mu */
        "LDR	r12, [%[m], #96]\n\t"
        "LDR	r11, [%[a], #96]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #96]\n\t"
        /* a[i+25] += m[25] * mu */
        "LDR	r12, [%[m], #100]\n\t"
        "LDR	r11, [%[a], #100]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #100]\n\t"
        /* a[i+26] += m[26] * mu */
        "LDR	r12, [%[m], #104]\n\t"
        "LDR	r11, [%[a], #104]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #104]\n\t"
        /* a[i+27] += m[27] * mu */
        "LDR	r12, [%[m], #108]\n\t"
        "LDR	r11, [%[a], #108]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #108]\n\t"
        /* a[i+28] += m[28] * mu */
        "LDR	r12, [%[m], #112]\n\t"
        "LDR	r11, [%[a], #112]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #112]\n\t"
        /* a[i+29] += m[29] * mu */
        "LDR	r12, [%[m], #116]\n\t"
        "LDR	r11, [%[a], #116]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #116]\n\t"
        /* a[i+30] += m[30] * mu */
        "LDR	r12, [%[m], #120]\n\t"
        "LDR	r11, [%[a], #120]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #120]\n\t"
        /* a[i+31] += m[31] * mu */
        "LDR	r12, [%[m], #124]\n\t"
        "LDR	r11, [%[a], #124]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #124]\n\t"
        /* a[i+32] += m[32] * mu */
        "LDR	r12, [%[m], #128]\n\t"
        "LDR	r11, [%[a], #128]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #128]\n\t"
        /* a[i+33] += m[33] * mu */
        "LDR	r12, [%[m], #132]\n\t"
        "LDR	r11, [%[a], #132]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #132]\n\t"
        /* a[i+34] += m[34] * mu */
        "LDR	r12, [%[m], #136]\n\t"
        "LDR	r11, [%[a], #136]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #136]\n\t"
        /* a[i+35] += m[35] * mu */
        "LDR	r12, [%[m], #140]\n\t"
        "LDR	r11, [%[a], #140]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #140]\n\t"
        /* a[i+36] += m[36] * mu */
        "LDR	r12, [%[m], #144]\n\t"
        "LDR	r11, [%[a], #144]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #144]\n\t"
        /* a[i+37] += m[37] * mu */
        "LDR	r12, [%[m], #148]\n\t"
        "LDR	r11, [%[a], #148]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #148]\n\t"
        /* a[i+38] += m[38] * mu */
        "LDR	r12, [%[m], #152]\n\t"
        "LDR	r11, [%[a], #152]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #152]\n\t"
        /* a[i+39] += m[39] * mu */
        "LDR	r12, [%[m], #156]\n\t"
        "LDR	r11, [%[a], #156]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #156]\n\t"
        /* a[i+40] += m[40] * mu */
        "LDR	r12, [%[m], #160]\n\t"
        "LDR	r11, [%[a], #160]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #160]\n\t"
        /* a[i+41] += m[41] * mu */
        "LDR	r12, [%[m], #164]\n\t"
        "LDR	r11, [%[a], #164]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #164]\n\t"
        /* a[i+42] += m[42] * mu */
        "LDR	r12, [%[m], #168]\n\t"
        "LDR	r11, [%[a], #168]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #168]\n\t"
        /* a[i+43] += m[43] * mu */
        "LDR	r12, [%[m], #172]\n\t"
        "LDR	r11, [%[a], #172]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #172]\n\t"
        /* a[i+44] += m[44] * mu */
        "LDR	r12, [%[m], #176]\n\t"
        "LDR	r11, [%[a], #176]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #176]\n\t"
        /* a[i+45] += m[45] * mu */
        "LDR	r12, [%[m], #180]\n\t"
        "LDR	r11, [%[a], #180]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #180]\n\t"
        /* a[i+46] += m[46] * mu */
        "LDR	r12, [%[m], #184]\n\t"
        "LDR	r11, [%[a], #184]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #184]\n\t"
        /* a[i+47] += m[47] * mu */
        "LDR	r12, [%[m], #188]\n\t"
        "LDR	r11, [%[a], #188]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #188]\n\t"
        /* a[i+48] += m[48] * mu */
        "LDR	r12, [%[m], #192]\n\t"
        "LDR	r11, [%[a], #192]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #192]\n\t"
        /* a[i+49] += m[49] * mu */
        "LDR	r12, [%[m], #196]\n\t"
        "LDR	r11, [%[a], #196]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #196]\n\t"
        /* a[i+50] += m[50] * mu */
        "LDR	r12, [%[m], #200]\n\t"
        "LDR	r11, [%[a], #200]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #200]\n\t"
        /* a[i+51] += m[51] * mu */
        "LDR	r12, [%[m], #204]\n\t"
        "LDR	r11, [%[a], #204]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #204]\n\t"
        /* a[i+52] += m[52] * mu */
        "LDR	r12, [%[m], #208]\n\t"
        "LDR	r11, [%[a], #208]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #208]\n\t"
        /* a[i+53] += m[53] * mu */
        "LDR	r12, [%[m], #212]\n\t"
        "LDR	r11, [%[a], #212]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #212]\n\t"
        /* a[i+54] += m[54] * mu */
        "LDR	r12, [%[m], #216]\n\t"
        "LDR	r11, [%[a], #216]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #216]\n\t"
        /* a[i+55] += m[55] * mu */
        "LDR	r12, [%[m], #220]\n\t"
        "LDR	r11, [%[a], #220]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #220]\n\t"
        /* a[i+56] += m[56] * mu */
        "LDR	r12, [%[m], #224]\n\t"
        "LDR	r11, [%[a], #224]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #224]\n\t"
        /* a[i+57] += m[57] * mu */
        "LDR	r12, [%[m], #228]\n\t"
        "LDR	r11, [%[a], #228]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #228]\n\t"
        /* a[i+58] += m[58] * mu */
        "LDR	r12, [%[m], #232]\n\t"
        "LDR	r11, [%[a], #232]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #232]\n\t"
        /* a[i+59] += m[59] * mu */
        "LDR	r12, [%[m], #236]\n\t"
        "LDR	r11, [%[a], #236]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #236]\n\t"
        /* a[i+60] += m[60] * mu */
        "LDR	r12, [%[m], #240]\n\t"
        "LDR	r11, [%[a], #240]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #240]\n\t"
        /* a[i+61] += m[61] * mu */
        "LDR	r12, [%[m], #244]\n\t"
        "LDR	r11, [%[a], #244]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #244]\n\t"
        /* a[i+62] += m[62] * mu */
        "LDR	r12, [%[m], #248]\n\t"
        "LDR	r11, [%[a], #248]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #248]\n\t"
        /* a[i+63] += m[63] * mu */
        "LDR	r12, [%[m], #252]\n\t"
        "LDR	r11, [%[a], #252]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #252]\n\t"
        /* a[i+64] += m[64] * mu */
        "LDR	r12, [%[m], #256]\n\t"
        "LDR	r11, [%[a], #256]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #256]\n\t"
        /* a[i+65] += m[65] * mu */
        "LDR	r12, [%[m], #260]\n\t"
        "LDR	r11, [%[a], #260]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #260]\n\t"
        /* a[i+66] += m[66] * mu */
        "LDR	r12, [%[m], #264]\n\t"
        "LDR	r11, [%[a], #264]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #264]\n\t"
        /* a[i+67] += m[67] * mu */
        "LDR	r12, [%[m], #268]\n\t"
        "LDR	r11, [%[a], #268]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #268]\n\t"
        /* a[i+68] += m[68] * mu */
        "LDR	r12, [%[m], #272]\n\t"
        "LDR	r11, [%[a], #272]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #272]\n\t"
        /* a[i+69] += m[69] * mu */
        "LDR	r12, [%[m], #276]\n\t"
        "LDR	r11, [%[a], #276]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #276]\n\t"
        /* a[i+70] += m[70] * mu */
        "LDR	r12, [%[m], #280]\n\t"
        "LDR	r11, [%[a], #280]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #280]\n\t"
        /* a[i+71] += m[71] * mu */
        "LDR	r12, [%[m], #284]\n\t"
        "LDR	r11, [%[a], #284]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #284]\n\t"
        /* a[i+72] += m[72] * mu */
        "LDR	r12, [%[m], #288]\n\t"
        "LDR	r11, [%[a], #288]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #288]\n\t"
        /* a[i+73] += m[73] * mu */
        "LDR	r12, [%[m], #292]\n\t"
        "LDR	r11, [%[a], #292]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #292]\n\t"
        /* a[i+74] += m[74] * mu */
        "LDR	r12, [%[m], #296]\n\t"
        "LDR	r11, [%[a], #296]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #296]\n\t"
        /* a[i+75] += m[75] * mu */
        "LDR	r12, [%[m], #300]\n\t"
        "LDR	r11, [%[a], #300]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #300]\n\t"
        /* a[i+76] += m[76] * mu */
        "LDR	r12, [%[m], #304]\n\t"
        "LDR	r11, [%[a], #304]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #304]\n\t"
        /* a[i+77] += m[77] * mu */
        "LDR	r12, [%[m], #308]\n\t"
        "LDR	r11, [%[a], #308]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #308]\n\t"
        /* a[i+78] += m[78] * mu */
        "LDR	r12, [%[m], #312]\n\t"
        "LDR	r11, [%[a], #312]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #312]\n\t"
        /* a[i+79] += m[79] * mu */
        "LDR	r12, [%[m], #316]\n\t"
        "LDR	r11, [%[a], #316]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #316]\n\t"
        /* a[i+80] += m[80] * mu */
        "LDR	r12, [%[m], #320]\n\t"
        "LDR	r11, [%[a], #320]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #320]\n\t"
        /* a[i+81] += m[81] * mu */
        "LDR	r12, [%[m], #324]\n\t"
        "LDR	r11, [%[a], #324]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #324]\n\t"
        /* a[i+82] += m[82] * mu */
        "LDR	r12, [%[m], #328]\n\t"
        "LDR	r11, [%[a], #328]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #328]\n\t"
        /* a[i+83] += m[83] * mu */
        "LDR	r12, [%[m], #332]\n\t"
        "LDR	r11, [%[a], #332]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #332]\n\t"
        /* a[i+84] += m[84] * mu */
        "LDR	r12, [%[m], #336]\n\t"
        "LDR	r11, [%[a], #336]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #336]\n\t"
        /* a[i+85] += m[85] * mu */
        "LDR	r12, [%[m], #340]\n\t"
        "LDR	r11, [%[a], #340]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #340]\n\t"
        /* a[i+86] += m[86] * mu */
        "LDR	r12, [%[m], #344]\n\t"
        "LDR	r11, [%[a], #344]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #344]\n\t"
        /* a[i+87] += m[87] * mu */
        "LDR	r12, [%[m], #348]\n\t"
        "LDR	r11, [%[a], #348]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #348]\n\t"
        /* a[i+88] += m[88] * mu */
        "LDR	r12, [%[m], #352]\n\t"
        "LDR	r11, [%[a], #352]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #352]\n\t"
        /* a[i+89] += m[89] * mu */
        "LDR	r12, [%[m], #356]\n\t"
        "LDR	r11, [%[a], #356]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #356]\n\t"
        /* a[i+90] += m[90] * mu */
        "LDR	r12, [%[m], #360]\n\t"
        "LDR	r11, [%[a], #360]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #360]\n\t"
        /* a[i+91] += m[91] * mu */
        "LDR	r12, [%[m], #364]\n\t"
        "LDR	r11, [%[a], #364]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #364]\n\t"
        /* a[i+92] += m[92] * mu */
        "LDR	r12, [%[m], #368]\n\t"
        "LDR	r11, [%[a], #368]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #368]\n\t"
        /* a[i+93] += m[93] * mu */
        "LDR	r12, [%[m], #372]\n\t"
        "LDR	r11, [%[a], #372]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #372]\n\t"
        /* a[i+94] += m[94] * mu */
        "LDR	r12, [%[m], #376]\n\t"
        "LDR	r11, [%[a], #376]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #376]\n\t"
        /* a[i+95] += m[95] * mu */
        "LDR	r12, [%[m], #380]\n\t"
        "LDR	r11, [%[a], #380]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #380]\n\t"
        /* a[i+96] += m[96] * mu */
        "LDR	r12, [%[m], #384]\n\t"
        "LDR	r11, [%[a], #384]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #384]\n\t"
        /* a[i+97] += m[97] * mu */
        "LDR	r12, [%[m], #388]\n\t"
        "LDR	r11, [%[a], #388]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #388]\n\t"
        /* a[i+98] += m[98] * mu */
        "LDR	r12, [%[m], #392]\n\t"
        "LDR	r11, [%[a], #392]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #392]\n\t"
        /* a[i+99] += m[99] * mu */
        "LDR	r12, [%[m], #396]\n\t"
        "LDR	r11, [%[a], #396]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #396]\n\t"
        /* a[i+100] += m[100] * mu */
        "LDR	r12, [%[m], #400]\n\t"
        "LDR	r11, [%[a], #400]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #400]\n\t"
        /* a[i+101] += m[101] * mu */
        "LDR	r12, [%[m], #404]\n\t"
        "LDR	r11, [%[a], #404]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #404]\n\t"
        /* a[i+102] += m[102] * mu */
        "LDR	r12, [%[m], #408]\n\t"
        "LDR	r11, [%[a], #408]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #408]\n\t"
        /* a[i+103] += m[103] * mu */
        "LDR	r12, [%[m], #412]\n\t"
        "LDR	r11, [%[a], #412]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #412]\n\t"
        /* a[i+104] += m[104] * mu */
        "LDR	r12, [%[m], #416]\n\t"
        "LDR	r11, [%[a], #416]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #416]\n\t"
        /* a[i+105] += m[105] * mu */
        "LDR	r12, [%[m], #420]\n\t"
        "LDR	r11, [%[a], #420]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #420]\n\t"
        /* a[i+106] += m[106] * mu */
        "LDR	r12, [%[m], #424]\n\t"
        "LDR	r11, [%[a], #424]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #424]\n\t"
        /* a[i+107] += m[107] * mu */
        "LDR	r12, [%[m], #428]\n\t"
        "LDR	r11, [%[a], #428]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #428]\n\t"
        /* a[i+108] += m[108] * mu */
        "LDR	r12, [%[m], #432]\n\t"
        "LDR	r11, [%[a], #432]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #432]\n\t"
        /* a[i+109] += m[109] * mu */
        "LDR	r12, [%[m], #436]\n\t"
        "LDR	r11, [%[a], #436]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #436]\n\t"
        /* a[i+110] += m[110] * mu */
        "LDR	r12, [%[m], #440]\n\t"
        "LDR	r11, [%[a], #440]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #440]\n\t"
        /* a[i+111] += m[111] * mu */
        "LDR	r12, [%[m], #444]\n\t"
        "LDR	r11, [%[a], #444]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #444]\n\t"
        /* a[i+112] += m[112] * mu */
        "LDR	r12, [%[m], #448]\n\t"
        "LDR	r11, [%[a], #448]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #448]\n\t"
        /* a[i+113] += m[113] * mu */
        "LDR	r12, [%[m], #452]\n\t"
        "LDR	r11, [%[a], #452]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #452]\n\t"
        /* a[i+114] += m[114] * mu */
        "LDR	r12, [%[m], #456]\n\t"
        "LDR	r11, [%[a], #456]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #456]\n\t"
        /* a[i+115] += m[115] * mu */
        "LDR	r12, [%[m], #460]\n\t"
        "LDR	r11, [%[a], #460]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #460]\n\t"
        /* a[i+116] += m[116] * mu */
        "LDR	r12, [%[m], #464]\n\t"
        "LDR	r11, [%[a], #464]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #464]\n\t"
        /* a[i+117] += m[117] * mu */
        "LDR	r12, [%[m], #468]\n\t"
        "LDR	r11, [%[a], #468]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #468]\n\t"
        /* a[i+118] += m[118] * mu */
        "LDR	r12, [%[m], #472]\n\t"
        "LDR	r11, [%[a], #472]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #472]\n\t"
        /* a[i+119] += m[119] * mu */
        "LDR	r12, [%[m], #476]\n\t"
        "LDR	r11, [%[a], #476]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #476]\n\t"
        /* a[i+120] += m[120] * mu */
        "LDR	r12, [%[m], #480]\n\t"
        "LDR	r11, [%[a], #480]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #480]\n\t"
        /* a[i+121] += m[121] * mu */
        "LDR	r12, [%[m], #484]\n\t"
        "LDR	r11, [%[a], #484]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #484]\n\t"
        /* a[i+122] += m[122] * mu */
        "LDR	r12, [%[m], #488]\n\t"
        "LDR	r11, [%[a], #488]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #488]\n\t"
        /* a[i+123] += m[123] * mu */
        "LDR	r12, [%[m], #492]\n\t"
        "LDR	r11, [%[a], #492]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #492]\n\t"
        /* a[i+124] += m[124] * mu */
        "LDR	r12, [%[m], #496]\n\t"
        "LDR	r11, [%[a], #496]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #496]\n\t"
        /* a[i+125] += m[125] * mu */
        "LDR	r12, [%[m], #500]\n\t"
        "LDR	r11, [%[a], #500]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #500]\n\t"
        /* a[i+126] += m[126] * mu */
        "LDR	r12, [%[m], #504]\n\t"
        "LDR	r11, [%[a], #504]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        "STR	r11, [%[a], #504]\n\t"
        /* a[i+127] += m[127] * mu */
        "LDR	r12, [%[m], #508]\n\t"
        "LDR	r11, [%[a], #508]\n\t"
        "UMAAL	r11, r3, lr, r12\n\t"
        /* mu is no longer needed, so lr is reused for a[i+128].
         * UMAAL with both multiplicands zero computes lr:r3 = r3 + a[i+128],
         * i.e. a 64-bit add without touching the flags. */
        "LDR	lr, [%[a], #512]\n\t"
        "MOV	r12, #0x0\n\t"
        "UMAAL	r3, lr, r12, r12\n\t"
        "STR	r11, [%[a], #508]\n\t"
        /* Add the carry of the previous pass; r5 becomes the carry out of
         * a[i+128] for the next pass. */
        "ADDS	r3, r3, r5\n\t"
        "ADC	r5, lr, #0x0\n\t"
        "STR	r3, [%[a], #512]\n\t"
        /* i += 1 */
        "ADD	r4, r4, #0x4\n\t"
        "ADD	%[a], %[a], #0x4\n\t"
        "CMP	r4, #0x200\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_4096_mont_reduce_128_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.W	L_sp_4096_mont_reduce_128_word\n\t"
#else
        "BLT.W	L_sp_4096_mont_reduce_128_word_%=\n\t"
#endif
        /* Loop Done */
        /* Flush the register cache; a now points 128 words past its start. */
        "STR	r6, [%[a]]\n\t"
        "STR	r7, [%[a], #4]\n\t"
        "STR	r8, [%[a], #8]\n\t"
        "STR	r9, [%[a], #12]\n\t"
        "STR	r10, [%[a], #16]\n\t"
        /* Hand the final carry back to C through mp. */
        "MOV	%[mp], r5\n\t"
        : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
    );
    /* a was advanced by 128 words, so a - 128 is the start of the buffer and
     * a is its upper half. mp now holds the final carry; 0 - mp is passed as
     * the mask (presumably selecting whether m is subtracted - see
     * sp_4096_cond_sub_128). */
    sp_4096_cond_sub_128(a - 128, a, m, (sp_digit)0 - mp);
}

#else
/* Reduce the number back to 4096 bits using Montgomery reduction.
 *
 * Compact variant (built when WOLFSSL_SP_SMALL is defined): the outer loop
 * visits one word of a per pass and an inner loop, unrolled four times,
 * accumulates mu * m into a[i..i+127].
 *
 * a   A single precision number to reduce in place. Words a[0..255] are
 *     read and/or written by the assembly below.
 * m   The single precision number representing the modulus. Words m[0..127]
 *     are read.
 * mp  The digit representing the negative inverse of m mod 2^n.
 *
 * Register roles inside the assembly:
 *   r9       byte counter for the outer loop (0, 4, ..., 0x200)
 *   r12      byte offset for the inner loop (0, 4, ..., 0x200)
 *   r8       mu for the current pass
 *   r4       running carry (high word) of the UMAAL chain
 *   r3       ca: carry out of a[i+128], carried into the next pass (0 or 1)
 *   r7/r10   current word of m / current word of a
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p)
#else
SP_NOINLINE static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0-r2 so the assembly can use r3-r12 freely. */
    register sp_digit* a __asm__ ("r0") = (sp_digit*)a_p;
    register const sp_digit* m __asm__ ("r1") = (const sp_digit*)m_p;
    register sp_digit mp __asm__ ("r2") = (sp_digit)mp_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r11 = m[0]; not referenced again in this block. */
        "LDR	r11, [%[m]]\n\t"
        /* i = 0 */
        "MOV	r9, #0x0\n\t"
        /* ca = 0 */
        "MOV	r3, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_4096_mont_reduce_128_word:\n\t"
#else
    "L_sp_4096_mont_reduce_128_word_%=:\n\t"
#endif
        /* mu = a[i] * mp */
        "LDR	r10, [%[a]]\n\t"
        "MUL	r8, %[mp], r10\n\t"
        /* j = 0 */
        "MOV	r12, #0x0\n\t"
        /* carry of the multiply-accumulate chain = 0 */
        "MOV	r4, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_4096_mont_reduce_128_mul:\n\t"
#else
    "L_sp_4096_mont_reduce_128_mul_%=:\n\t"
#endif
        /* a[i+j+0] += m[j+0] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "UMAAL	r10, r4, r8, r7\n\t"
        "STR	r10, [%[a], r12]\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+1] += m[j+1] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "UMAAL	r10, r4, r8, r7\n\t"
        "STR	r10, [%[a], r12]\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+2] += m[j+2] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "UMAAL	r10, r4, r8, r7\n\t"
        "STR	r10, [%[a], r12]\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        /* a[i+j+3] += m[j+3] * mu */
        "LDR	r7, [%[m], r12]\n\t"
        "LDR	r10, [%[a], r12]\n\t"
        "UMAAL	r10, r4, r8, r7\n\t"
        "STR	r10, [%[a], r12]\n\t"
        /* j += 1 */
        "ADD	r12, r12, #0x4\n\t"
        "CMP	r12, #0x200\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_4096_mont_reduce_128_mul_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_4096_mont_reduce_128_mul\n\t"
#else
        "BLT.N	L_sp_4096_mont_reduce_128_mul_%=\n\t"
#endif
        /* a[i+128] += chain carry + ca; new ca = carry out of that sum. */
        "LDR	r10, [%[a], #512]\n\t"
        /* r4 = chain carry + ca; C1 = carry out of this add. */
        "ADDS	r4, r4, r3\n\t"
        /* r3 = C1 (MOV without S leaves the flags alone). */
        "MOV	r3, #0x0\n\t"
        "ADC	r3, r3, #0x0\n\t"
        /* a[i+128] += r4; C2 = carry out of this add. */
        "ADDS	r10, r10, r4\n\t"
        /* ca = C1 + C2. The second operand must be zero: adding r3 to itself
         * would turn C1 = 1 into ca = 2, which corrupts the next pass and the
         * mask built from mp after the loop. C1 and C2 cannot both be set
         * (C1 = 1 means r4 wrapped to 0), so ca stays 0 or 1. */
        "ADC	r3, r3, #0x0\n\t"
        "STR	r10, [%[a], #512]\n\t"
        /* i += 1 */
        "ADD	r9, r9, #0x4\n\t"
        "ADD	%[a], %[a], #0x4\n\t"
        "CMP	r9, #0x200\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_4096_mont_reduce_128_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_4096_mont_reduce_128_word\n\t"
#else
        "BLT.N	L_sp_4096_mont_reduce_128_word_%=\n\t"
#endif
        /* Loop Done */
        /* Hand the final carry back to C through mp. */
        "MOV	%[mp], r3\n\t"
        : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
    );
    /* a was advanced by 128 words, so a - 128 is the start of the buffer and
     * a is its upper half. mp now holds the final carry; 0 - mp is passed as
     * the mask (presumably selecting whether m is subtracted - see
     * sp_4096_cond_sub_128). */
    sp_4096_cond_sub_128(a - 128, a, m, (sp_digit)0 - mp);
}

#endif /* !WOLFSSL_SP_SMALL */
#endif
/* Montgomery multiplication of two 128-word numbers. (r = a * b mod m)
 *
 * Forms the plain product in r and then Montgomery-reduces r in place.
 *
 * r   Output buffer: receives the product, then the reduced result.
 * a   First operand, in Montgomery form.
 * b   Second operand, in Montgomery form.
 * m   Modulus.
 * mp  Montgomery multiplier; passed unchanged to the reduction step.
 */
SP_NOINLINE static void sp_4096_mont_mul_128(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit* m, sp_digit mp)
{
    /* Step 1: r = a * b. */
    sp_4096_mul_128(r, a, b);

    /* Step 2: Montgomery-reduce r with respect to m. */
    sp_4096_mont_reduce_128(r, m, mp);
}

/* Montgomery squaring of a 128-word number. (r = a * a mod m)
 *
 * Forms the plain square in r and then Montgomery-reduces r in place.
 *
 * r   Output buffer: receives the square, then the reduced result.
 * a   Operand to square, in Montgomery form.
 * m   Modulus.
 * mp  Montgomery multiplier; passed unchanged to the reduction step.
 */
SP_NOINLINE static void sp_4096_mont_sqr_128(sp_digit* r, const sp_digit* a,
        const sp_digit* m, sp_digit mp)
{
    /* Step 1: r = a * a. */
    sp_4096_sqr_128(r, a);

    /* Step 2: Montgomery-reduce r with respect to m. */
    sp_4096_mont_reduce_128(r, m, mp);
}

#ifdef WOLFSSL_SP_SMALL
/* Sub b from a into r. (r = a - b)
 *
 * Loop variant: 0x200 bytes (128 32-bit words) are processed four words per
 * pass. The borrow is parked in r11 between passes because the loop's CMP
 * overwrites the carry flag.
 *
 * r  A single precision integer. Receives the difference.
 * a  A single precision integer. The number subtracted from.
 * b  A single precision integer. The number to subtract.
 *
 * Returns 0 when the subtraction did not borrow out of the top word,
 * otherwise 0xFFFFFFFF.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_4096_sub_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to r0-r2 so the assembly can use r3-r12 freely. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r11 = saved borrow (0 = none, 0xFFFFFFFF = borrow); none to start. */
        "MOV	r11, #0x0\n\t"
        /* r12 = end pointer: a + 0x200 bytes. */
        "ADD	r12, %[a], #0x200\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_4096_sub_128_word:\n\t"
#else
    "L_sp_4096_sub_128_word_%=:\n\t"
#endif
        /* Restore the carry flag from r11: 0 - 0 sets C (no borrow),
         * 0 - 0xFFFFFFFF clears C (borrow pending). */
        "RSBS	r11, r11, #0x0\n\t"
        /* Load four words of each operand, post-incrementing both pointers. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        /* Subtract with borrow propagating through all four words. */
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Save the borrow: r3 - r3 - !C gives 0 or 0xFFFFFFFF. */
        "SBC	r11, r3, r3\n\t"
        /* Loop until a reaches the end pointer (this CMP clobbers C). */
        "CMP	%[a], r12\n\t"
#if defined(__GNUC__)
        "BNE	L_sp_4096_sub_128_word_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BNE.N	L_sp_4096_sub_128_word\n\t"
#else
        "BNE.N	L_sp_4096_sub_128_word_%=\n\t"
#endif
        /* Hand the final borrow back to C through the r operand. */
        "MOV	%[r], r11\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc"
    );
    /* r no longer holds a pointer here: it carries the borrow value. */
    return (word32)(size_t)r;
}

#else
/* Sub b from a into r. (r = a - b)
 *
 * Fully unrolled: 32 groups of four words (128 words in total) are loaded
 * from a and b, subtracted with the borrow carried between groups in the
 * carry flag, and stored to r.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 when there is no borrow out of the top word and all bits set
 * (0 - 1) when there is.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_4096_sub_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Place the arguments in the registers named by the asm operands. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Words 0-15. The very first subtract is SUBS, which starts the
         * borrow chain; every later subtract is SBCS and continues it.
         * LDM/STM with '!' advance the a, b and r pointers by four words. */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SUBS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 16-31 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 32-47 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 48-63 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 64-79 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 80-95 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 96-111 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 112-127 */
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[a]!, {r3, r4, r5, r6}\n\t"
        "LDM	%[b]!, {r7, r8, r9, r10}\n\t"
        "SBCS	r3, r3, r7\n\t"
        "SBCS	r4, r4, r8\n\t"
        "SBCS	r5, r5, r9\n\t"
        "SBCS	r6, r6, r10\n\t"
        "STM	%[r]!, {r3, r4, r5, r6}\n\t"
        /* r = r6 - r6 - borrow: 0 when the chain ended without a borrow,
         * all ones when it ended with one. The r pointer is overwritten. */
        "SBC	%[r], r6, r6\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    /* r no longer holds a pointer; it carries the borrow value from above. */
    return (word32)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_USE_UDIV
/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
 *
 * This variant uses the UDIV instruction. The quotient is accumulated in
 * steps: each step estimates part of the quotient by dividing by
 * ((div >> 16) + 1), subtracts div times that estimate from the running
 * remainder kept in (d1|d0), and adds the estimate to the total.
 *
 * d1   The high order half of the number to divide.
 * d0   The low order half of the number to divide.
 * div  The divisor.
 * returns the result of the division.
 *
 * Note that this is an approximate div. It may give an answer 1 larger.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
#else
SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0, sp_digit div)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Place the arguments in the registers named by the asm operands. */
    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r8 = div >> 16; r5 = r8 + 1 is the divisor for the estimates. */
        "LSR	r8, %[div], #16\n\t"
        "ADD	r5, r8, #0x1\n\t"
        /* r6 = (d1 / r5) << 16 is the first estimate; r7 = div << 16. */
        "UDIV	r6, %[d1], r5\n\t"
        "LSL	r7, %[div], #16\n\t"
        "LSL	r6, r6, #16\n\t"
        /* (d1|d0) -= div * r6 */
        "UMULL	r3, r4, %[div], r6\n\t"
        "SUBS	%[d0], %[d0], r3\n\t"
        "SBC	%[d1], %[d1], r4\n\t"
        /* r9 = 1 when d1 >= r5, else 0. SBC of a register with itself
         * depends only on the carry flag, so r9's prior value is irrelevant.
         * r10 = -r9 is the matching all-ones/zero mask. */
        "SUBS	r3, %[d1], r5\n\t"
        "SBC	r9, r9, r9\n\t"
        "ADD	r9, r9, #0x1\n\t"
        "RSB	r10, r9, #0x0\n\t"
        /* When the mask is set: (d1|d0) -= div * 0x10000 (r8|r7) and
         * r6 += 0x10000. With the mask clear both operations use zero. */
        "LSL	r9, r9, #16\n\t"
        "AND	r7, r7, r10\n\t"
        "AND	r8, r8, r10\n\t"
        "SUBS	%[d0], %[d0], r7\n\t"
        "ADD	r6, r6, r9\n\t"
        "SBC	%[d1], %[d1], r8\n\t"
        /* r3 = (d1 << 16) | (d0 >> 16); estimate r3 / r5, add it to the
         * quotient and subtract div * estimate from (d1|d0). */
        "LSL	r4, %[d1], #16\n\t"
        "LSR	r3, %[d0], #16\n\t"
        "ORR	r3, r3, r4\n\t"
        "UDIV	r3, r3, r5\n\t"
        "ADD	r6, r6, r3\n\t"
        "UMULL	r3, r4, %[div], r3\n\t"
        "SUBS	%[d0], %[d0], r3\n\t"
        "SBC	%[d1], %[d1], r4\n\t"
        /* Same estimate again; this time only the low word of the product
         * is computed and only d0 is updated. */
        "LSL	r4, %[d1], #16\n\t"
        "LSR	r3, %[d0], #16\n\t"
        "ORR	r3, r3, r4\n\t"
        "UDIV	r3, r3, r5\n\t"
        "ADD	r6, r6, r3\n\t"
        "MUL	r3, %[div], r3\n\t"
        "SUB	%[d0], %[d0], r3\n\t"
        /* Last step divides what is left in d0 by the full divisor; the
         * result d1 = r6 + d0 / div. */
        "UDIV	r3, %[d0], %[div]\n\t"
        "ADD	%[d1], r6, r3\n\t"
        : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    /* d1 was overwritten with the quotient by the final ADD. */
    return (word32)(size_t)d1;
}

#else
/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
 *
 * This variant does not use a divide instruction. A first estimate of the
 * quotient is built one bit at a time by comparing against and subtracting
 * ((div >> 1) + 1); the estimate is then corrected using multiplications
 * by the full divisor.
 *
 * d1   The high order half of the number to divide.
 * d0   The low order half of the number to divide.
 * div  The divisor.
 * returns the result of the division.
 *
 * Note that this is an approximate div. It may give an answer 1 larger.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1_p, sp_digit d0_p, sp_digit div_p)
#else
SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0, sp_digit div)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Place the arguments in the registers named by the asm operands. */
    register sp_digit d1 __asm__ ("r0") = (sp_digit)d1_p;
    register sp_digit d0 __asm__ ("r1") = (sp_digit)d0_p;
    register sp_digit div __asm__ ("r2") = (sp_digit)div_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r5 = (div >> 1) + 1; (r7|r6) is a working copy of (d1|d0). */
        "LSR	r5, %[div], #1\n\t"
        "ADD	r5, r5, #0x1\n\t"
        "MOV	r6, %[d0]\n\t"
        "MOV	r7, %[d1]\n\t"
        /* Do top 32 */
        /* r8 = all ones when r7 > r5, else 0. That becomes the first
         * quotient bit in r3 and selects whether r5 is subtracted from r7. */
        "SUBS	r8, r5, r7\n\t"
        "SBC	r8, r8, r8\n\t"
        "MOV	r3, #0x0\n\t"
        "SUB	r3, r3, r8\n\t"
        "AND	r8, r8, r5\n\t"
        "SUBS	r7, r7, r8\n\t"
        /* Next 30 bits */
        /* r4 counts down from 29; the loop repeats while it is not negative. */
        "MOV	r4, #0x1d\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_div_4096_word_128_bit:\n\t"
#else
    "L_div_4096_word_128_bit_%=:\n\t"
#endif
        /* Shift (r7|r6) left one bit, then repeat the compare, quotient
         * bit insert (r3 = 2 * r3 + bit) and masked subtract. */
        "LSLS	r6, r6, #1\n\t"
        "ADC	r7, r7, r7\n\t"
        "SUBS	r8, r5, r7\n\t"
        "SBC	r8, r8, r8\n\t"
        "ADD	r3, r3, r3\n\t"
        "SUB	r3, r3, r8\n\t"
        "AND	r8, r8, r5\n\t"
        "SUBS	r7, r7, r8\n\t"
        "SUBS	r4, r4, #0x1\n\t"
#if defined(__GNUC__)
        "BPL	L_div_4096_word_128_bit_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BPL.N	L_div_4096_word_128_bit\n\t"
#else
        "BPL.N	L_div_4096_word_128_bit_%=\n\t"
#endif
        /* r3 = 2 * r3 + 1 */
        "ADD	r3, r3, r3\n\t"
        "ADD	r3, r3, #0x1\n\t"
        /* Three correction rounds: (r10|r9) = (d1|d0) - r3 * div, then the
         * high word r10 of that difference is added to r3. */
        "UMULL	r6, r7, r3, %[div]\n\t"
        "SUBS	r9, %[d0], r6\n\t"
        "SBC	r10, %[d1], r7\n\t"
        "ADD	r3, r3, r10\n\t"
        "UMULL	r6, r7, r3, %[div]\n\t"
        "SUBS	r9, %[d0], r6\n\t"
        "SBC	r10, %[d1], r7\n\t"
        "ADD	r3, r3, r10\n\t"
        "UMULL	r6, r7, r3, %[div]\n\t"
        "SUBS	r9, %[d0], r6\n\t"
        "SBC	r10, %[d1], r7\n\t"
        "ADD	r3, r3, r10\n\t"
        /* r8 = all ones when the low difference word r9 > div, else 0;
         * subtracting it adds one to the result in that case. */
        "SUBS	r8, %[div], r9\n\t"
        "SBC	r8, r8, r8\n\t"
        "SUB	%[d1], r3, r8\n\t"
        : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    /* d1 was overwritten with the quotient by the final SUB. */
    return (word32)(size_t)d1;
}

#endif
/* Divide a by d and store the remainder in r. (m*d + r = a)
 * The multiplier m is never written; the argument is accepted and ignored.
 *
 * Branches taken here depend on the values being processed.
 *
 * a  Number to be divided: 256 words are read.
 * d  Number to divide with: 128 words.
 * m  Multiplier result. Unused.
 * r  Remainder from the division: 128 words are written.
 * returns MP_OKAY indicating success.
 */
static WC_INLINE int sp_4096_div_128_cond(const sp_digit* a, const sp_digit* d,
        sp_digit* m, sp_digit* r)
{
    sp_digit num[256];
    sp_digit prod[129];
    sp_digit top;
    sp_digit q;
    int i;
    int hi;

    (void)m;

    top = d[127];
    XMEMCPY(num, a, sizeof(*num) * 2 * 128);

    /* Locate the highest word where the upper half of the number and the
     * divisor differ, and subtract d once when the upper half is not
     * smaller. */
    i = 127;
    while ((i > 0) && (num[i + 128] == d[i])) {
        i--;
    }
    if (num[i + 128] >= d[i]) {
        sp_4096_sub_in_place_128(&num[128], d);
    }

    /* Produce one quotient word per pass, from the top down. */
    for (i = 127; i >= 0; i--) {
        hi = 128 + i;

        /* Estimate the quotient word; saturate when the top words match. */
        if (num[hi] != top) {
            q = div_4096_word_128(num[hi], num[hi - 1], top);
        }
        else {
            q = SP_DIGIT_MAX;
        }

        /* Take q * d away from the current window of the number. */
        sp_4096_mul_d_128(prod, d, q);
        num[hi] += sp_4096_sub_in_place_128(&num[i], prod);
        num[hi] -= prod[128];
        if (num[hi] == 0) {
            continue;
        }

        /* Top word is non-zero: add d back, at most twice. */
        num[hi] += sp_4096_add_128(&num[i], &num[i], d);
        if (num[hi] != 0) {
            num[hi] += sp_4096_add_128(&num[i], &num[i], d);
        }
    }

    /* Final compare of the low half against d decides between copying the
     * low half out and subtracting d one more time. */
    i = 127;
    while ((i > 0) && (num[i] == d[i])) {
        i--;
    }
    if (num[i] < d[i]) {
        XMEMCPY(r, num, sizeof(*num) * 128);
    }
    else {
        sp_4096_sub_128(r, num, d);
    }

    return MP_OKAY;
}

/* Compute r = a mod m by running the division and keeping only the
 * remainder.
 *
 * r  A single precision number that receives the reduced result.
 * a  A single precision number that is to be reduced.
 * m  A single precision number that is the modulus to reduce with.
 * returns the value returned by sp_4096_div_128_cond().
 */
static WC_INLINE int sp_4096_mod_128_cond(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    int ret;

    /* No multiplier output is wanted, so NULL is passed for it. */
    ret = sp_4096_div_128_cond(a, m, NULL, r);

    return ret;
}

#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
#if defined(WOLFSSL_HAVE_SP_DH) || !defined(WOLFSSL_RSA_PUBLIC_ONLY)
/* Mask every word of a with m and write the results to r.
 *
 * r  A single precision integer: receives 128 masked words.
 * a  A single precision integer: 128 words are read.
 * m  Mask to AND against each digit.
 */
static void sp_4096_mask_128(sp_digit* r, const sp_digit* a, sp_digit m)
{
#ifdef WOLFSSL_SP_SMALL
    int idx = 0;

    /* Compact form: one word per pass. */
    while (idx < 128) {
        r[idx] = a[idx] & m;
        idx++;
    }
#else
    sp_digit* out = r;
    const sp_digit* in = a;
    int blk;

    /* Unrolled form: 16 passes of eight words each. */
    for (blk = 0; blk < 16; blk++) {
        out[0] = in[0] & m;
        out[1] = in[1] & m;
        out[2] = in[2] & m;
        out[3] = in[3] & m;
        out[4] = in[4] & m;
        out[5] = in[5] & m;
        out[6] = in[6] & m;
        out[7] = in[7] & m;
        out += 8;
        in += 8;
    }
#endif
}

/* Compare a with b in constant time.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_int32 sp_4096_cmp_128(const sp_digit* a_p, const sp_digit* b_p)
#else
static sp_int32 sp_4096_cmp_128(const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register const sp_digit* a __asm__ ("r0") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r1") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        "MOV	r2, #0xffffffff\n\t"
        "MOV	r8, #0x1\n\t"
        "MOV	r7, #0x0\n\t"
        "MOV	r3, #0xffffffff\n\t"
#ifdef WOLFSSL_SP_SMALL
        "MOV	r6, #0x1fc\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_4096_cmp_128_words:\n\t"
#else
    "L_sp_4096_cmp_128_words_%=:\n\t"
#endif
        "LDR	r4, [%[a], r6]\n\t"
        "LDR	r5, [%[b], r6]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "SUBS	r6, r6, #0x4\n\t"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "bcs	L_sp_4096_cmp_128_words\n\t"
#else
        "bcs	L_sp_4096_cmp_128_words_%=\n\t"
#endif
        "EOR	r2, r2, r3\n\t"
#else
        "LDR	r4, [%[a], #508]\n\t"
        "LDR	r5, [%[b], #508]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #504]\n\t"
        "LDR	r5, [%[b], #504]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #500]\n\t"
        "LDR	r5, [%[b], #500]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #496]\n\t"
        "LDR	r5, [%[b], #496]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #492]\n\t"
        "LDR	r5, [%[b], #492]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #488]\n\t"
        "LDR	r5, [%[b], #488]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #484]\n\t"
        "LDR	r5, [%[b], #484]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #480]\n\t"
        "LDR	r5, [%[b], #480]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #476]\n\t"
        "LDR	r5, [%[b], #476]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #472]\n\t"
        "LDR	r5, [%[b], #472]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #468]\n\t"
        "LDR	r5, [%[b], #468]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #464]\n\t"
        "LDR	r5, [%[b], #464]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #460]\n\t"
        "LDR	r5, [%[b], #460]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #456]\n\t"
        "LDR	r5, [%[b], #456]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #452]\n\t"
        "LDR	r5, [%[b], #452]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #448]\n\t"
        "LDR	r5, [%[b], #448]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #444]\n\t"
        "LDR	r5, [%[b], #444]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #440]\n\t"
        "LDR	r5, [%[b], #440]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #436]\n\t"
        "LDR	r5, [%[b], #436]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #432]\n\t"
        "LDR	r5, [%[b], #432]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #428]\n\t"
        "LDR	r5, [%[b], #428]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #424]\n\t"
        "LDR	r5, [%[b], #424]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #420]\n\t"
        "LDR	r5, [%[b], #420]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #416]\n\t"
        "LDR	r5, [%[b], #416]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #412]\n\t"
        "LDR	r5, [%[b], #412]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #408]\n\t"
        "LDR	r5, [%[b], #408]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #404]\n\t"
        "LDR	r5, [%[b], #404]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #400]\n\t"
        "LDR	r5, [%[b], #400]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #396]\n\t"
        "LDR	r5, [%[b], #396]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #392]\n\t"
        "LDR	r5, [%[b], #392]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #388]\n\t"
        "LDR	r5, [%[b], #388]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #384]\n\t"
        "LDR	r5, [%[b], #384]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #380]\n\t"
        "LDR	r5, [%[b], #380]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #376]\n\t"
        "LDR	r5, [%[b], #376]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #372]\n\t"
        "LDR	r5, [%[b], #372]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #368]\n\t"
        "LDR	r5, [%[b], #368]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #364]\n\t"
        "LDR	r5, [%[b], #364]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #360]\n\t"
        "LDR	r5, [%[b], #360]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #356]\n\t"
        "LDR	r5, [%[b], #356]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #352]\n\t"
        "LDR	r5, [%[b], #352]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #348]\n\t"
        "LDR	r5, [%[b], #348]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #344]\n\t"
        "LDR	r5, [%[b], #344]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #340]\n\t"
        "LDR	r5, [%[b], #340]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #336]\n\t"
        "LDR	r5, [%[b], #336]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #332]\n\t"
        "LDR	r5, [%[b], #332]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #328]\n\t"
        "LDR	r5, [%[b], #328]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #324]\n\t"
        "LDR	r5, [%[b], #324]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #320]\n\t"
        "LDR	r5, [%[b], #320]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #316]\n\t"
        "LDR	r5, [%[b], #316]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #312]\n\t"
        "LDR	r5, [%[b], #312]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #308]\n\t"
        "LDR	r5, [%[b], #308]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #304]\n\t"
        "LDR	r5, [%[b], #304]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #300]\n\t"
        "LDR	r5, [%[b], #300]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #296]\n\t"
        "LDR	r5, [%[b], #296]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #292]\n\t"
        "LDR	r5, [%[b], #292]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #288]\n\t"
        "LDR	r5, [%[b], #288]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #284]\n\t"
        "LDR	r5, [%[b], #284]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #280]\n\t"
        "LDR	r5, [%[b], #280]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #276]\n\t"
        "LDR	r5, [%[b], #276]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #272]\n\t"
        "LDR	r5, [%[b], #272]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #268]\n\t"
        "LDR	r5, [%[b], #268]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #264]\n\t"
        "LDR	r5, [%[b], #264]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #260]\n\t"
        "LDR	r5, [%[b], #260]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #256]\n\t"
        "LDR	r5, [%[b], #256]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #252]\n\t"
        "LDR	r5, [%[b], #252]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #248]\n\t"
        "LDR	r5, [%[b], #248]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #244]\n\t"
        "LDR	r5, [%[b], #244]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #240]\n\t"
        "LDR	r5, [%[b], #240]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #236]\n\t"
        "LDR	r5, [%[b], #236]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #232]\n\t"
        "LDR	r5, [%[b], #232]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #228]\n\t"
        "LDR	r5, [%[b], #228]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #224]\n\t"
        "LDR	r5, [%[b], #224]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #220]\n\t"
        "LDR	r5, [%[b], #220]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #216]\n\t"
        "LDR	r5, [%[b], #216]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #212]\n\t"
        "LDR	r5, [%[b], #212]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #208]\n\t"
        "LDR	r5, [%[b], #208]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #204]\n\t"
        "LDR	r5, [%[b], #204]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #200]\n\t"
        "LDR	r5, [%[b], #200]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #196]\n\t"
        "LDR	r5, [%[b], #196]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #192]\n\t"
        "LDR	r5, [%[b], #192]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #188]\n\t"
        "LDR	r5, [%[b], #188]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #184]\n\t"
        "LDR	r5, [%[b], #184]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #180]\n\t"
        "LDR	r5, [%[b], #180]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #176]\n\t"
        "LDR	r5, [%[b], #176]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #172]\n\t"
        "LDR	r5, [%[b], #172]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #168]\n\t"
        "LDR	r5, [%[b], #168]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #164]\n\t"
        "LDR	r5, [%[b], #164]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #160]\n\t"
        "LDR	r5, [%[b], #160]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #156]\n\t"
        "LDR	r5, [%[b], #156]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #152]\n\t"
        "LDR	r5, [%[b], #152]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #148]\n\t"
        "LDR	r5, [%[b], #148]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #144]\n\t"
        "LDR	r5, [%[b], #144]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #140]\n\t"
        "LDR	r5, [%[b], #140]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #136]\n\t"
        "LDR	r5, [%[b], #136]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #132]\n\t"
        "LDR	r5, [%[b], #132]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #128]\n\t"
        "LDR	r5, [%[b], #128]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #124]\n\t"
        "LDR	r5, [%[b], #124]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #120]\n\t"
        "LDR	r5, [%[b], #120]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #116]\n\t"
        "LDR	r5, [%[b], #116]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #112]\n\t"
        "LDR	r5, [%[b], #112]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #108]\n\t"
        "LDR	r5, [%[b], #108]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #104]\n\t"
        "LDR	r5, [%[b], #104]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #100]\n\t"
        "LDR	r5, [%[b], #100]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #96]\n\t"
        "LDR	r5, [%[b], #96]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #92]\n\t"
        "LDR	r5, [%[b], #92]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #88]\n\t"
        "LDR	r5, [%[b], #88]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #84]\n\t"
        "LDR	r5, [%[b], #84]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #80]\n\t"
        "LDR	r5, [%[b], #80]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #76]\n\t"
        "LDR	r5, [%[b], #76]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #72]\n\t"
        "LDR	r5, [%[b], #72]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #68]\n\t"
        "LDR	r5, [%[b], #68]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #64]\n\t"
        "LDR	r5, [%[b], #64]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #60]\n\t"
        "LDR	r5, [%[b], #60]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #56]\n\t"
        "LDR	r5, [%[b], #56]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #52]\n\t"
        "LDR	r5, [%[b], #52]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #48]\n\t"
        "LDR	r5, [%[b], #48]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #44]\n\t"
        "LDR	r5, [%[b], #44]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #40]\n\t"
        "LDR	r5, [%[b], #40]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #36]\n\t"
        "LDR	r5, [%[b], #36]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #32]\n\t"
        "LDR	r5, [%[b], #32]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #28]\n\t"
        "LDR	r5, [%[b], #28]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #24]\n\t"
        "LDR	r5, [%[b], #24]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #20]\n\t"
        "LDR	r5, [%[b], #20]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #16]\n\t"
        "LDR	r5, [%[b], #16]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #12]\n\t"
        "LDR	r5, [%[b], #12]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #8]\n\t"
        "LDR	r5, [%[b], #8]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a], #4]\n\t"
        "LDR	r5, [%[b], #4]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "LDR	r4, [%[a]]\n\t"
        "LDR	r5, [%[b]]\n\t"
        "AND	r4, r4, r3\n\t"
        "AND	r5, r5, r3\n\t"
        "SUBS	r4, r4, r5\n\t"
        "IT	hi\n\t"
        "movhi	r2, r8\n\t"
        "IT	lo\n\t"
        "movlo	r2, r3\n\t"
        "IT	ne\n\t"
        "movne	r3, r7\n\t"
        "EOR	r2, r2, r3\n\t"
#endif /*WOLFSSL_SP_SMALL */
        "MOV	%[a], r2\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "cc"
    );
    return (word32)(size_t)a;
}

/* Divide a by d and put the remainder into r (m*d + r = a).
 * The quotient m is not calculated as it is not needed at this time; the
 * m argument is ignored.
 *
 * a  Number to be divided. 2 * 128 digits are read from it.
 * d  Number to divide with. d[127] is used as the divisor word for the
 *    per-digit quotient estimate.
 * m  Multiplier (quotient) result. Unused by this implementation.
 * r  Remainder from the division.
 * returns MP_OKAY indicating success.
 */
static WC_INLINE int sp_4096_div_128(const sp_digit* a, const sp_digit* d,
        sp_digit* m, sp_digit* r)
{
    /* t1: working copy of the dividend, t2: d * quotient-digit (129 digits). */
    sp_digit t1[256], t2[129];
    sp_digit div, r1;
    int i;

    (void)m;

    /* Top digit of the divisor drives the word-level quotient estimate. */
    div = d[127];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 128);
    /* When the top half of the dividend compares >= d, pass an all-ones mask
     * (otherwise zero) to the conditional subtract of d from the top half. */
    r1 = sp_4096_cmp_128(&t1[128], d) >= 0;
    sp_4096_cond_sub_128(&t1[128], &t1[128], d, (sp_digit)0 - r1);
    for (i = 127; i >= 0; i--) {
        /* mask is all ones when the current top digit equals div, else 0.
         * NOTE(review): volatile presumably stops the compiler from turning
         * the mask arithmetic into a branch - confirm. */
        volatile sp_digit mask = (sp_digit)0 - (t1[128 + i] == div);
        /* Adding the mask decrements the top digit by one in the equal case. */
        sp_digit hi = t1[128 + i] + mask;
        r1 = div_4096_word_128(hi, t1[128 + i - 1], div);
        /* In the equal case the quotient digit is forced to all ones. */
        r1 |= mask;

        /* Subtract d * r1 from the current 129-digit window of t1. */
        sp_4096_mul_d_128(t2, d, r1);
        t1[128 + i] += sp_4096_sub_in_place_128(&t1[i], t2);
        t1[128 + i] -= t2[128];
        /* Two correction passes: the top digit of the window is used as the
         * mask selecting d, and the selected value is added back.
         * NOTE(review): presumably the top digit is 0 or all ones here, so
         * d is added back only when the subtraction went negative - confirm
         * against sp_4096_mask_128 / sp_4096_sub_in_place_128. */
        sp_4096_mask_128(t2, d, t1[128 + i]);
        t1[128 + i] += sp_4096_add_128(&t1[i], &t1[i], t2);
        sp_4096_mask_128(t2, d, t1[128 + i]);
        t1[128 + i] += sp_4096_add_128(&t1[i], &t1[i], t2);
    }

    /* Final conditional subtract of d, writing the low 128 digits to r. */
    r1 = sp_4096_cmp_128(t1, d) >= 0;
    sp_4096_cond_sub_128(r, t1, d, (sp_digit)0 - r1);

    return MP_OKAY;
}

/* Compute r = a mod m.
 *
 * Thin wrapper over sp_4096_div_128() that asks only for the remainder; no
 * quotient buffer is supplied.
 *
 * r  Receives the reduced result.
 * a  Value to reduce (the division routine reads 2 * 128 digits from it).
 * m  Modulus to reduce with.
 * returns MP_OKAY indicating success.
 */
static WC_INLINE int sp_4096_mod_128(sp_digit* r, const sp_digit* a,
        const sp_digit* m)
{
    int ret;

    /* NULL: the quotient is not wanted. */
    ret = sp_4096_div_128(a, m, NULL, r);

    return ret;
}

#endif /* WOLFSSL_HAVE_SP_DH || !WOLFSSL_RSA_PUBLIC_ONLY */
#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \
                                                     defined(WOLFSSL_HAVE_SP_DH)
#ifdef WOLFSSL_SP_SMALL
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Uses a fixed 3-bit window: a table of 8 values (t[0]..t[7]) is built in
 * Montgomery form, then the exponent is consumed from the most significant
 * end, three bits at a time (three squarings and one multiply per window).
 *
 * r        A single precision number that is the result of the operation.
 *          Also used as working storage: r[128..255] is cleared before the
 *          final Montgomery reduction, so it must hold 2 * 128 digits.
 * a        A single precision number being exponentiated. When reduceA is
 *          non-zero 2 * 128 digits are read (a is reduced mod m first),
 *          otherwise 128 digits are read.
 * e        A single precision number that is the exponent, as 32-bit digits
 *          with the least significant digit first.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  Non-zero when a must be reduced modulo m before use.
 * returns  MP_OKAY (0) on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_4096_mod_exp_128(sp_digit* r, const sp_digit* a, const sp_digit* e,
        int bits, const sp_digit* m, int reduceA)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[8 * 256];
#endif
    sp_digit* t[8];
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (8 * 256), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* norm shares storage with t[0]: table entry 0 is the Montgomery
         * normalizer. */
        norm = td;
        for (i=0; i<8; i++) {
            t[i] = td + i * 256;
        }

        sp_4096_mont_setup(m, &mp);
        sp_4096_mont_norm_128(norm, m);

        /* t[1] = (a mod m) placed in the upper 128 digits over a zeroed
         * lower half, then reduced mod m - i.e. a * 2^4096 mod m. */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 128U);
        if (reduceA != 0) {
            err = sp_4096_mod_128(t[1] + 128, a, m);
            if (err == MP_OKAY) {
                err = sp_4096_mod_128(t[1], t[1], m);
            }
        }
        else {
            XMEMCPY(t[1] + 128, a, sizeof(sp_digit) * 128);
            err = sp_4096_mod_128(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Fill the remaining table entries: t[k] = t[1]^k. */
        sp_4096_mont_sqr_128(t[ 2], t[ 1], m, mp);
        sp_4096_mont_mul_128(t[ 3], t[ 2], t[ 1], m, mp);
        sp_4096_mont_sqr_128(t[ 4], t[ 2], m, mp);
        sp_4096_mont_mul_128(t[ 5], t[ 3], t[ 2], m, mp);
        sp_4096_mont_sqr_128(t[ 6], t[ 3], m, mp);
        sp_4096_mont_mul_128(t[ 7], t[ 4], t[ 3], m, mp);

        /* i: index of the top exponent word, n: current word (unused bits
         * kept at the top), c: number of unconsumed bits left in n. */
        i = (bits - 1) / 32;
        n = e[i--];
        c = bits & 31;
        if (c == 0) {
            c = 32;
        }
        /* The first window takes bits % 3 bits so the rest is a whole number
         * of 3-bit windows. */
        c -= bits % 3;
        if (c == 32) {
            c = 29;
        }
        if (c < 0) {
            /* Number of bits in top word is less than number needed. */
            c = -c;
            y = (byte)(n << c);
            n = e[i--];
            /* Digits are 32 bits wide: take the missing c bits from the top
             * of the next word. (Shifting by 64 - c would exceed the digit
             * width and leave a wrong remaining-bit count.) */
            y |= (byte)(n >> (32 - c));
            n <<= c;
            c = 32 - c;
        }
        else if (c == 0) {
            /* All bits in top word used. */
            y = (byte)n;
        }
        else {
            y = (byte)(n >> c);
            n <<= 32 - c;
        }
        XMEMCPY(r, t[y], sizeof(sp_digit) * 128);
        for (; i>=0 || c>=3; ) {
            if (c == 0) {
                /* Current word exhausted: window is top 3 bits of the next. */
                n = e[i--];
                y = (byte)(n >> 29);
                n <<= 3;
                c = 29;
            }
            else if (c < 3) {
                /* Window straddles two words. */
                y = (byte)(n >> 29);
                n = e[i--];
                c = 3 - c;
                y |= (byte)(n >> (32 - c));
                n <<= c;
                c = 32 - c;
            }
            else {
                y = (byte)((n >> 29) & 0x7);
                n <<= 3;
                c -= 3;
            }

            sp_4096_mont_sqr_128(r, r, m, mp);
            sp_4096_mont_sqr_128(r, r, m, mp);
            sp_4096_mont_sqr_128(r, r, m, mp);

            sp_4096_mont_mul_128(r, r, t[y], m, mp);
        }

        /* Clear the upper half and reduce to leave Montgomery form. */
        XMEMSET(&r[128], 0, sizeof(sp_digit) * 128U);
        sp_4096_mont_reduce_128(r, m, mp);

        /* Final conditional subtract when r compares >= m. */
        mask = (sp_digit)0 - (sp_4096_cmp_128(r, m) >= 0);
        sp_4096_cond_sub_128(r, r, m, mask);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#else
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Uses a fixed 4-bit window: the powers a^1..a^15 are precomputed in
 * Montgomery form and the exponent is consumed four bits at a time from the
 * most significant end.
 *
 * r        A single precision number that is the result of the operation.
 *          Must have room for 256 digits; the upper 128 are used as scratch.
 * a        A single precision number being exponentiated (128 digits).
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When non-zero, a is reduced modulo m before use. When zero, a is
 *          copied as is (presumably the caller guarantees a < m - confirm).
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when the number of exponent bits is 0.
 * returns  any error reported by sp_4096_mod_128().
 */
static int sp_4096_mod_exp_128(sp_digit* r, const sp_digit* a, const sp_digit* e,
        int bits, const sp_digit* m, int reduceA)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[16 * 256];
#endif
    sp_digit* t[16];
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (16 * 256), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Table of 16 entries, 256 digits each. norm aliases t[0]. */
        norm = td;
        for (i=0; i<16; i++) {
            t[i] = td + i * 256;
        }

        sp_4096_mont_setup(m, &mp);
        sp_4096_mont_norm_128(norm, m);

        /* t[1] = (a * 2^4096) mod m: place a in the upper 128 digits above
         * 128 zero digits and reduce the 256 digit value. */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 128U);
        if (reduceA != 0) {
            err = sp_4096_mod_128(t[1] + 128, a, m);
            if (err == MP_OKAY) {
                err = sp_4096_mod_128(t[1], t[1], m);
            }
        }
        else {
            XMEMCPY(t[1] + 128, a, sizeof(sp_digit) * 128);
            err = sp_4096_mod_128(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Fill in t[2]..t[15]: even entries by squaring t[k], odd entries by
         * multiplying t[k+1] by t[k]. */
        sp_4096_mont_sqr_128(t[ 2], t[ 1], m, mp);
        sp_4096_mont_mul_128(t[ 3], t[ 2], t[ 1], m, mp);
        sp_4096_mont_sqr_128(t[ 4], t[ 2], m, mp);
        sp_4096_mont_mul_128(t[ 5], t[ 3], t[ 2], m, mp);
        sp_4096_mont_sqr_128(t[ 6], t[ 3], m, mp);
        sp_4096_mont_mul_128(t[ 7], t[ 4], t[ 3], m, mp);
        sp_4096_mont_sqr_128(t[ 8], t[ 4], m, mp);
        sp_4096_mont_mul_128(t[ 9], t[ 5], t[ 4], m, mp);
        sp_4096_mont_sqr_128(t[10], t[ 5], m, mp);
        sp_4096_mont_mul_128(t[11], t[ 6], t[ 5], m, mp);
        sp_4096_mont_sqr_128(t[12], t[ 6], m, mp);
        sp_4096_mont_mul_128(t[13], t[ 7], t[ 6], m, mp);
        sp_4096_mont_sqr_128(t[14], t[ 7], m, mp);
        sp_4096_mont_mul_128(t[15], t[ 8], t[ 7], m, mp);

        /* Extract the first (possibly short) window from the top word.
         * c tracks how many unconsumed bits remain in n. */
        i = (bits - 1) / 32;
        n = e[i--];
        c = bits & 31;
        if (c == 0) {
            c = 32;
        }
        c -= bits % 4;
        if (c == 32) {
            c = 28;
        }
        if (c < 0) {
            /* Number of bits in top word is less than number needed.
             * With 32-bit words and a 4-bit window c is never negative here,
             * so this path is defensive. Words are 32 bits wide, so the
             * shifts must be relative to 32 to stay within the word size. */
            c = -c;
            y = (byte)(n << c);
            n = e[i--];
            y |= (byte)(n >> (32 - c));
            n <<= c;
            c = 32 - c;
        }
        else if (c == 0) {
            /* All bits in top word used. */
            y = (byte)n;
        }
        else {
            y = (byte)(n >> c);
            n <<= 32 - c;
        }
        XMEMCPY(r, t[y], sizeof(sp_digit) * 128);
        for (; i>=0 || c>=4; ) {
            if (c == 0) {
                /* Current word exhausted: load the next one. */
                n = e[i--];
                y = (byte)(n >> 28);
                n <<= 4;
                c = 28;
            }
            else if (c < 4) {
                /* Window straddles two exponent words. */
                y = (byte)(n >> 28);
                n = e[i--];
                c = 4 - c;
                y |= (byte)(n >> (32 - c));
                n <<= c;
                c = 32 - c;
            }
            else {
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c -= 4;
            }

            /* r = r^16 * a^y */
            sp_4096_mont_sqr_128(r, r, m, mp);
            sp_4096_mont_sqr_128(r, r, m, mp);
            sp_4096_mont_sqr_128(r, r, m, mp);
            sp_4096_mont_sqr_128(r, r, m, mp);

            sp_4096_mont_mul_128(r, r, t[y], m, mp);
        }

        /* Convert out of Montgomery form. */
        XMEMSET(&r[128], 0, sizeof(sp_digit) * 128U);
        sp_4096_mont_reduce_128(r, m, mp);

        /* Final subtraction of m when r >= m, selected by mask. */
        mask = (sp_digit)0 - (sp_4096_cmp_128(r, m) >= 0);
        sp_4096_cond_sub_128(r, r, m, mask);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#endif /* WOLFSSL_SP_SMALL */
#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */

#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */
#ifdef WOLFSSL_HAVE_SP_RSA
/* RSA public key operation: out = in ^ em mod mm for a 4096-bit modulus.
 *
 * Exponents 0x10001 and 0x3 have dedicated paths; any other exponent of up
 * to 32 bits is handled with left-to-right square-and-multiply.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base.
 * em      Public exponent.
 * mm      Modulus.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 512 bytes long.
 * outLen  Number of bytes in result.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long, MP_VAL when the modulus is even, MP_EXPTMOD_E when
 * the exponent is zero and MEMORY_E when dynamic memory allocation fails.
 */
int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em,
    const mp_int* mm, byte* out, word32* outLen)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* a = NULL;
#else
    sp_digit a[128 * 5];
#endif
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    sp_digit* ah = NULL;
    sp_digit e[1] = {0};
    int err = MP_OKAY;

    /* Check sizes before any buffer is touched. */
    if (*outLen < 512) {
        err = MP_TO_E;
    }
    else if (mp_count_bits(em) > 32) {
        err = MP_READ_E;
    }
    else if (inLen > 512) {
        err = MP_READ_E;
    }
    else if (mp_count_bits(mm) != 4096) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mm)) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 128 * 5, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (a == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Buffer layout (in digits):
         *   a: 0..127, ah: 128..255, r: 256..511, m: 512..639
         */
        ah = a + 128;
        r = ah + 128;
        m = r + 256;

        sp_4096_from_bin(ah, 128, in, inLen);

        /* Gather up to 32 bits of exponent into a single word. */
        e[0] = em->dp[0];
#if DIGIT_BIT < 32
        if (em->used > 1) {
            e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT;
        }
#endif
        if (e[0] == 0) {
            err = MP_EXPTMOD_E;
        }
    }

    if (err == MP_OKAY) {
        sp_4096_from_mp(m, 128, mm);

        if (e[0] == 0x10001) {
            int i;
            sp_digit mp;

            sp_4096_mont_setup(m, &mp);

            /* Zero the low half so that a:ah is the input shifted up by
             * 4096 bits; reducing it gives the Montgomery form in r. */
            XMEMSET(a, 0, sizeof(sp_digit) * 128);
            err = sp_4096_mod_128_cond(r, a, m);

            if (err == MP_OKAY) {
                /* Sixteen squarings raise r to the power 0x10000. */
                for (i = 0; i < 16; i++) {
                    sp_4096_mont_sqr_128(r, r, m, mp);
                }
                /* Multiplying by the plain input supplies the final factor
                 * and takes the result out of Montgomery form. */
                sp_4096_mont_mul_128(r, r, ah, m, mp);

                /* Find the most significant differing word, then subtract
                 * the modulus once if r is not below it. */
                i = 127;
                while ((i > 0) && (r[i] == m[i])) {
                    i--;
                }
                if (r[i] >= m[i]) {
                    sp_4096_sub_in_place_128(r, m);
                }
            }
        }
        else if (e[0] == 0x3) {
            /* r = in^2 mod m, then r = in * r mod m. */
            sp_4096_sqr_128(r, ah);
            err = sp_4096_mod_128_cond(r, r, m);
            if (err == MP_OKAY) {
                sp_4096_mul_128(r, ah, r);
                err = sp_4096_mod_128_cond(r, r, m);
            }
        }
        else {
            int i;
            sp_digit mp;

            sp_4096_mont_setup(m, &mp);

            /* Montgomery form of the input, kept in a for the multiplies. */
            XMEMSET(a, 0, sizeof(sp_digit) * 128);
            err = sp_4096_mod_128_cond(a, a, m);

            if (err == MP_OKAY) {
                /* Locate the top set bit of the exponent. */
                i = 31;
                while ((i >= 0) && ((e[0] >> i) == 0)) {
                    i--;
                }

                /* Top bit is covered by starting with r = a. */
                XMEMCPY(r, a, sizeof(sp_digit) * 128);
                for (i--; i >= 0; i--) {
                    sp_4096_mont_sqr_128(r, r, m, mp);
                    if (((e[0] >> i) & 1) == 1) {
                        sp_4096_mont_mul_128(r, r, a, m, mp);
                    }
                }

                /* Leave Montgomery form. */
                XMEMSET(&r[128], 0, sizeof(sp_digit) * 128);
                sp_4096_mont_reduce_128(r, m, mp);

                i = 127;
                while ((i > 0) && (r[i] == m[i])) {
                    i--;
                }
                if (r[i] >= m[i]) {
                    sp_4096_sub_in_place_128(r, m);
                }
            }
        }
    }

    if (err == MP_OKAY) {
        sp_4096_to_bin_128(r, out);
        *outLen = 512;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    XFREE(a, NULL, DYNAMIC_TYPE_RSA);
#endif

    return err;
}

#ifndef WOLFSSL_RSA_PUBLIC_ONLY
#ifdef WOLFSSL_SP_SMALL
/* Conditionally add a and b using the mask m.
 * m is -1 to add and 0 when not.
 *
 * Looped (small code size) implementation: each word of b is ANDed with the
 * mask before being added, so the same instructions run whether or not the
 * addition takes effect. 0x100 bytes (64 32-bit words) are processed.
 *
 * r  A single precision number representing conditional add result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply.
 * returns the value left in r5 after the last word: 0 + 0 + carry flag.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_4096_cond_add_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
#else
static sp_digit sp_4096_cond_add_64(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to the registers named in the declarations. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r5 = saved carry, r8 = constant zero, r4 = byte offset */
        "MOV	r5, #0x0\n\t"
        "MOV	r8, #0x0\n\t"
        "MOV	r4, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_4096_cond_add_64_words:\n\t"
#else
    "L_sp_4096_cond_add_64_words_%=:\n\t"
#endif
        /* Adding 0xffffffff to r5 sets the carry flag exactly when r5 is
         * non-zero, restoring the carry saved by the previous iteration. */
        "ADDS	r5, r5, #0xffffffff\n\t"
        "LDR	r6, [%[a], r4]\n\t"
        "LDR	r7, [%[b], r4]\n\t"
        /* Masked operand: b word when m is all ones, 0 when m is 0 */
        "AND	r7, r7, %[m]\n\t"
        "ADCS	r6, r6, r7\n\t"
        /* Save carry out as 0 or 1: r5 = 0 + 0 + carry */
        "ADC	r5, r8, r8\n\t"
        "STR	r6, [%[r], r4]\n\t"
        "ADD	r4, r4, #0x4\n\t"
        /* Loop while the offset is below 0x100 bytes */
        "CMP	r4, #0x100\n\t"
#if defined(__GNUC__)
        "BLT	L_sp_4096_cond_add_64_words_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_4096_cond_add_64_words\n\t"
#else
        "BLT.N	L_sp_4096_cond_add_64_words_%=\n\t"
#endif
        /* Return the final carry through the register holding r */
        "MOV	%[r], r5\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        : "memory", "r4", "r5", "r6", "r7", "r8", "cc"
    );
    /* r no longer holds a pointer here; it carries the carry value. */
    return (word32)(size_t)r;
}

#else
/* Conditionally add a and b using the mask m.
 * m is -1 to add and 0 when not.
 *
 * Fully unrolled implementation: 32 groups, each loading two words from a
 * and from b, masking the b words with m, adding with carry and storing two
 * result words (64 words in total). The first group starts the chain with
 * ADDS/ADCS; every later group uses ADCS/ADCS to continue it.
 *
 * r  A single precision number representing conditional add result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply.
 * returns the value produced by the final ADC: 0 + 0 + carry flag.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static sp_digit sp_4096_cond_add_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p)
#else
static sp_digit sp_4096_cond_add_64(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Pin the arguments to the registers named in the declarations. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
    register sp_digit m __asm__ ("r3") = (sp_digit)m_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r10 = constant zero, used to extract the carry at the end */
        "MOV	r10, #0x0\n\t"
        /* Words 0-1: start of the carry chain (ADDS, then ADCS).
         * LDM/STM with '!' advance the a, b and r pointers. */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADDS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Words 2-63: same pattern, continuing the carry with ADCS */
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        "LDM	%[a]!, {r6, r7}\n\t"
        "LDM	%[b]!, {r8, r9}\n\t"
        "AND	r8, r8, %[m]\n\t"
        "AND	r9, r9, %[m]\n\t"
        "ADCS	r6, r6, r8\n\t"
        "ADCS	r7, r7, r9\n\t"
        "STM	%[r]!, {r6, r7}\n\t"
        /* Return value: 0 + 0 + carry out of the top word */
        "ADC	%[r], r10, r10\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
    );
    /* r no longer holds a pointer here; it carries the carry value. */
    return (word32)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
/* RSA private key operation.
 *
 * When SP_RSA_PRIVATE_EXP_D or RSA_LOW_MEM is defined the result is computed
 * directly as in ^ dm mod mm. Otherwise the CRT parameters are used: two
 * 2048-bit exponentiations (mod p and mod q) are recombined using qim.
 *
 * All working buffers that held key material or intermediate values are
 * zeroized before the function returns.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base.
 * dm      Private exponent.
 * pm      First prime.
 * qm      Second prime.
 * dpm     First prime's CRT exponent.
 * dqm     Second prime's CRT exponent.
 * qim     Inverse of second prime mod p.
 * mm      Modulus.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 512 bytes long.
 * outLen  Number of bytes in result.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long, MP_VAL when the modulus (or, in CRT mode, a prime) is
 * even and MEMORY_E when dynamic memory allocation fails.
 */
int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm,
    const mp_int* pm, const mp_int* qm, const mp_int* dpm, const mp_int* dqm,
    const mp_int* qim, const mp_int* mm, byte* out, word32* outLen)
{
#if defined(SP_RSA_PRIVATE_EXP_D) || defined(RSA_LOW_MEM)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* d = NULL;
#else
    sp_digit  d[128 * 4];
#endif
    sp_digit* a = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;

    (void)pm;
    (void)qm;
    (void)dpm;
    (void)dqm;
    (void)qim;

    if (*outLen < 512U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (mp_count_bits(dm) > 4096) {
            err = MP_READ_E;
        }
        else if (inLen > 512) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 4096) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 128 * 4, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (d == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Buffer layout (in digits): d: 0..127, a/r: 128..383, m: 384..511.
         * The result is written over the input. */
        a = d + 128;
        m = a + 256;
        r = a;

        sp_4096_from_bin(a, 128, in, inLen);
        sp_4096_from_mp(d, 128, dm);
        sp_4096_from_mp(m, 128, mm);
        err = sp_4096_mod_exp_128(r, a, d, 4096, m, 0);
    }

    if (err == MP_OKAY) {
        sp_4096_to_bin_128(r, out);
        *outLen = 512;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (d != NULL)
#endif
    {
        /* The buffer holds a copy of the private exponent as well as the
         * input/result and exponentiation scratch, so wipe all of it rather
         * than only the first 128 digits of "a". */
        ForceZero(d, sizeof(sp_digit) * 128 * 4);
#ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(d, NULL, DYNAMIC_TYPE_RSA);
#endif
    }
#else
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* a = NULL;
#else
    sp_digit a[64 * 11];
#endif
    sp_digit* p = NULL;
    sp_digit* q = NULL;
    sp_digit* dp = NULL;
    sp_digit* tmpa = NULL;
    sp_digit* tmpb = NULL;
    sp_digit* r = NULL;
    sp_digit* qi = NULL;
    sp_digit* dq = NULL;
    sp_digit c;
    int err = MP_OKAY;

    (void)dm;
    (void)mm;

    if (*outLen < 512) {
        err = MP_TO_E;
    }
    else if (inLen > 512 || mp_count_bits(mm) != 4096) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mm)) {
        err = MP_VAL;
    }
    else if (mp_iseven(pm)) {
        err = MP_VAL;
    }
    else if (mp_iseven(qm)) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 64 * 11, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (a == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Buffer layout (in digits): a/r: 0..255, p: 256..319, q: 320..383,
         * dp/dq/qi (shared, used one after another): 384..447,
         * tmpa: 448..575, tmpb: 576..703. */
        p = a + 128 * 2;
        q = p + 64;
        qi = dq = dp = q + 64;
        tmpa = qi + 64;
        tmpb = tmpa + 128;
        r = a;

        sp_4096_from_bin(a, 128, in, inLen);
        sp_4096_from_mp(p, 64, pm);
        sp_4096_from_mp(q, 64, qm);
        sp_4096_from_mp(dp, 64, dpm);

        /* tmpa = in ^ dp mod p */
        err = sp_2048_mod_exp_64(tmpa, a, dp, 2048, p, 1);
    }
    if (err == MP_OKAY) {
        /* tmpb = in ^ dq mod q */
        sp_4096_from_mp(dq, 64, dqm);
        err = sp_2048_mod_exp_64(tmpb, a, dq, 2048, q, 1);
    }

    if (err == MP_OKAY) {
        /* tmpa = tmpa - tmpb, adding p back (up to twice) using the value
         * returned by the subtraction as the mask. The first conditional add
         * returns its carry, which is added to c before the second add.
         * NOTE(review): relies on sp_2048_sub_in_place_64 returning 0 or an
         * all-ones borrow mask - confirm against its definition. */
        c = sp_2048_sub_in_place_64(tmpa, tmpb);
        c += sp_4096_cond_add_64(tmpa, tmpa, p, c);
        sp_4096_cond_add_64(tmpa, tmpa, p, c);

        /* tmpa = (tmpa * qi) mod p */
        sp_2048_from_mp(qi, 64, qim);
        sp_2048_mul_64(tmpa, tmpa, qi);
        err = sp_2048_mod_64(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        /* r = tmpb + q * tmpa, with tmpb zero-extended to 128 digits */
        sp_2048_mul_64(tmpa, q, tmpa);
        XMEMSET(&tmpb[64], 0, sizeof(sp_digit) * 64);
        sp_4096_add_128(r, tmpb, tmpa);

        sp_4096_to_bin_128(r, out);
        *outLen = 512;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (a != NULL)
#endif
    {
        /* Wipe primes, CRT exponents and all intermediates. */
        ForceZero(a, sizeof(sp_digit) * 64 * 11);
    #ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(a, NULL, DYNAMIC_TYPE_RSA);
    #endif
    }
#endif /* SP_RSA_PRIVATE_EXP_D || RSA_LOW_MEM */
    return err;
}
#endif /* WOLFSSL_RSA_PUBLIC_ONLY */
#endif /* WOLFSSL_HAVE_SP_RSA */
#if defined(WOLFSSL_HAVE_SP_DH) || (defined(WOLFSSL_HAVE_SP_RSA) && \
                                              !defined(WOLFSSL_RSA_PUBLIC_ONLY))
/* Export a 128-word single precision value into a multi-precision integer.
 *
 * The 32-bit words of a are repacked into DIGIT_BIT sized digits of r; the
 * three build-time cases are DIGIT_BIT equal to, smaller than and larger
 * than 32.
 *
 * a  A single precision integer.
 * r  A multi-precision integer. Grown to hold 4096 bits before being filled.
 * returns MP_OKAY on success, otherwise the error returned by mp_grow().
 */
static int sp_4096_to_mp(const sp_digit* a, mp_int* r)
{
    int err = mp_grow(r, (4096 + DIGIT_BIT - 1) / DIGIT_BIT);

    if (err == MP_OKAY) { /*lint !e774 case where err is always MP_OKAY*/
#if DIGIT_BIT == 32
        /* Same digit size: straight copy. */
        XMEMCPY(r->dp, a, sizeof(sp_digit) * 128);
        r->used = 128;
        mp_clamp(r);
#elif DIGIT_BIT < 32
        /* Each source word is spread over several destination digits. */
        int src;
        int dst = 0;
        int shift = 0;

        r->dp[0] = 0;
        for (src = 0; src < 128; src++) {
            /* Top up the partially filled digit and trim it to size. */
            r->dp[dst] |= (mp_digit)(a[src] << shift);
            r->dp[dst] &= ((sp_digit)1 << DIGIT_BIT) - 1;
            shift = DIGIT_BIT - shift;
            dst++;
            r->dp[dst] = (mp_digit)(a[src] >> shift);
            /* Emit every further digit that fits in this word. */
            while (shift + DIGIT_BIT <= 32) {
                shift += DIGIT_BIT;
                r->dp[dst] &= ((sp_digit)1 << DIGIT_BIT) - 1;
                dst++;
                if (shift == SP_WORD_SIZE) {
                    r->dp[dst] = 0;
                }
                else {
                    r->dp[dst] = (mp_digit)(a[src] >> shift);
                }
            }
            shift = 32 - shift;
        }
        r->used = (4096 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#else
        /* Several source words are packed into each destination digit. */
        int src;
        int dst = 0;
        int shift = 0;

        r->dp[0] = 0;
        for (src = 0; src < 128; src++) {
            r->dp[dst] |= ((mp_digit)a[src]) << shift;
            if (shift + 32 < DIGIT_BIT) {
                /* Word fitted entirely; digit still has room. */
                shift += 32;
            }
            else {
                /* Digit is full: trim it and spill the rest of the word. */
    #if DIGIT_BIT != 32 && DIGIT_BIT != 64
                r->dp[dst] &= ((sp_digit)1 << DIGIT_BIT) - 1;
    #endif
                shift = DIGIT_BIT - shift;
                dst++;
                r->dp[dst] = a[src] >> shift;
                shift = 32 - shift;
            }
        }
        r->used = (4096 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#endif
    }

    return err;
}

/* Perform the modular exponentiation for Diffie-Hellman.
 *
 * The local copy of the exponent is zeroized with ForceZero() before
 * returning so the wipe cannot be optimized away as a dead store.
 *
 * base  Base. MP integer.
 * exp   Exponent. MP integer.
 * mod   Modulus. MP integer.
 * res   Result. MP integer.
 * returns 0 on success, MP_READ_E if there are too many bytes in an array
 * (or the modulus is not exactly 4096 bits), MP_VAL if the modulus is even
 * or the exponent is zero, and MEMORY_E if memory allocation fails.
 */
int sp_ModExp_4096(const mp_int* base, const mp_int* exp, const mp_int* mod,
    mp_int* res)
{
    int err = MP_OKAY;
    sp_digit b[256];
    sp_digit e[128];
    sp_digit m[128];
    sp_digit* r = b;
    int expBits = mp_count_bits(exp);

    if (mp_count_bits(base) > 4096) {
        err = MP_READ_E;
    }
    else if (expBits > 4096) {
        err = MP_READ_E;
    }
    else if (mp_count_bits(mod) != 4096) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mod)) {
        err = MP_VAL;
    }

    if (err == MP_OKAY) {
        sp_4096_from_mp(b, 128, base);
        sp_4096_from_mp(e, 128, exp);
        sp_4096_from_mp(m, 128, mod);

        /* The result overwrites the base buffer (r aliases b). */
        err = sp_4096_mod_exp_128(r, b, e, expBits, m, 0);
    }

    if (err == MP_OKAY) {
        err = sp_4096_to_mp(r, res);
    }

    /* The exponent is the secret; a plain memset of a local that is never
     * read again may be removed by the compiler. */
    ForceZero(e, sizeof(e));

    return err;
}

#ifdef WOLFSSL_HAVE_SP_DH

#ifdef HAVE_FFDHE_4096
/* Shift a 128 digit number left by n bits. (r = a << n)
 *
 * The code is fully unrolled and works from the most significant digit down
 * to the least significant. Each source digit is loaded before the
 * destination digit at the same or a lower index is stored, and callers in
 * this file pass the same buffer for r and a.
 *
 * The bits shifted out of a digit are computed as (digit >> 1) >> (31 - n),
 * so that a shift count of 0 carries nothing into the next digit.
 * NOTE(review): presumably n is expected to be in the range 0..31 (callers in
 * this file pass window values built from 5 exponent bits) - confirm.
 *
 * r  Result of the shift. 129 digits are written: r[0] to r[128], with r[128]
 *    receiving the bits shifted out of a[127].
 * a  Number to shift. 128 digits are read.
 * n  Number of bits to shift by.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_4096_lshift_128(sp_digit* r_p, const sp_digit* a_p, byte n_p)
#else
static void sp_4096_lshift_128(sp_digit* r, const sp_digit* a, byte n)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to the registers named in the declarations. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register byte n __asm__ ("r2") = (byte)n_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* r7 = 31 - n: second stage shift count for the carried-out bits. */
        "RSB	r7, %[n], #0x1f\n\t"
        /* Top digit a[127]: its carried-out bits are stored as r[128]. */
        "LDR	r5, [%[a], #508]\n\t"
        "LSR	r6, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r6, r6, r7\n\t"
        /* From here each group of six instructions: loads the next lower
         * digit, stores the digit completed two steps earlier, shifts the
         * loaded digit left by n and ORs its carried-out bits (held in r3)
         * into the digit above. r4, r5 and r6 rotate as the digit registers.
         */
        "LDR	r4, [%[a], #504]\n\t"
        "STR	r6, [%[r], #512]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #500]\n\t"
        "STR	r5, [%[r], #508]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #496]\n\t"
        "STR	r4, [%[r], #504]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #492]\n\t"
        "STR	r6, [%[r], #500]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #488]\n\t"
        "STR	r5, [%[r], #496]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #484]\n\t"
        "STR	r4, [%[r], #492]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #480]\n\t"
        "STR	r6, [%[r], #488]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #476]\n\t"
        "STR	r5, [%[r], #484]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #472]\n\t"
        "STR	r4, [%[r], #480]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #468]\n\t"
        "STR	r6, [%[r], #476]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #464]\n\t"
        "STR	r5, [%[r], #472]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #460]\n\t"
        "STR	r4, [%[r], #468]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #456]\n\t"
        "STR	r6, [%[r], #464]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #452]\n\t"
        "STR	r5, [%[r], #460]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #448]\n\t"
        "STR	r4, [%[r], #456]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #444]\n\t"
        "STR	r6, [%[r], #452]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #440]\n\t"
        "STR	r5, [%[r], #448]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #436]\n\t"
        "STR	r4, [%[r], #444]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #432]\n\t"
        "STR	r6, [%[r], #440]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #428]\n\t"
        "STR	r5, [%[r], #436]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #424]\n\t"
        "STR	r4, [%[r], #432]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #420]\n\t"
        "STR	r6, [%[r], #428]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #416]\n\t"
        "STR	r5, [%[r], #424]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #412]\n\t"
        "STR	r4, [%[r], #420]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #408]\n\t"
        "STR	r6, [%[r], #416]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #404]\n\t"
        "STR	r5, [%[r], #412]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #400]\n\t"
        "STR	r4, [%[r], #408]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #396]\n\t"
        "STR	r6, [%[r], #404]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #392]\n\t"
        "STR	r5, [%[r], #400]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #388]\n\t"
        "STR	r4, [%[r], #396]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #384]\n\t"
        "STR	r6, [%[r], #392]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #380]\n\t"
        "STR	r5, [%[r], #388]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #376]\n\t"
        "STR	r4, [%[r], #384]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #372]\n\t"
        "STR	r6, [%[r], #380]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #368]\n\t"
        "STR	r5, [%[r], #376]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #364]\n\t"
        "STR	r4, [%[r], #372]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #360]\n\t"
        "STR	r6, [%[r], #368]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #356]\n\t"
        "STR	r5, [%[r], #364]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #352]\n\t"
        "STR	r4, [%[r], #360]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #348]\n\t"
        "STR	r6, [%[r], #356]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #344]\n\t"
        "STR	r5, [%[r], #352]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #340]\n\t"
        "STR	r4, [%[r], #348]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #336]\n\t"
        "STR	r6, [%[r], #344]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #332]\n\t"
        "STR	r5, [%[r], #340]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #328]\n\t"
        "STR	r4, [%[r], #336]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #324]\n\t"
        "STR	r6, [%[r], #332]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #320]\n\t"
        "STR	r5, [%[r], #328]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #316]\n\t"
        "STR	r4, [%[r], #324]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #312]\n\t"
        "STR	r6, [%[r], #320]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #308]\n\t"
        "STR	r5, [%[r], #316]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #304]\n\t"
        "STR	r4, [%[r], #312]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #300]\n\t"
        "STR	r6, [%[r], #308]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #296]\n\t"
        "STR	r5, [%[r], #304]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #292]\n\t"
        "STR	r4, [%[r], #300]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #288]\n\t"
        "STR	r6, [%[r], #296]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #284]\n\t"
        "STR	r5, [%[r], #292]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #280]\n\t"
        "STR	r4, [%[r], #288]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #276]\n\t"
        "STR	r6, [%[r], #284]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #272]\n\t"
        "STR	r5, [%[r], #280]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #268]\n\t"
        "STR	r4, [%[r], #276]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #264]\n\t"
        "STR	r6, [%[r], #272]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #260]\n\t"
        "STR	r5, [%[r], #268]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #256]\n\t"
        "STR	r4, [%[r], #264]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #252]\n\t"
        "STR	r6, [%[r], #260]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #248]\n\t"
        "STR	r5, [%[r], #256]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #244]\n\t"
        "STR	r4, [%[r], #252]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #240]\n\t"
        "STR	r6, [%[r], #248]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #236]\n\t"
        "STR	r5, [%[r], #244]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #232]\n\t"
        "STR	r4, [%[r], #240]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #228]\n\t"
        "STR	r6, [%[r], #236]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #224]\n\t"
        "STR	r5, [%[r], #232]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #220]\n\t"
        "STR	r4, [%[r], #228]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #216]\n\t"
        "STR	r6, [%[r], #224]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #212]\n\t"
        "STR	r5, [%[r], #220]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #208]\n\t"
        "STR	r4, [%[r], #216]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #204]\n\t"
        "STR	r6, [%[r], #212]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #200]\n\t"
        "STR	r5, [%[r], #208]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #196]\n\t"
        "STR	r4, [%[r], #204]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #192]\n\t"
        "STR	r6, [%[r], #200]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #188]\n\t"
        "STR	r5, [%[r], #196]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #184]\n\t"
        "STR	r4, [%[r], #192]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #180]\n\t"
        "STR	r6, [%[r], #188]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #176]\n\t"
        "STR	r5, [%[r], #184]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #172]\n\t"
        "STR	r4, [%[r], #180]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #168]\n\t"
        "STR	r6, [%[r], #176]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #164]\n\t"
        "STR	r5, [%[r], #172]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #160]\n\t"
        "STR	r4, [%[r], #168]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #156]\n\t"
        "STR	r6, [%[r], #164]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #152]\n\t"
        "STR	r5, [%[r], #160]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #148]\n\t"
        "STR	r4, [%[r], #156]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #144]\n\t"
        "STR	r6, [%[r], #152]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #140]\n\t"
        "STR	r5, [%[r], #148]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #136]\n\t"
        "STR	r4, [%[r], #144]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #132]\n\t"
        "STR	r6, [%[r], #140]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #128]\n\t"
        "STR	r5, [%[r], #136]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #124]\n\t"
        "STR	r4, [%[r], #132]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #120]\n\t"
        "STR	r6, [%[r], #128]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #116]\n\t"
        "STR	r5, [%[r], #124]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #112]\n\t"
        "STR	r4, [%[r], #120]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #108]\n\t"
        "STR	r6, [%[r], #116]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #104]\n\t"
        "STR	r5, [%[r], #112]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #100]\n\t"
        "STR	r4, [%[r], #108]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #96]\n\t"
        "STR	r6, [%[r], #104]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #92]\n\t"
        "STR	r5, [%[r], #100]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #88]\n\t"
        "STR	r4, [%[r], #96]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #84]\n\t"
        "STR	r6, [%[r], #92]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #80]\n\t"
        "STR	r5, [%[r], #88]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #76]\n\t"
        "STR	r4, [%[r], #84]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #72]\n\t"
        "STR	r6, [%[r], #80]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #68]\n\t"
        "STR	r5, [%[r], #76]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #64]\n\t"
        "STR	r4, [%[r], #72]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #60]\n\t"
        "STR	r6, [%[r], #68]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #56]\n\t"
        "STR	r5, [%[r], #64]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #52]\n\t"
        "STR	r4, [%[r], #60]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #48]\n\t"
        "STR	r6, [%[r], #56]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #44]\n\t"
        "STR	r5, [%[r], #52]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #40]\n\t"
        "STR	r4, [%[r], #48]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #36]\n\t"
        "STR	r6, [%[r], #44]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #32]\n\t"
        "STR	r5, [%[r], #40]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #28]\n\t"
        "STR	r4, [%[r], #36]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #24]\n\t"
        "STR	r6, [%[r], #32]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #20]\n\t"
        "STR	r5, [%[r], #28]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #16]\n\t"
        "STR	r4, [%[r], #24]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a], #12]\n\t"
        "STR	r6, [%[r], #20]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        "LDR	r6, [%[a], #8]\n\t"
        "STR	r5, [%[r], #16]\n\t"
        "LSR	r3, r6, #1\n\t"
        "LSL	r6, r6, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r4, r4, r3\n\t"
        "LDR	r5, [%[a], #4]\n\t"
        "STR	r4, [%[r], #12]\n\t"
        "LSR	r3, r5, #1\n\t"
        "LSL	r5, r5, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r6, r6, r3\n\t"
        "LDR	r4, [%[a]]\n\t"
        "STR	r6, [%[r], #8]\n\t"
        "LSR	r3, r4, #1\n\t"
        "LSL	r4, r4, %[n]\n\t"
        "LSR	r3, r3, r7\n\t"
        "ORR	r5, r5, r3\n\t"
        /* Store the two lowest result digits still held in registers. */
        "STR	r4, [%[r]]\n\t"
        "STR	r5, [%[r], #4]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [n] "+r" (n)
        :
        : "memory", "r4", "r5", "r6", "r3", "r7", "cc"
    );
}

/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m)
 *
 * The exponent is consumed from its most significant bit down in windows of
 * 5 bits. The first window holds bits % 5 bits so that the remaining bits
 * divide evenly. For each following window the result is Montgomery squared
 * 5 times and then multiplied by 2^y by shifting it left y bits; the digit
 * shifted out (r[128]) is folded back in by adding norm times that digit.
 *
 * r     A single precision number that is the result of the operation.
 *       Also used as working space: digits r[0] to r[255] are written.
 * e     A single precision number that is the exponent.
 * bits  The number of bits in the exponent. Must be greater than zero.
 * m     A single precision number that is the modulus.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is not greater than zero.
 */
static int sp_4096_mod_exp_2_128(sp_digit* r, const sp_digit* e, int bits,
        const sp_digit* m)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[385];
#endif
    sp_digit* norm = NULL;
    sp_digit* tmp = NULL;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit o;
    sp_digit mask;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    /* A non-positive bit count would index before the start of e. */
    if (bits <= 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 385, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* td is split into norm (256 digits) and tmp (129 digits). */
        norm = td;
        tmp = td + 256;

        sp_4096_mont_setup(m, &mp);
        sp_4096_mont_norm_128(norm, m);

        /* n holds the exponent digit being consumed, kept left-aligned;
         * c is the count of unused exponent bits remaining in n.
         */
        i = (bits - 1) / 32;
        n = e[i--];
        c = bits & 31;
        if (c == 0) {
            c = 32;
        }
        c -= bits % 5;
        if (c == 32) {
            c = 27;
        }
        if (c < 0) {
            /* Number of bits in top word is less than number needed. */
            c = -c;
            y = (byte)(n << c);
            n = e[i--];
            /* Digits are 32 bits wide: take the top c bits of the next digit
             * and leave 32 - c bits unused in it.
             */
            y |= (byte)(n >> (32 - c));
            n <<= c;
            c = 32 - c;
        }
        else if (c == 0) {
            /* All bits in top word used. */
            y = (byte)n;
        }
        else {
            y = (byte)(n >> c);
            n <<= 32 - c;
        }
        /* Start with norm * 2^y. */
        sp_4096_lshift_128(r, norm, y);
        for (; i>=0 || c>=5; ) {
            if (c == 0) {
                /* Current digit is used up: window is top of next digit. */
                n = e[i--];
                y = (byte)(n >> 27);
                n <<= 5;
                c = 27;
            }
            else if (c < 5) {
                /* Window straddles the current digit and the next one. */
                y = (byte)(n >> 27);
                n = e[i--];
                c = 5 - c;
                y |= (byte)(n >> (32 - c));
                n <<= c;
                c = 32 - c;
            }
            else {
                /* Window is entirely within the current digit. */
                y = (byte)((n >> 27) & 0x1f);
                n <<= 5;
                c -= 5;
            }

            sp_4096_mont_sqr_128(r, r, m, mp);
            sp_4096_mont_sqr_128(r, r, m, mp);
            sp_4096_mont_sqr_128(r, r, m, mp);
            sp_4096_mont_sqr_128(r, r, m, mp);
            sp_4096_mont_sqr_128(r, r, m, mp);

            /* Multiply by 2^y, then fold the overflow digit r[128] back into
             * the low 128 digits using norm.
             */
            sp_4096_lshift_128(r, r, y);
            sp_4096_mul_d_128(tmp, norm, r[128]);
            r[128] = 0;
            o = sp_4096_add_128(r, r, tmp);
            /* Subtract the modulus only when the addition carried out. */
            sp_4096_cond_sub_128(r, r, m, (sp_digit)0 - o);
        }

        XMEMSET(&r[128], 0, sizeof(sp_digit) * 128U);
        sp_4096_mont_reduce_128(r, m, mp);

        /* Final conditional subtraction when r >= m. */
        mask = (sp_digit)0 - (sp_4096_cmp_128(r, m) >= 0);
        sp_4096_cond_sub_128(r, r, m, mask);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#endif /* HAVE_FFDHE_4096 */

/* Modular exponentiation for Diffie-Hellman with the exponent given as a
 * big-endian byte array. The result is written without leading zero bytes.
 *
 * base     Base. MP integer of at most 4096 bits.
 * exp      Array of bytes that is the exponent.
 * expLen   Length of data, in bytes, in exponent. At most 512.
 * mod      Modulus. MP integer of exactly 4096 bits that is odd.
 * out      Buffer to hold big-endian bytes of exponentiation result.
 *          Must be at least 512 bytes long.
 * outLen   Length, in bytes, of exponentiation result after leading zero
 *          bytes have been removed.
 * returns 0 on success, MP_READ_E when the base or exponent is too large or
 * the modulus is not 4096 bits, MP_VAL when the modulus is even, otherwise
 * the error returned by the exponentiation.
 */
int sp_DhExp_4096(const mp_int* base, const byte* exp, word32 expLen,
    const mp_int* mod, byte* out, word32* outLen)
{
    sp_digit b[256];
    sp_digit e[128];
    sp_digit m[128];
    /* The result is written over the base's working buffer. */
    sp_digit* r = b;
    word32 skip = 0;
    int err;

    if ((mp_count_bits(base) > 4096) || (expLen > 512) ||
            (mp_count_bits(mod) != 4096)) {
        /* An operand does not fit the fixed 4096-bit implementation. */
        err = MP_READ_E;
    }
    else if (mp_iseven(mod)) {
        err = MP_VAL;
    }
    else {
        sp_4096_from_mp(b, 128, base);
        sp_4096_from_bin(e, 128, exp, expLen);
        sp_4096_from_mp(m, 128, mod);

    #ifdef HAVE_FFDHE_4096
        /* Base of 2 and all ones in the top modulus digit: use the
         * dedicated power-of-two exponentiation.
         */
        if (base->used == 1 && base->dp[0] == 2 && m[127] == (sp_digit)-1) {
            err = sp_4096_mod_exp_2_128(r, e, expLen * 8, m);
        }
        else
    #endif
        {
            err = sp_4096_mod_exp_128(r, b, e, expLen * 8, m, 0);
        }
    }

    if (err == MP_OKAY) {
        sp_4096_to_bin_128(r, out);
        /* Count the leading zero bytes and move the rest to the front. */
        while ((skip < 512) && (out[skip] == 0)) {
            skip++;
        }
        *outLen = 512 - skip;
        XMEMMOVE(out, out + skip, *outLen);
    }

    /* Clear the copy of the exponent on every path. */
    XMEMSET(e, 0, sizeof(e));

    return err;
}
#endif /* WOLFSSL_HAVE_SP_DH */

#endif /* WOLFSSL_HAVE_SP_DH | (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) */

#endif /* WOLFSSL_SP_4096 */

#endif /* WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH */
#ifdef WOLFSSL_HAVE_SP_ECC
#ifndef WOLFSSL_SP_NO_256

/* Point structure to use for the P256 curve.
 *
 * Each ordinate array has room for 2 * 8 digits. The constant base point
 * defined in this file keeps an ordinate's value in the first 8 digits and
 * zeros in the remaining 8.
 */
typedef struct sp_point_256 {
    /* X ordinate of point. */
    sp_digit x[2 * 8];
    /* Y ordinate of point. */
    sp_digit y[2 * 8];
    /* Z ordinate of point. */
    sp_digit z[2 * 8];
    /* Indicates point is at infinity. The constant base point uses 0. */
    int infinity;
} sp_point_256;

/* The modulus (prime) of the curve P256. */
static const sp_digit p256_mod[8] = {
    0xffffffff,0xffffffff,0xffffffff,0x00000000,0x00000000,0x00000000,
    0x00000001,0xffffffff
};
/* The Montgomery normalizer for modulus of the curve P256. */
static const sp_digit p256_norm_mod[8] = {
    0x00000001,0x00000000,0x00000000,0xffffffff,0xffffffff,0xffffffff,
    0xfffffffe,0x00000000
};
/* The Montgomery multiplier for modulus of the curve P256. */
static const sp_digit p256_mp_mod = 0x00000001;
#if defined(WOLFSSL_VALIDATE_ECC_KEYGEN) || defined(HAVE_ECC_SIGN) || \
                                            defined(HAVE_ECC_VERIFY)
/* The order of the curve P256. */
static const sp_digit p256_order[8] = {
    0xfc632551,0xf3b9cac2,0xa7179e84,0xbce6faad,0xffffffff,0xffffffff,
    0x00000000,0xffffffff
};
#endif
/* The order of the curve P256 minus 2. */
static const sp_digit p256_order2[8] = {
    0xfc63254f,0xf3b9cac2,0xa7179e84,0xbce6faad,0xffffffff,0xffffffff,
    0x00000000,0xffffffff
};
#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY)
/* The Montgomery normalizer for order of the curve P256. */
static const sp_digit p256_norm_order[8] = {
    0x039cdaaf,0x0c46353d,0x58e8617b,0x43190552,0x00000000,0x00000000,
    0xffffffff,0x00000000
};
#endif
#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY)
/* The Montgomery multiplier for order of the curve P256. */
static const sp_digit p256_mp_order = 0xee00bc4f;
#endif
/* The base point of curve P256. */
static const sp_point_256 p256_base = {
    /* X ordinate */
    {
        0xd898c296,0xf4a13945,0x2deb33a0,0x77037d81,0x63a440f2,0xf8bce6e5,
        0xe12c4247,0x6b17d1f2,
        (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0,
        (sp_digit)0, (sp_digit)0, (sp_digit)0
    },
    /* Y ordinate */
    {
        0x37bf51f5,0xcbb64068,0x6b315ece,0x2bce3357,0x7c0f9e16,0x8ee7eb4a,
        0xfe1a7f9b,0x4fe342e2,
        (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0,
        (sp_digit)0, (sp_digit)0, (sp_digit)0
    },
    /* Z ordinate */
    {
        0x00000001,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,
        0x00000000,0x00000000,
        (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0,
        (sp_digit)0, (sp_digit)0, (sp_digit)0
    },
    /* infinity */
    0
};
#if defined(HAVE_ECC_CHECK_KEY) || \
    defined(HAVE_COMP_KEY)
/* P256 constant named b - presumably the b coefficient of the curve equation
 * (TODO confirm against the users of this table).
 * Eight words, least significant word at index 0.
 */
static const sp_digit p256_b[8] = {
    0x27d2604b, 0x3bce3c3e, 0xcc53b0f6, 0x651d06b0,     /* words 0..3 */
    0x769886bc, 0xb3ebbd55, 0xaa3a93e7, 0x5ac635d8      /* words 4..7 */
};
#endif

#ifdef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * Looped (small code size) implementation that builds the product one
 * result word (column) at a time.
 * The 16 result words are accumulated in a 64-byte scratch area on the stack
 * and are copied to r only after all words of a and b have been read.
 *
 * Register use inside the assembly:
 *   r5        byte offset of the result word being computed
 *   r3, r4    byte offsets into a/b; r3 + r4 == r5
 *   r6:r7:r8  three word accumulator (low:middle:high)
 *   lr, r11   operand words, r9:r10 their 64-bit product
 *
 * r  A single precision integer. 16 words are written.
 * a  A single precision integer. 8 words are read.
 * b  A single precision integer. 8 words are read.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
static void sp_256_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0-r2 for use as assembly operands. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Reserve 64 bytes of stack for the 16 word result. */
        "SUB	sp, sp, #0x40\n\t"
        /* Result word 0: low word of a[0] * b[0]. High word seeds r6. */
        "LDR	lr, [%[a]]\n\t"
        "LDR	r11, [%[b]]\n\t"
        "UMULL	r8, r6, lr, r11\n\t"
        "STR	r8, [sp]\n\t"
        "MOV	r7, #0x0\n\t"
        "MOV	r8, #0x0\n\t"
        /* Start at result word 1 (byte offset 4). */
        "MOV	r5, #0x4\n\t"
        "\n"
        /* Labels: plain names for IAR before version 9, otherwise "%=" is
         * appended to the label name. */
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_256_mul_8_outer:\n\t"
#else
    "L_sp_256_mul_8_outer_%=:\n\t"
#endif
        /* r3 = (r5 < 28) ? 0 : r5 - 28; r4 = r5 - r3 */
        "SUBS	r3, r5, #0x1c\n\t"
        "IT	cc\n\t"
        "MOVCC	r3, #0x0\n\t"
        "SUB	r4, r5, r3\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_256_mul_8_inner:\n\t"
#else
    "L_sp_256_mul_8_inner_%=:\n\t"
#endif
        /* Accumulate a[r3] * b[r4] */
        "LDR	lr, [%[a], r3]\n\t"
        "LDR	r11, [%[b], r4]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        /* Accumulate the mirrored term a[r4] * b[r3] */
        "LDR	lr, [%[a], r4]\n\t"
        "LDR	r11, [%[b], r3]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        /* Step the two offsets towards each other. */
        "ADD	r3, r3, #0x4\n\t"
        "SUB	r4, r4, #0x4\n\t"
        "CMP	r3, r4\n\t"
        /* Offsets crossed: column is complete. */
#if defined(__GNUC__)
        "BGT	L_sp_256_mul_8_inner_done_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_256_mul_8_inner_done\n\t"
#else
        "BGT.N	L_sp_256_mul_8_inner_done_%=\n\t"
#endif
        /* Offsets not yet met: more pairs to add. */
#if defined(__GNUC__)
        "BLT	L_sp_256_mul_8_inner_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_256_mul_8_inner\n\t"
#else
        "BLT.N	L_sp_256_mul_8_inner_%=\n\t"
#endif
        /* Offsets equal: add the single middle term a[r3] * b[r3]. */
        "LDR	lr, [%[a], r3]\n\t"
        "LDR	r11, [%[b], r3]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_256_mul_8_inner_done:\n\t"
#else
    "L_sp_256_mul_8_inner_done_%=:\n\t"
#endif
        /* Store the finished result word and shift the accumulator down. */
        "STR	r6, [sp, r5]\n\t"
        "MOV	r6, r7\n\t"
        "MOV	r7, r8\n\t"
        "MOV	r8, #0x0\n\t"
        "ADD	r5, r5, #0x4\n\t"
        /* Loop while the byte offset is at most 0x34 (result word 13). */
        "CMP	r5, #0x34\n\t"
#if defined(__GNUC__)
        "BLE	L_sp_256_mul_8_outer_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLE.N	L_sp_256_mul_8_outer\n\t"
#else
        "BLE.N	L_sp_256_mul_8_outer_%=\n\t"
#endif
        /* Result words 14 and 15: accumulator plus a[7] * b[7]. */
        "LDR	lr, [%[a], #28]\n\t"
        "LDR	r11, [%[b], #28]\n\t"
        "UMLAL	r6, r7, lr, r11\n\t"
        "STR	r6, [sp, r5]\n\t"
        "ADD	r5, r5, #0x4\n\t"
        "STR	r7, [sp, r5]\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_256_mul_8_store:\n\t"
#else
    "L_sp_256_mul_8_store_%=:\n\t"
#endif
        /* Copy 8 words per pass from the stack to r. r5 is 0x3c on entry so
         * two passes are made; the write-back on sp releases the 64 bytes
         * reserved at the top. */
        "LDM	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "STM	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "SUBS	r5, r5, #0x20\n\t"
#if defined(__GNUC__)
        "BGT	L_sp_256_mul_8_store_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_256_mul_8_store\n\t"
#else
        "BGT.N	L_sp_256_mul_8_store_%=\n\t"
#endif
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "cc"
    );
}

#else
#ifdef WOLFSSL_ARM_ARCH_7M
/* Multiply a and b into r. (r = a * b)
 *
 * Fully unrolled row-by-row multiply using UMULL/UMLAL.
 * For each word A[i] (held in r12) the eight products A[i] * B[j] are added
 * into a rotating window of eight running result words kept in r3-r10.
 * The lowest word of each row is final and is stored to the stack at
 * [sp + 4 * i].
 *
 * Stack use (0x24 bytes):
 *   [sp + 0 .. sp + 28]  low eight words of the result
 *   [sp + 32]            saved r pointer
 * After r is saved, the register that held it is set to zero and used as
 * the zero operand of the ADC instructions that capture carries.
 *
 * r is written only at the end, after all reads of a and b.
 *
 * r  A single precision integer. 16 words are written.
 * a  A single precision integer. 8 words are read.
 * b  A single precision integer. 8 words are read.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
SP_NOINLINE static void sp_256_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0-r2 for use as assembly operands. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Reserve scratch space, save r and turn its register into zero. */
        "SUB	sp, sp, #0x24\n\t"
        "STR	%[r], [sp, #32]\n\t"
        "MOV	%[r], #0x0\n\t"
        /* Row 0: even B words are multiplied first into independent register
         * pairs, then the odd B words are accumulated across the pairs. */
        "LDR	r12, [%[a]]\n\t"
        /* A[0] * B[0] */
        "LDR	lr, [%[b]]\n\t"
        "UMULL	r3, r4, r12, lr\n\t"
        /* A[0] * B[2] */
        "LDR	lr, [%[b], #8]\n\t"
        "UMULL	r5, r6, r12, lr\n\t"
        /* A[0] * B[4] */
        "LDR	lr, [%[b], #16]\n\t"
        "UMULL	r7, r8, r12, lr\n\t"
        /* A[0] * B[6] */
        "LDR	lr, [%[b], #24]\n\t"
        "UMULL	r9, r10, r12, lr\n\t"
        /* Result word 0 is complete. */
        "STR	r3, [sp]\n\t"
        /* A[0] * B[1] */
        "LDR	lr, [%[b], #4]\n\t"
        "MOV	r11, %[r]\n\t"
        "UMLAL	r4, r11, r12, lr\n\t"
        "ADDS	r5, r5, r11\n\t"
        /* A[0] * B[3] */
        "LDR	lr, [%[b], #12]\n\t"
        "ADCS	r6, r6, #0x0\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r6, r11, r12, lr\n\t"
        "ADDS	r7, r7, r11\n\t"
        /* A[0] * B[5] */
        "LDR	lr, [%[b], #20]\n\t"
        "ADCS	r8, r8, #0x0\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r8, r11, r12, lr\n\t"
        "ADDS	r9, r9, r11\n\t"
        /* A[0] * B[7] */
        "LDR	lr, [%[b], #28]\n\t"
        "ADCS	r10, r10, #0x0\n\t"
        "ADC	r3, %[r], #0x0\n\t"
        "UMLAL	r10, r3, r12, lr\n\t"
        /* Rows 1-7: r11 carries the high word of each product into the next
         * running word; the last product of a row opens a new top word. */
        /* A[1] * B[0] */
        "LDR	r12, [%[a], #4]\n\t"
        "LDR	lr, [%[b]]\n\t"
        "MOV	r11, #0x0\n\t"
        "UMLAL	r4, r11, r12, lr\n\t"
        "STR	r4, [sp, #4]\n\t"
        "ADDS	r5, r5, r11\n\t"
        /* A[1] * B[1] */
        "LDR	lr, [%[b], #4]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r5, r11, r12, lr\n\t"
        "ADDS	r6, r6, r11\n\t"
        /* A[1] * B[2] */
        "LDR	lr, [%[b], #8]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r6, r11, r12, lr\n\t"
        "ADDS	r7, r7, r11\n\t"
        /* A[1] * B[3] */
        "LDR	lr, [%[b], #12]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r7, r11, r12, lr\n\t"
        "ADDS	r8, r8, r11\n\t"
        /* A[1] * B[4] */
        "LDR	lr, [%[b], #16]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r8, r11, r12, lr\n\t"
        "ADDS	r9, r9, r11\n\t"
        /* A[1] * B[5] */
        "LDR	lr, [%[b], #20]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r9, r11, r12, lr\n\t"
        "ADDS	r10, r10, r11\n\t"
        /* A[1] * B[6] */
        "LDR	lr, [%[b], #24]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r10, r11, r12, lr\n\t"
        "ADDS	r3, r3, r11\n\t"
        /* A[1] * B[7] */
        "LDR	lr, [%[b], #28]\n\t"
        "ADC	r4, %[r], #0x0\n\t"
        "UMLAL	r3, r4, r12, lr\n\t"
        /* A[2] * B[0] */
        "LDR	r12, [%[a], #8]\n\t"
        "LDR	lr, [%[b]]\n\t"
        "MOV	r11, #0x0\n\t"
        "UMLAL	r5, r11, r12, lr\n\t"
        "STR	r5, [sp, #8]\n\t"
        "ADDS	r6, r6, r11\n\t"
        /* A[2] * B[1] */
        "LDR	lr, [%[b], #4]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r6, r11, r12, lr\n\t"
        "ADDS	r7, r7, r11\n\t"
        /* A[2] * B[2] */
        "LDR	lr, [%[b], #8]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r7, r11, r12, lr\n\t"
        "ADDS	r8, r8, r11\n\t"
        /* A[2] * B[3] */
        "LDR	lr, [%[b], #12]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r8, r11, r12, lr\n\t"
        "ADDS	r9, r9, r11\n\t"
        /* A[2] * B[4] */
        "LDR	lr, [%[b], #16]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r9, r11, r12, lr\n\t"
        "ADDS	r10, r10, r11\n\t"
        /* A[2] * B[5] */
        "LDR	lr, [%[b], #20]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r10, r11, r12, lr\n\t"
        "ADDS	r3, r3, r11\n\t"
        /* A[2] * B[6] */
        "LDR	lr, [%[b], #24]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r3, r11, r12, lr\n\t"
        "ADDS	r4, r4, r11\n\t"
        /* A[2] * B[7] */
        "LDR	lr, [%[b], #28]\n\t"
        "ADC	r5, %[r], #0x0\n\t"
        "UMLAL	r4, r5, r12, lr\n\t"
        /* A[3] * B[0] */
        "LDR	r12, [%[a], #12]\n\t"
        "LDR	lr, [%[b]]\n\t"
        "MOV	r11, #0x0\n\t"
        "UMLAL	r6, r11, r12, lr\n\t"
        "STR	r6, [sp, #12]\n\t"
        "ADDS	r7, r7, r11\n\t"
        /* A[3] * B[1] */
        "LDR	lr, [%[b], #4]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r7, r11, r12, lr\n\t"
        "ADDS	r8, r8, r11\n\t"
        /* A[3] * B[2] */
        "LDR	lr, [%[b], #8]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r8, r11, r12, lr\n\t"
        "ADDS	r9, r9, r11\n\t"
        /* A[3] * B[3] */
        "LDR	lr, [%[b], #12]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r9, r11, r12, lr\n\t"
        "ADDS	r10, r10, r11\n\t"
        /* A[3] * B[4] */
        "LDR	lr, [%[b], #16]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r10, r11, r12, lr\n\t"
        "ADDS	r3, r3, r11\n\t"
        /* A[3] * B[5] */
        "LDR	lr, [%[b], #20]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r3, r11, r12, lr\n\t"
        "ADDS	r4, r4, r11\n\t"
        /* A[3] * B[6] */
        "LDR	lr, [%[b], #24]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r4, r11, r12, lr\n\t"
        "ADDS	r5, r5, r11\n\t"
        /* A[3] * B[7] */
        "LDR	lr, [%[b], #28]\n\t"
        "ADC	r6, %[r], #0x0\n\t"
        "UMLAL	r5, r6, r12, lr\n\t"
        /* A[4] * B[0] */
        "LDR	r12, [%[a], #16]\n\t"
        "LDR	lr, [%[b]]\n\t"
        "MOV	r11, #0x0\n\t"
        "UMLAL	r7, r11, r12, lr\n\t"
        "STR	r7, [sp, #16]\n\t"
        "ADDS	r8, r8, r11\n\t"
        /* A[4] * B[1] */
        "LDR	lr, [%[b], #4]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r8, r11, r12, lr\n\t"
        "ADDS	r9, r9, r11\n\t"
        /* A[4] * B[2] */
        "LDR	lr, [%[b], #8]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r9, r11, r12, lr\n\t"
        "ADDS	r10, r10, r11\n\t"
        /* A[4] * B[3] */
        "LDR	lr, [%[b], #12]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r10, r11, r12, lr\n\t"
        "ADDS	r3, r3, r11\n\t"
        /* A[4] * B[4] */
        "LDR	lr, [%[b], #16]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r3, r11, r12, lr\n\t"
        "ADDS	r4, r4, r11\n\t"
        /* A[4] * B[5] */
        "LDR	lr, [%[b], #20]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r4, r11, r12, lr\n\t"
        "ADDS	r5, r5, r11\n\t"
        /* A[4] * B[6] */
        "LDR	lr, [%[b], #24]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r5, r11, r12, lr\n\t"
        "ADDS	r6, r6, r11\n\t"
        /* A[4] * B[7] */
        "LDR	lr, [%[b], #28]\n\t"
        "ADC	r7, %[r], #0x0\n\t"
        "UMLAL	r6, r7, r12, lr\n\t"
        /* A[5] * B[0] */
        "LDR	r12, [%[a], #20]\n\t"
        "LDR	lr, [%[b]]\n\t"
        "MOV	r11, #0x0\n\t"
        "UMLAL	r8, r11, r12, lr\n\t"
        "STR	r8, [sp, #20]\n\t"
        "ADDS	r9, r9, r11\n\t"
        /* A[5] * B[1] */
        "LDR	lr, [%[b], #4]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r9, r11, r12, lr\n\t"
        "ADDS	r10, r10, r11\n\t"
        /* A[5] * B[2] */
        "LDR	lr, [%[b], #8]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r10, r11, r12, lr\n\t"
        "ADDS	r3, r3, r11\n\t"
        /* A[5] * B[3] */
        "LDR	lr, [%[b], #12]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r3, r11, r12, lr\n\t"
        "ADDS	r4, r4, r11\n\t"
        /* A[5] * B[4] */
        "LDR	lr, [%[b], #16]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r4, r11, r12, lr\n\t"
        "ADDS	r5, r5, r11\n\t"
        /* A[5] * B[5] */
        "LDR	lr, [%[b], #20]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r5, r11, r12, lr\n\t"
        "ADDS	r6, r6, r11\n\t"
        /* A[5] * B[6] */
        "LDR	lr, [%[b], #24]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r6, r11, r12, lr\n\t"
        "ADDS	r7, r7, r11\n\t"
        /* A[5] * B[7] */
        "LDR	lr, [%[b], #28]\n\t"
        "ADC	r8, %[r], #0x0\n\t"
        "UMLAL	r7, r8, r12, lr\n\t"
        /* A[6] * B[0] */
        "LDR	r12, [%[a], #24]\n\t"
        "LDR	lr, [%[b]]\n\t"
        "MOV	r11, #0x0\n\t"
        "UMLAL	r9, r11, r12, lr\n\t"
        "STR	r9, [sp, #24]\n\t"
        "ADDS	r10, r10, r11\n\t"
        /* A[6] * B[1] */
        "LDR	lr, [%[b], #4]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r10, r11, r12, lr\n\t"
        "ADDS	r3, r3, r11\n\t"
        /* A[6] * B[2] */
        "LDR	lr, [%[b], #8]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r3, r11, r12, lr\n\t"
        "ADDS	r4, r4, r11\n\t"
        /* A[6] * B[3] */
        "LDR	lr, [%[b], #12]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r4, r11, r12, lr\n\t"
        "ADDS	r5, r5, r11\n\t"
        /* A[6] * B[4] */
        "LDR	lr, [%[b], #16]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r5, r11, r12, lr\n\t"
        "ADDS	r6, r6, r11\n\t"
        /* A[6] * B[5] */
        "LDR	lr, [%[b], #20]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r6, r11, r12, lr\n\t"
        "ADDS	r7, r7, r11\n\t"
        /* A[6] * B[6] */
        "LDR	lr, [%[b], #24]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r7, r11, r12, lr\n\t"
        "ADDS	r8, r8, r11\n\t"
        /* A[6] * B[7] */
        "LDR	lr, [%[b], #28]\n\t"
        "ADC	r9, %[r], #0x0\n\t"
        "UMLAL	r8, r9, r12, lr\n\t"
        /* A[7] * B[0] */
        "LDR	r12, [%[a], #28]\n\t"
        "LDR	lr, [%[b]]\n\t"
        "MOV	r11, #0x0\n\t"
        "UMLAL	r10, r11, r12, lr\n\t"
        "STR	r10, [sp, #28]\n\t"
        "ADDS	r3, r3, r11\n\t"
        /* A[7] * B[1] */
        "LDR	lr, [%[b], #4]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r3, r11, r12, lr\n\t"
        "ADDS	r4, r4, r11\n\t"
        /* A[7] * B[2] */
        "LDR	lr, [%[b], #8]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r4, r11, r12, lr\n\t"
        "ADDS	r5, r5, r11\n\t"
        /* A[7] * B[3] */
        "LDR	lr, [%[b], #12]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r5, r11, r12, lr\n\t"
        "ADDS	r6, r6, r11\n\t"
        /* A[7] * B[4] */
        "LDR	lr, [%[b], #16]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r6, r11, r12, lr\n\t"
        "ADDS	r7, r7, r11\n\t"
        /* A[7] * B[5] */
        "LDR	lr, [%[b], #20]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r7, r11, r12, lr\n\t"
        "ADDS	r8, r8, r11\n\t"
        /* A[7] * B[6] */
        "LDR	lr, [%[b], #24]\n\t"
        "ADC	r11, %[r], #0x0\n\t"
        "UMLAL	r8, r11, r12, lr\n\t"
        "ADDS	r9, r9, r11\n\t"
        /* A[7] * B[7] */
        "LDR	lr, [%[b], #28]\n\t"
        "ADC	r10, %[r], #0x0\n\t"
        "UMLAL	r9, r10, r12, lr\n\t"
        /* Restore r; write result words 8-15 from r3-r10 to r + 32. */
        "LDR	%[r], [sp, #32]\n\t"
        "ADD	%[r], %[r], #0x20\n\t"
        "STM	%[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
        /* Fetch result words 0-7 from the stack and write them to r. */
        "LDM	sp, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
        "SUB	%[r], %[r], #0x20\n\t"
        "STM	%[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
        /* Release the scratch space. */
        "ADD	sp, sp, #0x24\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc"
    );
}

#else
/* Multiply a and b into r. (r = a * b)
 *
 * Fully unrolled multiply built on the UMAAL instruction.
 * The words of a are processed in two halves: A[0..3] first, then A[4..7],
 * each held in r0-r3 and multiplied by all eight words of b.
 *
 * Stack use (0x2c bytes):
 *   [sp + 0 .. sp + 28]  low eight words of the result
 *   [sp + 32]            spill slot for one intermediate word
 *   [sp + 36]            saved r pointer
 *   [sp + 40]            saved a pointer
 * The b pointer is copied to lr before r0-r3 are overwritten.
 *
 * r is written only at the end, after all reads of a and b.
 *
 * NOTE(review): the assembly writes r0-r3 by name, and r0-r2 are not in the
 * clobber list. This appears to rely on r, a and b being the operands bound
 * to r0-r2; no such binding is visible in this function when
 * WOLFSSL_NO_VAR_ASSIGN_REG is defined - confirm for that configuration.
 *
 * r  A single precision integer. 16 words are written.
 * a  A single precision integer. 8 words are read.
 * b  A single precision integer. 8 words are read.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
SP_NOINLINE static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p)
#else
SP_NOINLINE static void sp_256_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    /* Bind the arguments to r0-r2 for use as assembly operands. */
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
    register const sp_digit* b __asm__ ("r2") = (const sp_digit*)b_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        /* Reserve scratch space and save the r and a pointers. */
        "SUB	sp, sp, #0x2c\n\t"
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
        "STRD	%[r], %[a], [sp, #36]\n\t"
#else
        "STR	%[r], [sp, #36]\n\t"
        "STR	%[a], [sp, #40]\n\t"
#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
        /* lr = b; r0-r3 = A[0..3]; r4-r6 = B[0..2] */
        "MOV	lr, %[b]\n\t"
        "LDM	%[a], {r0, r1, r2, r3}\n\t"
        "LDM	lr!, {r4, r5, r6}\n\t"
        "UMULL	r10, r11, r0, r4\n\t"
        "UMULL	r12, r7, r1, r4\n\t"
        "UMAAL	r11, r12, r0, r5\n\t"
        "UMULL	r8, r9, r2, r4\n\t"
        "UMAAL	r12, r8, r1, r5\n\t"
        "UMAAL	r12, r7, r0, r6\n\t"
        "UMAAL	r8, r9, r3, r4\n\t"
        /* Result words 0-2 are complete. */
        "STM	sp, {r10, r11, r12}\n\t"
        "UMAAL	r7, r8, r2, r5\n\t"
        /* r4 = B[3] */
        "LDM	lr!, {r4}\n\t"
        "UMULL	r10, r11, r1, r6\n\t"
        "UMAAL	r8, r9, r2, r6\n\t"
        "UMAAL	r7, r10, r0, r4\n\t"
        "UMAAL	r8, r11, r3, r5\n\t"
        /* Result word 3 is complete. */
        "STR	r7, [sp, #12]\n\t"
        "UMAAL	r8, r10, r1, r4\n\t"
        "UMAAL	r9, r11, r3, r6\n\t"
        "UMAAL	r9, r10, r2, r4\n\t"
        "UMAAL	r10, r11, r3, r4\n\t"
        /* r4-r7 = B[4..7]; lr is not advanced by this load. */
        "LDM	lr, {r4, r5, r6, r7}\n\t"
        /* A[0..3] * B[4] */
        "MOV	r12, #0x0\n\t"
        "UMLAL	r8, r12, r0, r4\n\t"
        "UMAAL	r9, r12, r1, r4\n\t"
        "UMAAL	r10, r12, r2, r4\n\t"
        "UMAAL	r11, r12, r3, r4\n\t"
        /* A[0..3] * B[5] */
        "MOV	r4, #0x0\n\t"
        "UMLAL	r9, r4, r0, r5\n\t"
        "UMAAL	r10, r4, r1, r5\n\t"
        "UMAAL	r11, r4, r2, r5\n\t"
        "UMAAL	r12, r4, r3, r5\n\t"
        /* A[0..3] * B[6] */
        "MOV	r5, #0x0\n\t"
        "UMLAL	r10, r5, r0, r6\n\t"
        "UMAAL	r11, r5, r1, r6\n\t"
        "UMAAL	r12, r5, r2, r6\n\t"
        "UMAAL	r4, r5, r3, r6\n\t"
        /* A[0..3] * B[7], interleaved with reloading the a pointer,
         * stepping it to A[4] and rewinding lr to B[0]. */
        "MOV	r6, #0x0\n\t"
        "UMLAL	r11, r6, r0, r7\n\t"
        "LDR	r0, [sp, #40]\n\t"
        "UMAAL	r12, r6, r1, r7\n\t"
        "ADD	r0, r0, #0x10\n\t"
        "UMAAL	r4, r6, r2, r7\n\t"
        "SUB	lr, lr, #0x10\n\t"
        "UMAAL	r5, r6, r3, r7\n\t"
        /* r0-r3 = A[4..7]; spill r6 to free it for words of b. */
        "LDM	r0, {r0, r1, r2, r3}\n\t"
        "STR	r6, [sp, #32]\n\t"
        /* A[4..7] * B[0]; result word 4 is stored. */
        "LDM	lr!, {r6}\n\t"
        "MOV	r7, #0x0\n\t"
        "UMLAL	r8, r7, r0, r6\n\t"
        "UMAAL	r9, r7, r1, r6\n\t"
        "STR	r8, [sp, #16]\n\t"
        "UMAAL	r10, r7, r2, r6\n\t"
        "UMAAL	r11, r7, r3, r6\n\t"
        /* A[4..7] * B[1]; result word 5 is stored. */
        "LDM	lr!, {r6}\n\t"
        "MOV	r8, #0x0\n\t"
        "UMLAL	r9, r8, r0, r6\n\t"
        "UMAAL	r10, r8, r1, r6\n\t"
        "STR	r9, [sp, #20]\n\t"
        "UMAAL	r11, r8, r2, r6\n\t"
        "UMAAL	r12, r8, r3, r6\n\t"
        /* A[4..7] * B[2]; result word 6 is stored. */
        "LDM	lr!, {r6}\n\t"
        "MOV	r9, #0x0\n\t"
        "UMLAL	r10, r9, r0, r6\n\t"
        "UMAAL	r11, r9, r1, r6\n\t"
        "STR	r10, [sp, #24]\n\t"
        "UMAAL	r12, r9, r2, r6\n\t"
        "UMAAL	r4, r9, r3, r6\n\t"
        /* A[4..7] * B[3]; result word 7 is stored. */
        "LDM	lr!, {r6}\n\t"
        "MOV	r10, #0x0\n\t"
        "UMLAL	r11, r10, r0, r6\n\t"
        "UMAAL	r12, r10, r1, r6\n\t"
        "STR	r11, [sp, #28]\n\t"
        "UMAAL	r4, r10, r2, r6\n\t"
        "UMAAL	r5, r10, r3, r6\n\t"
        /* A[4..7] * B[4]; the spilled word is reloaded into r6. */
        "LDM	lr!, {r11}\n\t"
        "UMAAL	r12, r7, r0, r11\n\t"
        "UMAAL	r4, r7, r1, r11\n\t"
        "LDR	r6, [sp, #32]\n\t"
        "UMAAL	r5, r7, r2, r11\n\t"
        "UMAAL	r6, r7, r3, r11\n\t"
        /* A[4..7] * B[5] */
        "LDM	lr!, {r11}\n\t"
        "UMAAL	r4, r8, r0, r11\n\t"
        "UMAAL	r5, r8, r1, r11\n\t"
        "UMAAL	r6, r8, r2, r11\n\t"
        "UMAAL	r7, r8, r3, r11\n\t"
        /* r11 = B[6], lr = B[7]; A[4..7] * B[6] and A[4..7] * B[7]
         * interleaved. */
        "LDM	lr, {r11, lr}\n\t"
        "UMAAL	r5, r9, r0, r11\n\t"
        "UMAAL	r6, r10, r0, lr\n\t"
        "UMAAL	r6, r9, r1, r11\n\t"
        "UMAAL	r7, r10, r1, lr\n\t"
        "UMAAL	r7, r9, r2, r11\n\t"
        "UMAAL	r8, r10, r2, lr\n\t"
        "UMAAL	r8, r9, r3, r11\n\t"
        "UMAAL	r9, r10, r3, lr\n\t"
        /* Result words 8-15 are in r12, r4-r10: move r12 to r3 so they can
         * be stored with one STM to r + 32. */
        "MOV	r3, r12\n\t"
        "LDR	lr, [sp, #36]\n\t"
        "ADD	lr, lr, #0x20\n\t"
        "STM	lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
        /* Fetch result words 0-7 from the stack and write them to r. */
        "SUB	lr, lr, #0x20\n\t"
        "LDM	sp, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
        "STM	lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t"
        /* Release the scratch space. */
        "ADD	sp, sp, #0x2c\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r10", "r11", "r12", "r7", "r8", "r9", "lr", "cc"
    );
}

#endif /* WOLFSSL_ARM_ARCH_7M */
#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Square a and put result in r. (r = a * a)
 *
 * r  A single precision integer.
 * a  A single precision integer.
 */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p)
#else
static void sp_256_sqr_8(sp_digit* r, const sp_digit* a)
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
{
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
    register sp_digit* r __asm__ ("r0") = (sp_digit*)r_p;
    register const sp_digit* a __asm__ ("r1") = (const sp_digit*)a_p;
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */

    __asm__ __volatile__ (
        "SUB	sp, sp, #0x40\n\t"
        "LDR	lr, [%[a]]\n\t"
        "UMULL	r8, r6, lr, lr\n\t"
        "STR	r8, [sp]\n\t"
        "MOV	r7, #0x0\n\t"
        "MOV	r8, #0x0\n\t"
        "MOV	r5, #0x4\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_256_sqr_8_outer:\n\t"
#else
    "L_sp_256_sqr_8_outer_%=:\n\t"
#endif
        "SUBS	r3, r5, #0x1c\n\t"
        "IT	cc\n\t"
        "MOVCC	r3, #0x0\n\t"
        "SUB	r4, r5, r3\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_256_sqr_8_inner:\n\t"
#else
    "L_sp_256_sqr_8_inner_%=:\n\t"
#endif
        "LDR	lr, [%[a], r3]\n\t"
        "LDR	r11, [%[a], r4]\n\t"
        "UMULL	r9, r10, lr, r11\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        "ADD	r3, r3, #0x4\n\t"
        "SUB	r4, r4, #0x4\n\t"
        "CMP	r3, r4\n\t"
#if defined(__GNUC__)
        "BGT	L_sp_256_sqr_8_inner_done_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_256_sqr_8_inner_done\n\t"
#else
        "BGT.N	L_sp_256_sqr_8_inner_done_%=\n\t"
#endif
#if defined(__GNUC__)
        "BLT	L_sp_256_sqr_8_inner_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLT.N	L_sp_256_sqr_8_inner\n\t"
#else
        "BLT.N	L_sp_256_sqr_8_inner_%=\n\t"
#endif
        "LDR	lr, [%[a], r3]\n\t"
        "UMULL	r9, r10, lr, lr\n\t"
        "ADDS	r6, r6, r9\n\t"
        "ADCS	r7, r7, r10\n\t"
        "ADC	r8, r8, #0x0\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_256_sqr_8_inner_done:\n\t"
#else
    "L_sp_256_sqr_8_inner_done_%=:\n\t"
#endif
        "STR	r6, [sp, r5]\n\t"
        "MOV	r6, r7\n\t"
        "MOV	r7, r8\n\t"
        "MOV	r8, #0x0\n\t"
        "ADD	r5, r5, #0x4\n\t"
        "CMP	r5, #0x34\n\t"
#if defined(__GNUC__)
        "BLE	L_sp_256_sqr_8_outer_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BLE.N	L_sp_256_sqr_8_outer\n\t"
#else
        "BLE.N	L_sp_256_sqr_8_outer_%=\n\t"
#endif
        "LDR	lr, [%[a], #28]\n\t"
        "UMLAL	r6, r7, lr, lr\n\t"
        "STR	r6, [sp, r5]\n\t"
        "ADD	r5, r5, #0x4\n\t"
        "STR	r7, [sp, r5]\n\t"
        "\n"
#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
    "L_sp_256_sqr_8_store:\n\t"
#else
    "L_sp_256_sqr_8_store_%=:\n\t"
#endif
        "LDM	sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "STM	%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t"
        "SUBS	r5, r5, #0x20\n\t"
#if defined(__GNUC__)
        "BGT	L_sp_256_sqr_8_store_%=\n\t"
#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000)
        "BGT.N	L_sp_256_sqr_8_store\n\t"
#else
        "BGT.