/* sp.c
 *
 * Copyright (C) 2006-2017 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */

/* Implementation by Sean Parkinson. */

#ifdef HAVE_CONFIG_H
    #include <config.h>
#endif

#include <wolfssl/wolfcrypt/settings.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#include <wolfssl/wolfcrypt/cpuid.h>
#ifdef NO_INLINE
    #include <wolfssl/wolfcrypt/misc.h>
#else
    #define WOLFSSL_MISC_INCLUDED
    #include <wolfcrypt/src/misc.c>
#endif

#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) || \
                                    defined(WOLFSSL_HAVE_SP_ECC)

#ifdef RSA_LOW_MEM
#define SP_RSA_PRIVATE_EXP_D

#ifndef WOLFSSL_SP_SMALL
#define WOLFSSL_SP_SMALL
#endif
#endif

#include <wolfssl/wolfcrypt/sp.h>

#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
    #define USE_INTEL_SP_SPEEDUP
#endif

#ifdef USE_INTEL_SP_SPEEDUP
    #define HAVE_INTEL_AVX1
    #define HAVE_INTEL_AVX2
#endif

#if !defined(WOLFSSL_X86_64_BUILD) || !defined(USE_INTEL_SPEEDUP)
#if defined(WOLFSSL_SP_CACHE_RESISTANT) || defined(WOLFSSL_SP_SMALL)
/* Masks used to obfuscate which of two addresses will be used.
 * Entry 0 clears every bit of an address and entry 1 keeps every bit, so
 * masking two candidate addresses with complementary entries and adding the
 * results yields exactly one of them without branching on the selector. */
static const size_t addr_mask[2] = { 0, (size_t)-1 };
#endif
#endif

#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH)
#if !defined(WOLFSSL_X86_64_BUILD) || !defined(USE_INTEL_SPEEDUP)
#if SP_WORD_SIZE == 32
#ifndef WOLFSSL_SP_NO_2048
/* Read a big endian unsigned byte array into r as 21-bit digits.
 *
 * r    A single precision integer (output, least significant digit first).
 * max  Number of digits available in r; digits not filled are set to zero.
 * a    Byte array, most significant byte first.
 * n    Number of bytes in array to read.
 */
static void sp_2048_from_bin(sp_digit* r, int max, const byte* a, int n)
{
    int idx;
    int word = 0;   /* digit of r currently being filled */
    int shift = 0;  /* bits already occupied in r[word] */

    r[0] = 0;
    for (idx = n - 1; idx >= 0; idx--) {
        r[word] |= ((sp_digit)a[idx]) << shift;
        if (shift < 13) {
            /* The whole byte fitted into the current digit. */
            shift += 8;
            continue;
        }

        /* The byte straddles (or exactly completes) the 21-bit digit. */
        r[word] &= 0x1fffff;
        shift = 21 - shift;
        if (word + 1 >= max)
            break;
        word++;
        /* Left-over high bits of the byte start the next digit. */
        r[word] = a[idx] >> shift;
        shift = 8 - shift;
    }

    /* Zero every digit above the last one written. */
    word++;
    while (word < max) {
        r[word] = 0;
        word++;
    }
}

/* Convert an mp_int to an array of 21-bit sp_digit values.
 *
 * At most max digits are written to r; digits not filled from a are zeroed.
 *
 * r    A single precision integer (output).
 * max  Number of digits available in r.
 * a    A multi-precision integer.
 */
static void sp_2048_from_mp(sp_digit* r, int max, mp_int* a)
{
#if DIGIT_BIT == 21
    int j;
    int used = a->used;

    /* Same digit size: straight copy, but never more than r can hold. */
    if (used > max)
        used = max;
    XMEMCPY(r, a->dp, sizeof(sp_digit) * used);

    for (j = used; j < max; j++)
        r[j] = 0;
#elif DIGIT_BIT > 21
    int i, j = 0, s = 0;

    /* Each mp digit is wider than 21 bits and is split over several digits
     * of r. s tracks the bit offset within the current digits. */
    r[0] = 0;
    for (i = 0; i < a->used && j < max; i++) {
        r[j] |= a->dp[i] << s;
        r[j] &= 0x1fffff;
        s = 21 - s;
        if (j + 1 >= max)
            break;
        r[++j] = a->dp[i] >> s;
        while (s + 21 <= DIGIT_BIT) {
            s += 21;
            r[j] &= 0x1fffff;
            if (j + 1 >= max)
                break;
            if (s < DIGIT_BIT)
                r[++j] = a->dp[i] >> s;
            else
                r[++j] = 0;
        }
        s = DIGIT_BIT - s;
    }

    for (j++; j < max; j++)
        r[j] = 0;
#else
    int i, j = 0, s = 0;

    /* Each mp digit is narrower than 21 bits: pack several into one digit
     * of r, spilling the remainder into the next digit when it fills up. */
    r[0] = 0;
    for (i = 0; i < a->used && j < max; i++) {
        r[j] |= ((sp_digit)a->dp[i]) << s;
        if (s + DIGIT_BIT >= 21) {
            r[j] &= 0x1fffff;
            if (j + 1 >= max)
                break;
            s = 21 - s;
            r[++j] = a->dp[i] >> s;
            s = DIGIT_BIT - s;
        }
        else
            s += DIGIT_BIT;
    }

    for (j++; j < max; j++)
        r[j] = 0;
#endif
}

/* Write r as big endian to byte array.
 * Fixed length number of bytes written: 256
 *
 * The digits of r are normalized in place first (carries are propagated so
 * that each of the low 97 digits holds 21 bits), so r is modified.
 *
 * r  A single precision integer; 98 digits are normalized and read.
 * a  Byte array that receives the big endian encoding.
 */
static void sp_2048_to_bin(sp_digit* r, byte* a)
{
    int i, j, s = 0, b;

    /* Propagate carries upwards so each low digit fits in 21 bits. */
    for (i=0; i<97; i++) {
        r[i+1] += r[i] >> 21;
        r[i] &= 0x1fffff;
    }
    /* Fill the output starting at its last (least significant) byte. */
    j = 2048 / 8 - 1;
    a[j] = 0;
    for (i=0; i<98 && j>=0; i++) {
        /* s: bits already occupied in a[j]; b: bits of r[i] emitted so far. */
        b = 0;
        a[j--] |= r[i] << s; b += 8 - s;
        if (j < 0)
            break;
        while (b < 21) {
            a[j--] = r[i] >> b; b += 8;
            if (j < 0)
                break;
        }
        if (j < 0)
            break;
        /* Number of bits of this digit placed in the byte written last. */
        s = 8 - (b - 21);
        /* Clear the following byte, then step back onto the byte written
         * last so the next digit is ORed in above the bits already there. */
        a[j] = 0;
        if (s != 0)
            j++;
    }
}

#ifndef WOLFSSL_SP_SMALL
/* Schoolbook multiplication of two 49-digit numbers. (r = a * b)
 *
 * All partial products are summed per column in 64-bit accumulators and the
 * columns are then reduced to 21-bit digits with carry propagation.
 *
 * r  A single precision integer (98 digits written).
 * a  A single precision integer (49 digits).
 * b  A single precision integer (49 digits).
 */
static void sp_2048_mul_49(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    int row, col, k;
    int64_t acc[98];

    XMEMSET(acc, 0, sizeof(acc));

    /* Column sums: acc[row + col] collects a[row] * b[col]. */
    row = 0;
    while (row < 49) {
        const int64_t ai = a[row];

        for (col = 0; col < 49; col++)
            acc[row + col] += ai * b[col];
        row++;
    }

    /* Carry each column into the next and keep the low 21 bits. */
    for (k = 0; k < 97; k++) {
        acc[k + 1] += acc[k] >> 21;
        r[k] = acc[k] & 0x1fffff;
    }
    r[97] = (sp_digit)acc[97];
}

/* Square a 49-digit number. (r = a * a)
 *
 * Off-diagonal products a[i] * a[j] (j < i) are counted twice and diagonal
 * products once; the column sums are then reduced to 21-bit digits.
 *
 * r  A single precision integer (98 digits written).
 * a  A single precision integer (49 digits).
 */
static void sp_2048_sqr_49(sp_digit* r, const sp_digit* a)
{
    int hi, lo, k;
    int64_t acc[98];

    XMEMSET(acc, 0, sizeof(acc));

    hi = 0;
    while (hi < 49) {
        const int64_t ai = a[hi];

        for (lo = 0; lo < hi; lo++)
            acc[hi + lo] += (ai * a[lo]) * 2;
        acc[hi + hi] += ai * a[hi];
        hi++;
    }

    /* Carry each column into the next and keep the low 21 bits. */
    for (k = 0; k < 97; k++) {
        acc[k + 1] += acc[k] >> 21;
        r[k] = acc[k] & 0x1fffff;
    }
    r[97] = (sp_digit)acc[97];
}

/* Digit-wise addition of two 49-digit numbers. (r = a + b)
 *
 * No carries are propagated; each result digit is a[i] + b[i].
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_2048_add_49(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int off = 0;

    /* Digits 0..47, eight per pass. */
    while (off < 48) {
        sp_digit* out = r + off;
        const sp_digit* x = a + off;
        const sp_digit* y = b + off;

        out[0] = x[0] + y[0];
        out[1] = x[1] + y[1];
        out[2] = x[2] + y[2];
        out[3] = x[3] + y[3];
        out[4] = x[4] + y[4];
        out[5] = x[5] + y[5];
        out[6] = x[6] + y[6];
        out[7] = x[7] + y[7];
        off += 8;
    }
    /* Remaining top digit. */
    r[48] = a[48] + b[48];

    return 0;
}

/* Digit-wise addition of two 98-digit numbers. (r = a + b)
 *
 * No carries are propagated; each result digit is a[i] + b[i].
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_2048_add_98(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int off = 0;

    /* Digits 0..95, eight per pass. */
    while (off < 96) {
        sp_digit* out = r + off;
        const sp_digit* x = a + off;
        const sp_digit* y = b + off;

        out[0] = x[0] + y[0];
        out[1] = x[1] + y[1];
        out[2] = x[2] + y[2];
        out[3] = x[3] + y[3];
        out[4] = x[4] + y[4];
        out[5] = x[5] + y[5];
        out[6] = x[6] + y[6];
        out[7] = x[7] + y[7];
        off += 8;
    }
    /* Remaining two top digits. */
    r[96] = a[96] + b[96];
    r[97] = a[97] + b[97];

    return 0;
}

/* Digit-wise subtraction of two 98-digit numbers. (r = a - b)
 *
 * No borrows are propagated; each result digit is a[i] - b[i].
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_2048_sub_98(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int off = 0;

    /* Digits 0..95, eight per pass. */
    while (off < 96) {
        sp_digit* out = r + off;
        const sp_digit* x = a + off;
        const sp_digit* y = b + off;

        out[0] = x[0] - y[0];
        out[1] = x[1] - y[1];
        out[2] = x[2] - y[2];
        out[3] = x[3] - y[3];
        out[4] = x[4] - y[4];
        out[5] = x[5] - y[5];
        out[6] = x[6] - y[6];
        out[7] = x[7] - y[7];
        off += 8;
    }
    /* Remaining two top digits. */
    r[96] = a[96] - b[96];
    r[97] = a[97] - b[97];

    return 0;
}

/* Karatsuba multiplication of two 98-digit numbers. (r = a * b)
 *
 * Each operand is split into a low and a high 49-digit half. Three 49-digit
 * products are computed (high*high, low*low and sum*sum) and combined.
 *
 * r  A single precision integer (196 digits written).
 * a  A single precision integer (98 digits).
 * b  A single precision integer (98 digits).
 */
static void sp_2048_mul_98(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    sp_digit bsum[49];          /* b_low + b_high */
    sp_digit mid[98];           /* a_low + a_high, then the middle term */
    sp_digit* low = r;          /* a_low * b_low lands in r[0..97] */
    sp_digit* high = r + 98;    /* a_high * b_high lands in r[98..195] */

    /* Sums of the halves. */
    sp_2048_add_49(mid, a, a + 49);
    sp_2048_add_49(bsum, b, b + 49);

    /* The three half-size products. */
    sp_2048_mul_49(high, a + 49, b + 49);
    sp_2048_mul_49(low, a, b);
    sp_2048_mul_49(mid, mid, bsum);

    /* Middle term = sum product - high - low, added in 49 digits up. */
    sp_2048_sub_98(mid, mid, high);
    sp_2048_sub_98(mid, mid, low);
    sp_2048_add_98(r + 49, r + 49, mid);
}

/* Karatsuba squaring of a 98-digit number. (r = a * a)
 *
 * The operand is split into a low and a high 49-digit half. Three 49-digit
 * squares are computed (high, low and sum of halves) and combined.
 *
 * r  A single precision integer (196 digits written).
 * a  A single precision integer (98 digits).
 */
static void sp_2048_sqr_98(sp_digit* r, const sp_digit* a)
{
    sp_digit mid[98];           /* a_low + a_high, then the middle term */
    sp_digit* low = r;          /* a_low squared lands in r[0..97] */
    sp_digit* high = r + 98;    /* a_high squared lands in r[98..195] */

    sp_2048_add_49(mid, a, a + 49);

    /* The three half-size squares. */
    sp_2048_sqr_49(high, a + 49);
    sp_2048_sqr_49(low, a);
    sp_2048_sqr_49(mid, mid);

    /* Middle term = sum square - high - low, added in 49 digits up. */
    sp_2048_sub_98(mid, mid, high);
    sp_2048_sub_98(mid, mid, low);
    sp_2048_add_98(r + 49, r + 49, mid);
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Digit-wise addition of two 98-digit numbers. (r = a + b)
 *
 * No carries are propagated; each result digit is a[i] + b[i].
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_2048_add_98(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int idx = 0;

    while (idx < 98) {
        r[idx] = a[idx] + b[idx];
        idx++;
    }

    return 0;
}
#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Digit-wise subtraction of two 98-digit numbers. (r = a - b)
 *
 * No borrows are propagated; each result digit is a[i] - b[i].
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_2048_sub_98(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int idx = 0;

    while (idx < 98) {
        r[idx] = a[idx] - b[idx];
        idx++;
    }

    return 0;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * Small-code variant: product columns are computed one at a time, from the
 * most significant column down, using a single 64-bit accumulator.
 *
 * r  A single precision integer (196 digits written).
 * a  A single precision integer (98 digits).
 * b  A single precision integer (98 digits).
 */
static void sp_2048_mul_98(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    int i, j, k;
    int64_t c;

    /* Top column: high part becomes digit 195; the low 21 bits stay in c,
     * moved up by 21 bits so the next column can be added below them. */
    c = ((int64_t)a[97]) * b[97];
    r[195] = (sp_digit)(c >> 21);
    c = (c & 0x1fffff) << 21;
    for (k = 193; k >= 0; k--) {
        /* Add every product a[i] * b[j] with i + j == k. */
        for (i = 97; i >= 0; i--) {
            j = k - i;
            if (j >= 98)
                break;
            if (j < 0)
                continue;

            c += ((int64_t)a[i]) * b[j];
        }
        /* Bits 42 and up overflow the digit held in bits 21..41 of c and
         * are carried into the digit above it. */
        r[k + 2] += c >> 42;
        r[k + 1] = (c >> 21) & 0x1fffff;
        c = (c & 0x1fffff) << 21;
    }
    r[0] = (sp_digit)(c >> 21);
}

/* Square a and put result in r. (r = a * a)
 *
 * Small-code variant: product columns are computed one at a time, from the
 * most significant column down, using a single 64-bit accumulator. Products
 * of distinct digits are added doubled; the diagonal product is added once.
 *
 * r  A single precision integer (196 digits written).
 * a  A single precision integer (98 digits).
 */
static void sp_2048_sqr_98(sp_digit* r, const sp_digit* a)
{
    int i, j, k;
    int64_t c;

    /* Top column: high part becomes digit 195; the low 21 bits stay in c,
     * moved up by 21 bits so the next column can be added below them. */
    c = ((int64_t)a[97]) * a[97];
    r[195] = (sp_digit)(c >> 21);
    c = (c & 0x1fffff) << 21;
    for (k = 193; k >= 0; k--) {
        /* Add 2 * a[i] * a[j] for every pair with i + j == k and i > j. */
        for (i = 97; i >= 0; i--) {
            j = k - i;
            if (j >= 98 || i <= j)
                break;
            if (j < 0)
                continue;

            c += ((int64_t)a[i]) * a[j] * 2;
        }
        /* The loop stopped on the diagonal: add the square term once. */
        if (i == j)
           c += ((int64_t)a[i]) * a[i];

        /* Bits 42 and up overflow the digit held in bits 21..41 of c and
         * are carried into the digit above it. */
        r[k + 2] += c >> 42;
        r[k + 1] = (c >> 21) & 0x1fffff;
        c = (c & 0x1fffff) << 21;
    }
    r[0] = (sp_digit)(c >> 21);
}

#endif /* WOLFSSL_SP_SMALL */
#if !defined(SP_RSA_PRIVATE_EXP_D) && defined(WOLFSSL_HAVE_SP_RSA)
#ifdef WOLFSSL_SP_SMALL
/* Digit-wise addition of two 49-digit numbers. (r = a + b)
 *
 * No carries are propagated; each result digit is a[i] + b[i].
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_2048_add_49(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int idx = 0;

    while (idx < 49) {
        r[idx] = a[idx] + b[idx];
        idx++;
    }

    return 0;
}
#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Digit-wise subtraction of two 49-digit numbers. (r = a - b)
 *
 * No borrows are propagated; each result digit is a[i] - b[i].
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_2048_sub_49(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int idx = 0;

    while (idx < 49) {
        r[idx] = a[idx] - b[idx];
        idx++;
    }

    return 0;
}

#else
/* Digit-wise subtraction of two 49-digit numbers. (r = a - b)
 *
 * No borrows are propagated; each result digit is a[i] - b[i].
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_2048_sub_49(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int off = 0;

    /* Digits 0..47, eight per pass. */
    while (off < 48) {
        sp_digit* out = r + off;
        const sp_digit* x = a + off;
        const sp_digit* y = b + off;

        out[0] = x[0] - y[0];
        out[1] = x[1] - y[1];
        out[2] = x[2] - y[2];
        out[3] = x[3] - y[3];
        out[4] = x[4] - y[4];
        out[5] = x[5] - y[5];
        out[6] = x[6] - y[6];
        out[7] = x[7] - y[7];
        off += 8;
    }
    /* Remaining top digit. */
    r[48] = a[48] - b[48];

    return 0;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * Small-code variant: product columns are computed one at a time, from the
 * most significant column down, using a single 64-bit accumulator.
 *
 * r  A single precision integer (98 digits written).
 * a  A single precision integer (49 digits).
 * b  A single precision integer (49 digits).
 */
static void sp_2048_mul_49(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    int i, j, k;
    int64_t c;

    /* Top column: high part becomes digit 97; the low 21 bits stay in c,
     * moved up by 21 bits so the next column can be added below them. */
    c = ((int64_t)a[48]) * b[48];
    r[97] = (sp_digit)(c >> 21);
    c = (c & 0x1fffff) << 21;
    for (k = 95; k >= 0; k--) {
        /* Add every product a[i] * b[j] with i + j == k. */
        for (i = 48; i >= 0; i--) {
            j = k - i;
            if (j >= 49)
                break;
            if (j < 0)
                continue;

            c += ((int64_t)a[i]) * b[j];
        }
        /* Bits 42 and up overflow the digit held in bits 21..41 of c and
         * are carried into the digit above it. */
        r[k + 2] += c >> 42;
        r[k + 1] = (c >> 21) & 0x1fffff;
        c = (c & 0x1fffff) << 21;
    }
    r[0] = (sp_digit)(c >> 21);
}

/* Square a and put result in r. (r = a * a)
 *
 * Small-code variant: product columns are computed one at a time, from the
 * most significant column down, using a single 64-bit accumulator. Products
 * of distinct digits are added doubled; the diagonal product is added once.
 *
 * r  A single precision integer (98 digits written).
 * a  A single precision integer (49 digits).
 */
static void sp_2048_sqr_49(sp_digit* r, const sp_digit* a)
{
    int i, j, k;
    int64_t c;

    /* Top column: high part becomes digit 97; the low 21 bits stay in c,
     * moved up by 21 bits so the next column can be added below them. */
    c = ((int64_t)a[48]) * a[48];
    r[97] = (sp_digit)(c >> 21);
    c = (c & 0x1fffff) << 21;
    for (k = 95; k >= 0; k--) {
        /* Add 2 * a[i] * a[j] for every pair with i + j == k and i > j. */
        for (i = 48; i >= 0; i--) {
            j = k - i;
            if (j >= 49 || i <= j)
                break;
            if (j < 0)
                continue;

            c += ((int64_t)a[i]) * a[j] * 2;
        }
        /* The loop stopped on the diagonal: add the square term once. */
        if (i == j)
           c += ((int64_t)a[i]) * a[i];

        /* Bits 42 and up overflow the digit held in bits 21..41 of c and
         * are carried into the digit above it. */
        r[k + 2] += c >> 42;
        r[k + 1] = (c >> 21) & 0x1fffff;
        c = (c & 0x1fffff) << 21;
    }
    r[0] = (sp_digit)(c >> 21);
}

#endif /* WOLFSSL_SP_SMALL */
#endif /* !SP_RSA_PRIVATE_EXP_D && WOLFSSL_HAVE_SP_RSA */

/* Calculate the bottom digit of -1/a mod 2^n.
 *
 * The inverse of the lowest digit is found by repeatedly doubling the number
 * of correct low bits, then it is reduced to 21 bits and negated.
 *
 * a    A single precision number.
 * rho  Bottom word of inverse (output).
 */
static void sp_2048_mont_setup(sp_digit* a, sp_digit* rho)
{
    const sp_digit low = a[0];
    sp_digit inv;
    int step;

    /* Starting value satisfies inv * low == 1 mod 2**4. */
    inv = (((low + 2) & 4) << 1) + low;
    /* Each step doubles the precision: 2**8, 2**16, then 2**32. */
    for (step = 0; step < 3; step++)
        inv *= 2 - low * inv;
    inv &= 0x1fffff;

    /* rho = -1/m mod b */
    *rho = (1L << 21) - inv;
}

#if !defined(SP_RSA_PRIVATE_EXP_D) && defined(WOLFSSL_HAVE_SP_RSA)
/* r = 2^n mod m where n is the number of bits to reduce by.
 *
 * r is first set to 2^1024 - 1 (48 digits of 21 one-bits plus 16 one-bits in
 * the top digit), m is subtracted once and one is added back. A single
 * subtraction is only sufficient when m uses its full bit length; that is
 * assumed here and not checked.
 *
 * r  A single precision number (49 digits written).
 * m  A single precision number (the modulus).
 */
static void sp_2048_mont_norm_49(sp_digit* r, sp_digit* m)
{
    /* Set r = 2^n - 1. */
#ifdef WOLFSSL_SP_SMALL
    int k = 48;

    while (k-- > 0)
        r[k] = 0x1fffff;
#else
    sp_digit* p;

    for (p = r; p != r + 48; p += 8) {
        p[0] = 0x1fffff;
        p[1] = 0x1fffff;
        p[2] = 0x1fffff;
        p[3] = 0x1fffff;
        p[4] = 0x1fffff;
        p[5] = 0x1fffff;
        p[6] = 0x1fffff;
        p[7] = 0x1fffff;
    }
#endif
    /* Top digit only carries the remaining 16 bits. */
    r[48] = 0xffffl;

    /* r = (2^n - 1) mod m */
    sp_2048_sub_49(r, r, m);

    /* Add one so r = 2^n mod m */
    r[0] += 1;
}

/* Compare a with b in constant time.
 *
 * Digits are examined from the most significant down. The first non-zero
 * difference is latched; later differences are masked out without branching.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static sp_digit sp_2048_cmp_49(const sp_digit* a, const sp_digit* b)
{
    sp_digit res = 0;
    int i;

#ifdef WOLFSSL_SP_SMALL
    i = 49;
    while (i-- > 0)
        res |= (a[i] - b[i]) & (0 - !res);
#else
    /* Top digit first, then the remaining 48 digits eight per pass. */
    res |= (a[48] - b[48]) & (0 - !res);
    i = 48;
    do {
        i -= 8;
        res |= (a[i + 7] - b[i + 7]) & (0 - !res);
        res |= (a[i + 6] - b[i + 6]) & (0 - !res);
        res |= (a[i + 5] - b[i + 5]) & (0 - !res);
        res |= (a[i + 4] - b[i + 4]) & (0 - !res);
        res |= (a[i + 3] - b[i + 3]) & (0 - !res);
        res |= (a[i + 2] - b[i + 2]) & (0 - !res);
        res |= (a[i + 1] - b[i + 1]) & (0 - !res);
        res |= (a[i + 0] - b[i + 0]) & (0 - !res);
    } while (i > 0);
#endif /* WOLFSSL_SP_SMALL */

    return res;
}

/* Conditionally subtract b from a using the mask m.
 * m is -1 to subtract and 0 when not.
 *
 * Every digit of b is ANDed with the mask before being subtracted, so the
 * same operations are performed whether or not the subtraction takes effect.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 */
static void sp_2048_cond_sub_49(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
#ifdef WOLFSSL_SP_SMALL
    int idx = 0;

    while (idx < 49) {
        r[idx] = a[idx] - (b[idx] & m);
        idx++;
    }
#else
    int off = 0;

    /* Digits 0..47, eight per pass. */
    while (off < 48) {
        sp_digit* out = r + off;
        const sp_digit* x = a + off;
        const sp_digit* y = b + off;

        out[0] = x[0] - (y[0] & m);
        out[1] = x[1] - (y[1] & m);
        out[2] = x[2] - (y[2] & m);
        out[3] = x[3] - (y[3] & m);
        out[4] = x[4] - (y[4] & m);
        out[5] = x[5] - (y[5] & m);
        out[6] = x[6] - (y[6] & m);
        out[7] = x[7] - (y[7] & m);
        off += 8;
    }
    /* Remaining top digit. */
    r[48] = a[48] - (b[48] & m);
#endif /* WOLFSSL_SP_SMALL */
}

/* Mul a by scalar b and add into r. (r += a * b)
 *
 * r  A single precision integer (digits 0..49 are updated).
 * a  A single precision integer (49 digits).
 * b  A scalar.
 */
static void sp_2048_mul_add_49(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
    const int64_t scalar = b;
    int i;
#ifdef WOLFSSL_SP_SMALL
    int64_t carry = 0;

    /* Running carry: each digit is stored reduced to 21 bits. */
    for (i = 0; i < 49; i++) {
        carry += (scalar * a[i]) + r[i];
        r[i] = carry & 0x1fffff;
        carry >>= 21;
    }
    r[49] += carry;
#else
    int64_t even;   /* product for the even offset within a pass */
    int64_t odd;    /* product for the odd offset within a pass */

    /* Each digit receives the low 21 bits of its own product plus the
     * high part of the product of the digit below. */
    even = scalar * a[0];
    r[0] += even & 0x1fffff;
    for (i = 0; i < 48; i += 8) {
        odd = scalar * a[i+1];
        r[i+1] += (even >> 21) + (odd & 0x1fffff);
        even = scalar * a[i+2];
        r[i+2] += (odd >> 21) + (even & 0x1fffff);
        odd = scalar * a[i+3];
        r[i+3] += (even >> 21) + (odd & 0x1fffff);
        even = scalar * a[i+4];
        r[i+4] += (odd >> 21) + (even & 0x1fffff);
        odd = scalar * a[i+5];
        r[i+5] += (even >> 21) + (odd & 0x1fffff);
        even = scalar * a[i+6];
        r[i+6] += (odd >> 21) + (even & 0x1fffff);
        odd = scalar * a[i+7];
        r[i+7] += (even >> 21) + (odd & 0x1fffff);
        even = scalar * a[i+8];
        r[i+8] += (odd >> 21) + (even & 0x1fffff);
    }
    /* High part of the top product. */
    r[49] += even >> 21;
#endif /* WOLFSSL_SP_SMALL */
}

/* Normalize the values in each word to 21 bits.
 *
 * Carries are propagated from digit 0 up into digit 48. Digits 0..47 are
 * reduced to 21 bits; the top digit (48) receives the final carry and is not
 * masked. Only a[0..48] are accessed in both build variants.
 *
 * a  Array of sp_digit to normalize (49 digits).
 */
static void sp_2048_norm_49(sp_digit* a)
{
    int i;
#ifdef WOLFSSL_SP_SMALL
    for (i = 0; i < 48; i++) {
        a[i+1] += a[i] >> 21;
        a[i] &= 0x1fffff;
    }
#else
    /* Eight digits per pass; the last pass ends by carrying into a[48]. */
    for (i = 0; i < 48; i += 8) {
        a[i+1] += a[i+0] >> 21; a[i+0] &= 0x1fffff;
        a[i+2] += a[i+1] >> 21; a[i+1] &= 0x1fffff;
        a[i+3] += a[i+2] >> 21; a[i+2] &= 0x1fffff;
        a[i+4] += a[i+3] >> 21; a[i+3] &= 0x1fffff;
        a[i+5] += a[i+4] >> 21; a[i+4] &= 0x1fffff;
        a[i+6] += a[i+5] >> 21; a[i+5] &= 0x1fffff;
        a[i+7] += a[i+6] >> 21; a[i+6] &= 0x1fffff;
        a[i+8] += a[i+7] >> 21; a[i+7] &= 0x1fffff;
    }
#endif
}

/* Shift the result in the high 1024 bits down to the bottom.
 *
 * The low 1024 bits end 16 bits into digit 48, so the result starts with the
 * remaining bits of a[48] and every following digit is merged in shifted
 * left by 5 bits. Digits 49..97 of r are cleared afterwards.
 *
 * r  A single precision number (98 digits written).
 * a  A single precision number (98 digits read).
 */
static void sp_2048_mont_shift_49(sp_digit* r, const sp_digit* a)
{
    const sp_digit* src = a + 49;   /* first digit wholly in the high part */
    word32 acc;
    int i;

    acc = a[48] >> 16;
#ifdef WOLFSSL_SP_SMALL
    for (i = 0; i < 48; i++) {
        acc += src[i] << 5;
        r[i] = acc & 0x1fffff;
        acc >>= 21;
    }
#else
    for (i = 0; i < 48; i += 8) {
        acc += src[i+0] << 5;
        r[i+0] = acc & 0x1fffff;
        acc >>= 21;
        acc += src[i+1] << 5;
        r[i+1] = acc & 0x1fffff;
        acc >>= 21;
        acc += src[i+2] << 5;
        r[i+2] = acc & 0x1fffff;
        acc >>= 21;
        acc += src[i+3] << 5;
        r[i+3] = acc & 0x1fffff;
        acc >>= 21;
        acc += src[i+4] << 5;
        r[i+4] = acc & 0x1fffff;
        acc >>= 21;
        acc += src[i+5] << 5;
        r[i+5] = acc & 0x1fffff;
        acc >>= 21;
        acc += src[i+6] << 5;
        r[i+6] = acc & 0x1fffff;
        acc >>= 21;
        acc += src[i+7] << 5;
        r[i+7] = acc & 0x1fffff;
        acc >>= 21;
    }
#endif /* WOLFSSL_SP_SMALL */
    /* Top digit of the input; stored without masking. */
    acc += src[48] << 5;
    r[48] = acc;

    XMEMSET(&r[49], 0, sizeof(*r) * 49);
}

/* Reduce the number using Montgomery reduction.
 *
 * For each of the 49 low digits a multiple of m is added that clears that
 * digit; the high part is then shifted down, m is conditionally subtracted
 * and the digits are normalized.
 *
 * a   A single precision number to reduce in place (98 digits).
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
static void sp_2048_mont_reduce_49(sp_digit* a, sp_digit* m, sp_digit mp)
{
    sp_digit q;
    sp_digit over;
    int k = 0;

    /* Full 21-bit digits 0..47. */
    while (k < 48) {
        q = (a[k] * mp) & 0x1fffff;
        sp_2048_mul_add_49(&a[k], m, q);
        a[k + 1] += a[k] >> 21;
        k++;
    }

    /* Top digit: only 16 bits belong to the part being reduced. */
    q = (a[48] * mp) & 0xffffl;
    sp_2048_mul_add_49(&a[48], m, q);
    a[49] += a[48] >> 21;
    a[48] &= 0x1fffff;

    sp_2048_mont_shift_49(a, a);

    /* Subtract m when the shifted value has bits above the 16th of digit 48. */
    over = 0 - ((a[48] >> 16) > 0);
    sp_2048_cond_sub_49(a, a, m, over);
    sp_2048_norm_49(a);
}

/* Multiply two Montgomery form numbers mod the modulus.
 * (r = a * b mod m)
 *
 * The full product is written to r and then Montgomery reduced in place.
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus.
 * mp  Montgomery multiplier.
 */
static void sp_2048_mont_mul_49(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit* m, sp_digit mp)
{
    sp_2048_mul_49(r, a, b);
    sp_2048_mont_reduce_49(r, m, mp);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * The full square is written to r and then Montgomery reduced in place.
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus.
 * mp  Montgomery multiplier.
 */
static void sp_2048_mont_sqr_49(sp_digit* r, sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    sp_2048_sqr_49(r, a);
    sp_2048_mont_reduce_49(r, m, mp);
}

/* Multiply a by scalar b into r. (r = a * b)
 *
 * r  A single precision integer (50 digits written).
 * a  A single precision integer (49 digits).
 * b  A scalar.
 */
static void sp_2048_mul_d_49(sp_digit* r, const sp_digit* a, const sp_digit b)
{
    const int64_t scalar = b;
    int i;
#ifdef WOLFSSL_SP_SMALL
    int64_t carry = 0;

    /* Running carry: each digit is stored reduced to 21 bits. */
    for (i = 0; i < 49; i++) {
        carry += scalar * a[i];
        r[i] = carry & 0x1fffff;
        carry >>= 21;
    }
    r[49] = (sp_digit)carry;
#else
    int64_t even;   /* product for the even offset within a pass */
    int64_t odd;    /* product for the odd offset within a pass */

    /* Each digit is the low 21 bits of its own product plus the high part
     * of the product of the digit below. */
    even = scalar * a[0];
    r[0] = even & 0x1fffff;
    for (i = 0; i < 48; i += 8) {
        odd = scalar * a[i+1];
        r[i+1] = (sp_digit)(even >> 21) + (odd & 0x1fffff);
        even = scalar * a[i+2];
        r[i+2] = (sp_digit)(odd >> 21) + (even & 0x1fffff);
        odd = scalar * a[i+3];
        r[i+3] = (sp_digit)(even >> 21) + (odd & 0x1fffff);
        even = scalar * a[i+4];
        r[i+4] = (sp_digit)(odd >> 21) + (even & 0x1fffff);
        odd = scalar * a[i+5];
        r[i+5] = (sp_digit)(even >> 21) + (odd & 0x1fffff);
        even = scalar * a[i+6];
        r[i+6] = (sp_digit)(odd >> 21) + (even & 0x1fffff);
        odd = scalar * a[i+7];
        r[i+7] = (sp_digit)(even >> 21) + (odd & 0x1fffff);
        even = scalar * a[i+8];
        r[i+8] = (sp_digit)(odd >> 21) + (even & 0x1fffff);
    }
    /* High part of the top product. */
    r[49] = (sp_digit)(even >> 21);
#endif /* WOLFSSL_SP_SMALL */
}

/* Conditionally add a and b using the mask m.
 * m is -1 to add and 0 when not.
 *
 * Every digit of b is ANDed with the mask before being added, so the same
 * operations are performed whether or not the addition takes effect.
 *
 * r  A single precision number representing conditional add result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply.
 */
static void sp_2048_cond_add_49(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
#ifdef WOLFSSL_SP_SMALL
    int idx = 0;

    while (idx < 49) {
        r[idx] = a[idx] + (b[idx] & m);
        idx++;
    }
#else
    int off = 0;

    /* Digits 0..47, eight per pass. */
    while (off < 48) {
        sp_digit* out = r + off;
        const sp_digit* x = a + off;
        const sp_digit* y = b + off;

        out[0] = x[0] + (y[0] & m);
        out[1] = x[1] + (y[1] & m);
        out[2] = x[2] + (y[2] & m);
        out[3] = x[3] + (y[3] & m);
        out[4] = x[4] + (y[4] & m);
        out[5] = x[5] + (y[5] & m);
        out[6] = x[6] + (y[6] & m);
        out[7] = x[7] + (y[7] & m);
        off += 8;
    }
    /* Remaining top digit. */
    r[48] = a[48] + (b[48] & m);
#endif /* WOLFSSL_SP_SMALL */
}

/* Divide d in a and put remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * Working storage for two 98-digit temporaries is taken from the heap when
 * WOLFSSL_SP_SMALL or WOLFSSL_SMALL_STACK is defined, otherwise from the
 * stack.
 *
 * a  Number to be divided (98 digits are read).
 * d  Number to divide with (49 digits).
 * m  Multiplier result. Unused: the quotient is not stored.
 * r  Remainder from the division (98 digits are written).
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_2048_div_49(sp_digit* a, sp_digit* d, sp_digit* m,
        sp_digit* r)
{
    int i;
    int64_t d1;
    sp_digit div, r1;
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    sp_digit* td;
#else
    sp_digit t1d[98], t2d[98];
#endif
    sp_digit* t1;
    sp_digit* t2;
    int err = MP_OKAY;

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    /* One allocation holding both temporaries, 2 * 49 digits each. */
    td = XMALLOC(sizeof(sp_digit) * 4 * 49, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    if (td != NULL) {
        t1 = td;
        t2 = td + 2 * 49;
    }
    else
        err = MEMORY_E;
#else
    t1 = t1d;
    t2 = t2d;
#endif

    (void)m;

    if (err == MP_OKAY) {
        /* Quotient digits are estimated using only the top digit of d. */
        div = d[48];
        /* t1 is the running remainder, starting as a copy of a. */
        XMEMCPY(t1, a, sizeof(*t1) * 2 * 49);
        for (i=48; i>=0; i--) {
            /* Normalize the top two digits of the current window and
             * divide them by the top digit of d to estimate the multiple. */
            t1[49 + i] += t1[49 + i - 1] >> 21;
            t1[49 + i - 1] &= 0x1fffff;
            d1 = t1[49 + i];
            d1 <<= 21;
            d1 += t1[49 + i - 1];
            r1 = (sp_digit)(d1 / div);

            /* Subtract the estimated multiple of d at digit offset i. */
            sp_2048_mul_d_49(t2, d, r1);
            sp_2048_sub_49(&t1[i], &t1[i], t2);
            t1[49 + i] -= t2[49];
            t1[49 + i] += t1[49 + i - 1] >> 21;
            t1[49 + i - 1] &= 0x1fffff;
            /* Correction step: a second estimate is taken from the negated
             * top digits and that multiple of d (plus one) is added back.
             * NOTE(review): appears to compensate for the first estimate
             * overshooting - confirm against the algorithm's derivation. */
            r1 = (((-t1[49 + i]) << 21) - t1[49 + i - 1]) / div;
            r1++;
            sp_2048_mul_d_49(t2, d, r1);
            sp_2048_add_49(&t1[i], &t1[i], t2);
            t1[49 + i] += t1[49 + i - 1] >> 21;
            t1[49 + i - 1] &= 0x1fffff;
        }
        /* Final step using only the top digit of the low 49 digits. */
        t1[49 - 1] += t1[49 - 2] >> 21;
        t1[49 - 2] &= 0x1fffff;
        d1 = t1[49 - 1];
        r1 = (sp_digit)(d1 / div);

        sp_2048_mul_d_49(t2, d, r1);
        sp_2048_sub_49(t1, t1, t2);
        XMEMCPY(r, t1, sizeof(*r) * 2 * 49);
        /* Propagate carries through the low digits of the remainder. */
        for (i=0; i<47; i++) {
            r[i+1] += r[i] >> 21;
            r[i] &= 0x1fffff;
        }
        /* Add d back once when the top digit of the remainder is negative. */
        sp_2048_cond_add_49(r, r, d, 0 - (r[48] < 0));
    }

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

/* Reduce a modulo m into r. (r = a mod m)
 *
 * Thin wrapper around sp_2048_div_49() that discards the quotient.
 *
 * r  A single precision number that is the reduced result.
 * a  A single precision number that is to be reduced.
 * m  A single precision number that is the modulus to reduce with.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_2048_mod_49(sp_digit* r, sp_digit* a, sp_digit* m)
{
    /* No quotient buffer is supplied; only the remainder is wanted. */
    return sp_2048_div_49(a, m, NULL, r);
}

/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * r        A single precision number that is the result of the operation.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When non-zero, a is first reduced modulo m before use.
 * returns 0 on success and MEMORY_E on dynamic memory allocation failure.
 */
static int sp_2048_mod_exp_49(sp_digit* r, sp_digit* a, sp_digit* e, int bits,
    sp_digit* m, int reduceA)
{
#ifdef WOLFSSL_SP_SMALL
    sp_digit* td;
    sp_digit* t[3];
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

    td = (sp_digit*)XMALLOC(sizeof(*td) * 3 * 49 * 2, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        XMEMSET(td, 0, sizeof(*td) * 3 * 49 * 2);

        norm = t[0] = td;
        t[1] = &td[49 * 2];
        t[2] = &td[2 * 49 * 2];

        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_49(norm, m);

        if (reduceA)
            err = sp_2048_mod_49(t[1], a, m);
        else
            XMEMCPY(t[1], a, sizeof(sp_digit) * 49);
    }
    if (err == MP_OKAY) {
        sp_2048_mul_49(t[1], t[1], norm);
        err = sp_2048_mod_49(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        i = bits / 21;
        c = bits % 21;
        n = e[i--] << (21 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1)
                    break;

                n = e[i--];
                c = 21;
            }

            y = (n >> 20) & 1;
            n <<= 1;

            sp_2048_mont_mul_49(t[y^1], t[0], t[1], m, mp);

            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                    sizeof(*t[2]) * 49 * 2);
            sp_2048_mont_sqr_49(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                    sizeof(*t[2]) * 49 * 2);
        }

        sp_2048_mont_reduce_49(t[0], m, mp);
        n = sp_2048_cmp_49(t[0], m);
        sp_2048_cond_sub_49(t[0], t[0], m, (n < 0) - 1);
        XMEMCPY(r, t[0], sizeof(*r) * 49 * 2);

    }

    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);

    return err;
#elif defined(WOLFSSL_SP_CACHE_RESISTANT)
#ifndef WOLFSSL_SMALL_STACK
    sp_digit t[3][98];
#else
    sp_digit* td;
    sp_digit* t[3];
#endif
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    td = (sp_digit*)XMALLOC(sizeof(*td) * 3 * 49 * 2, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        t[0] = td;
        t[1] = &td[49 * 2];
        t[2] = &td[2 * 49 * 2];
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_49(norm, m);

        if (reduceA) {
            err = sp_2048_mod_49(t[1], a, m);
            if (err == MP_OKAY) {
                sp_2048_mul_49(t[1], t[1], norm);
                err = sp_2048_mod_49(t[1], t[1], m);
            }
        }
        else {
            sp_2048_mul_49(t[1], a, norm);
            err = sp_2048_mod_49(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        i = bits / 21;
        c = bits % 21;
        n = e[i--] << (21 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1)
                    break;

                n = e[i--];
                c = 21;
            }

            y = (n >> 20) & 1;
            n <<= 1;

            sp_2048_mont_mul_49(t[y^1], t[0], t[1], m, mp);

            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                 ((size_t)t[1] & addr_mask[y])), sizeof(t[2]));
            sp_2048_mont_sqr_49(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                           ((size_t)t[1] & addr_mask[y])), t[2], sizeof(t[2]));
        }

        sp_2048_mont_reduce_49(t[0], m, mp);
        n = sp_2048_cmp_49(t[0], m);
        sp_2048_cond_sub_49(t[0], t[0], m, (n < 0) - 1);
        XMEMCPY(r, t[0], sizeof(t[0]));
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#else
#ifndef WOLFSSL_SMALL_STACK
    sp_digit t[32][98];
#else
    sp_digit* t[32];
    sp_digit* td;
#endif
    sp_digit* norm;
    sp_digit rt[98];
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 98, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        for (i=0; i<32; i++)
            t[i] = td + i * 98;
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_49(norm, m);

        if (reduceA) {
            err = sp_2048_mod_49(t[1], a, m);
            if (err == MP_OKAY) {
                sp_2048_mul_49(t[1], t[1], norm);
                err = sp_2048_mod_49(t[1], t[1], m);
            }
        }
        else {
            sp_2048_mul_49(t[1], a, norm);
            err = sp_2048_mod_49(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        sp_2048_mont_sqr_49(t[ 2], t[ 1], m, mp);
        sp_2048_mont_mul_49(t[ 3], t[ 2], t[ 1], m, mp);
        sp_2048_mont_sqr_49(t[ 4], t[ 2], m, mp);
        sp_2048_mont_mul_49(t[ 5], t[ 3], t[ 2], m, mp);
        sp_2048_mont_sqr_49(t[ 6], t[ 3], m, mp);
        sp_2048_mont_mul_49(t[ 7], t[ 4], t[ 3], m, mp);
        sp_2048_mont_sqr_49(t[ 8], t[ 4], m, mp);
        sp_2048_mont_mul_49(t[ 9], t[ 5], t[ 4], m, mp);
        sp_2048_mont_sqr_49(t[10], t[ 5], m, mp);
        sp_2048_mont_mul_49(t[11], t[ 6], t[ 5], m, mp);
        sp_2048_mont_sqr_49(t[12], t[ 6], m, mp);
        sp_2048_mont_mul_49(t[13], t[ 7], t[ 6], m, mp);
        sp_2048_mont_sqr_49(t[14], t[ 7], m, mp);
        sp_2048_mont_mul_49(t[15], t[ 8], t[ 7], m, mp);
        sp_2048_mont_sqr_49(t[16], t[ 8], m, mp);
        sp_2048_mont_mul_49(t[17], t[ 9], t[ 8], m, mp);
        sp_2048_mont_sqr_49(t[18], t[ 9], m, mp);
        sp_2048_mont_mul_49(t[19], t[10], t[ 9], m, mp);
        sp_2048_mont_sqr_49(t[20], t[10], m, mp);
        sp_2048_mont_mul_49(t[21], t[11], t[10], m, mp);
        sp_2048_mont_sqr_49(t[22], t[11], m, mp);
        sp_2048_mont_mul_49(t[23], t[12], t[11], m, mp);
        sp_2048_mont_sqr_49(t[24], t[12], m, mp);
        sp_2048_mont_mul_49(t[25], t[13], t[12], m, mp);
        sp_2048_mont_sqr_49(t[26], t[13], m, mp);
        sp_2048_mont_mul_49(t[27], t[14], t[13], m, mp);
        sp_2048_mont_sqr_49(t[28], t[14], m, mp);
        sp_2048_mont_mul_49(t[29], t[15], t[14], m, mp);
        sp_2048_mont_sqr_49(t[30], t[15], m, mp);
        sp_2048_mont_mul_49(t[31], t[16], t[15], m, mp);

        bits = ((bits + 4) / 5) * 5;
        i = ((bits + 20) / 21) - 1;
        c = bits % 21;
        if (c == 0)
            c = 21;
        if (i < 49)
            n = e[i--] << (32 - c);
        else {
            n = 0;
            i--;
        }
        if (c < 5) {
            n |= e[i--] << (11 - c);
            c += 21;
        }
        y = n >> 27;
        n <<= 5;
        c -= 5;
        XMEMCPY(rt, t[y], sizeof(rt));
        for (; i>=0 || c>=5; ) {
            if (c < 5) {
                n |= e[i--] << (11 - c);
                c += 21;
            }
            y = (n >> 27) & 0x1f;
            n <<= 5;
            c -= 5;

            sp_2048_mont_sqr_49(rt, rt, m, mp);
            sp_2048_mont_sqr_49(rt, rt, m, mp);
            sp_2048_mont_sqr_49(rt, rt, m, mp);
            sp_2048_mont_sqr_49(rt, rt, m, mp);
            sp_2048_mont_sqr_49(rt, rt, m, mp);

            sp_2048_mont_mul_49(rt, rt, t[y], m, mp);
        }

        sp_2048_mont_reduce_49(rt, m, mp);
        n = sp_2048_cmp_49(rt, m);
        sp_2048_cond_sub_49(rt, rt, m, (n < 0) - 1);
        XMEMCPY(r, rt, sizeof(rt));
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#endif
}

#endif /* !SP_RSA_PRIVATE_EXP_D && WOLFSSL_HAVE_SP_RSA */

/* Calculate the Montgomery normalizer: r = 2^2048 mod m.
 * The modulus must be a full 2048 bits long, so subtracting m once from
 * 2^2048 - 1 and then adding one back is sufficient.
 *
 * r  A single precision number that receives the normalizer.
 * m  A single precision number that is the modulus.
 */
static void sp_2048_mont_norm_98(sp_digit* r, sp_digit* m)
{
    int k;

    /* Load r with 2^2048 - 1: 97 digits of 21 set bits, 11 bits on top. */
#ifdef WOLFSSL_SP_SMALL
    for (k = 0; k != 97; ++k)
        r[k] = 0x1fffff;
#else
    for (k = 0; k != 96; k += 8) {
        sp_digit* blk = r + k;

        blk[0] = 0x1fffff;
        blk[1] = 0x1fffff;
        blk[2] = 0x1fffff;
        blk[3] = 0x1fffff;
        blk[4] = 0x1fffff;
        blk[5] = 0x1fffff;
        blk[6] = 0x1fffff;
        blk[7] = 0x1fffff;
    }
    r[96] = 0x1fffff;
#endif
    r[97] = 0x7ff;

    /* r = (2^2048 - 1) - m */
    sp_2048_sub_98(r, r, m);

    /* One more gives r = 2^2048 - m. */
    r[0]++;
}

/* Compare two numbers, walking from the most significant digit down.
 * Written without data-dependent branches: once a non-zero digit difference
 * has been recorded, the differences of all lower digits are masked away.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static sp_digit sp_2048_cmp_98(const sp_digit* a, const sp_digit* b)
{
    sp_digit diff = 0;
    int k;

#ifdef WOLFSSL_SP_SMALL
    for (k = 98; k-- > 0; )
        diff |= (a[k] - b[k]) & (0 - !diff);
#else
    /* Two odd digits on top, then twelve groups of eight. */
    diff |= (a[97] - b[97]) & (0 - !diff);
    diff |= (a[96] - b[96]) & (0 - !diff);
    for (k = 95; k > 0; k -= 8) {
        diff |= (a[k    ] - b[k    ]) & (0 - !diff);
        diff |= (a[k - 1] - b[k - 1]) & (0 - !diff);
        diff |= (a[k - 2] - b[k - 2]) & (0 - !diff);
        diff |= (a[k - 3] - b[k - 3]) & (0 - !diff);
        diff |= (a[k - 4] - b[k - 4]) & (0 - !diff);
        diff |= (a[k - 5] - b[k - 5]) & (0 - !diff);
        diff |= (a[k - 6] - b[k - 6]) & (0 - !diff);
        diff |= (a[k - 7] - b[k - 7]) & (0 - !diff);
    }
#endif /* WOLFSSL_SP_SMALL */

    return diff;
}

/* Subtract b from a digit by digit when the mask is set.
 * Each digit of b is ANDed with m first, so m = -1 subtracts b and m = 0
 * copies a unchanged. No borrow is propagated between digits.
 *
 * r  A single precision number that receives the result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply to every digit of b.
 */
static void sp_2048_cond_sub_98(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int k;

#ifdef WOLFSSL_SP_SMALL
    k = 0;
    while (k < 98) {
        r[k] = a[k] - (b[k] & m);
        k++;
    }
#else
    for (k = 0; k < 96; k += 8) {
        sp_digit*       rk = r + k;
        const sp_digit* ak = a + k;
        const sp_digit* bk = b + k;

        rk[0] = ak[0] - (bk[0] & m);
        rk[1] = ak[1] - (bk[1] & m);
        rk[2] = ak[2] - (bk[2] & m);
        rk[3] = ak[3] - (bk[3] & m);
        rk[4] = ak[4] - (bk[4] & m);
        rk[5] = ak[5] - (bk[5] & m);
        rk[6] = ak[6] - (bk[6] & m);
        rk[7] = ak[7] - (bk[7] & m);
    }
    /* Remaining two digits that do not fill a group of eight. */
    r[96] = a[96] - (b[96] & m);
    r[97] = a[97] - (b[97] & m);
#endif /* WOLFSSL_SP_SMALL */
}

/* Multiply a by the scalar b and accumulate into r. (r += a * b)
 * 98 digits of a are read; r[0..98] are updated.
 *
 * r  A single precision integer that is added to.
 * a  A single precision integer.
 * b  A scalar.
 */
static void sp_2048_mul_add_98(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
#ifdef WOLFSSL_SP_SMALL
    int64_t scalar = b;
    int64_t acc = 0;
    int k = 0;

    /* Running carry: each digit of r is left holding 21 bits. */
    while (k < 98) {
        acc += (scalar * a[k]) + r[k];
        r[k] = acc & 0x1fffff;
        acc >>= 21;
        k++;
    }
    r[98] += acc;
#else
    int64_t scalar = b;
    int64_t x;
    int64_t y;
    int k;

    /* x and y alternate as "previous product" and "current product": the
     * high part of the previous and the low 21 bits of the current product
     * are added to each digit of r. */
    x = scalar * a[0];
    r[0] += x & 0x1fffff;
    for (k = 0; k < 96; k += 8) {
        y = scalar * a[k+1];
        r[k+1] += (x >> 21) + (y & 0x1fffff);
        x = scalar * a[k+2];
        r[k+2] += (y >> 21) + (x & 0x1fffff);
        y = scalar * a[k+3];
        r[k+3] += (x >> 21) + (y & 0x1fffff);
        x = scalar * a[k+4];
        r[k+4] += (y >> 21) + (x & 0x1fffff);
        y = scalar * a[k+5];
        r[k+5] += (x >> 21) + (y & 0x1fffff);
        x = scalar * a[k+6];
        r[k+6] += (y >> 21) + (x & 0x1fffff);
        y = scalar * a[k+7];
        r[k+7] += (x >> 21) + (y & 0x1fffff);
        x = scalar * a[k+8];
        r[k+8] += (y >> 21) + (x & 0x1fffff);
    }
    y = scalar * a[97];
    r[97] += (x >> 21) + (y & 0x1fffff);
    r[98] += y >> 21;
#endif /* WOLFSSL_SP_SMALL */
}

/* Bring every digit except the top one back to 21 bits.
 * The part of each digit above bit 21 is moved into the next digit up;
 * the top digit (index 97) only receives a carry and is not masked.
 *
 * a  Array of sp_digit to normalize.
 */
static void sp_2048_norm_98(sp_digit* a)
{
    int k;

#ifdef WOLFSSL_SP_SMALL
    for (k = 1; k < 98; k++) {
        a[k] += a[k-1] >> 21;
        a[k-1] &= 0x1fffff;
    }
#else
    for (k = 0; k < 96; k += 8) {
        sp_digit* d = a + k;

        d[1] += d[0] >> 21; d[0] &= 0x1fffff;
        d[2] += d[1] >> 21; d[1] &= 0x1fffff;
        d[3] += d[2] >> 21; d[2] &= 0x1fffff;
        d[4] += d[3] >> 21; d[3] &= 0x1fffff;
        d[5] += d[4] >> 21; d[4] &= 0x1fffff;
        d[6] += d[5] >> 21; d[5] &= 0x1fffff;
        d[7] += d[6] >> 21; d[6] &= 0x1fffff;
        d[8] += d[7] >> 21; d[7] &= 0x1fffff;
        /* This step overlaps with the first step of the next pass. */
        d[9] += d[8] >> 21; d[8] &= 0x1fffff;
    }
    a[97] += a[96] >> 21;
    a[96] &= 0x1fffff;
#endif
}

/* Move the bits of a above bit 2048 down into r[0..97] and clear r[98..195].
 * Bit 2048 is bit 11 of digit 97, so the stream is rebuilt from what is left
 * of a[97] after shifting out 11 bits, followed by a[98] to a[195] each
 * entering 10 bits higher.
 *
 * r  A single precision number that receives the shifted value.
 * a  A single precision number of 196 digits.
 */
static void sp_2048_mont_shift_98(sp_digit* r, const sp_digit* a)
{
    int k;
    int64_t acc;

    acc = a[97] >> 11;
    acc += ((int64_t)a[98]) << 10;

#ifdef WOLFSSL_SP_SMALL
    for (k = 0; k < 97; k++) {
        r[k] = acc & 0x1fffff;
        acc >>= 21;
        acc += ((int64_t)a[k + 99]) << 10;
    }
#else
    for (k = 0; k < 96; k += 8) {
        sp_digit*       out = r + k;
        const sp_digit* in  = a + k + 99;

        out[0] = acc & 0x1fffff;
        acc >>= 21; acc += ((int64_t)in[0]) << 10;
        out[1] = acc & 0x1fffff;
        acc >>= 21; acc += ((int64_t)in[1]) << 10;
        out[2] = acc & 0x1fffff;
        acc >>= 21; acc += ((int64_t)in[2]) << 10;
        out[3] = acc & 0x1fffff;
        acc >>= 21; acc += ((int64_t)in[3]) << 10;
        out[4] = acc & 0x1fffff;
        acc >>= 21; acc += ((int64_t)in[4]) << 10;
        out[5] = acc & 0x1fffff;
        acc >>= 21; acc += ((int64_t)in[5]) << 10;
        out[6] = acc & 0x1fffff;
        acc >>= 21; acc += ((int64_t)in[6]) << 10;
        out[7] = acc & 0x1fffff;
        acc >>= 21; acc += ((int64_t)in[7]) << 10;
    }
    r[96] = acc & 0x1fffff;
    acc >>= 21;
    acc += ((int64_t)a[195]) << 10;
#endif /* WOLFSSL_SP_SMALL */
    /* Whatever is left becomes the top digit, unmasked. */
    r[97] = (sp_digit)acc;

    XMEMSET(r + 98, 0, sizeof(*r) * 98);
}

/* Reduce the number back to 2048 bits using Montgomery reduction.
 *
 * For each of the 98 digit positions a factor mu is derived from the current
 * low digit and mp, mu * m is added in at that position and the carry out of
 * the digit is moved up. The value is then shifted down by 2048 bits, m is
 * subtracted when the result is wider than 2048 bits, and the digits are
 * normalized.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
static void sp_2048_mont_reduce_98(sp_digit* a, sp_digit* m, sp_digit mp)
{
    int i;
    sp_digit mu;

    if (mp != 1) {
        for (i=0; i<97; i++) {
            /* Only the low 21 bits of the product are needed. Multiply as
             * unsigned so a product wider than sp_digit wraps instead of
             * being a signed overflow. */
            mu = (sp_digit)(((unsigned long)a[i] * (unsigned long)mp) &
                                                                  0x1fffff);
            sp_2048_mul_add_98(a+i, m, mu);
            a[i+1] += a[i] >> 21;
        }
        /* Top digit of the modulus holds 11 bits only. */
        mu = (sp_digit)(((unsigned long)a[i] * (unsigned long)mp) & 0x7ff);
        sp_2048_mul_add_98(a+i, m, mu);
        a[i+1] += a[i] >> 21;
        a[i] &= 0x1fffff;
    }
    else {
        /* mp == 1: the factor is the low digit itself, no multiply needed. */
        for (i=0; i<97; i++) {
            mu = a[i] & 0x1fffff;
            sp_2048_mul_add_98(a+i, m, mu);
            a[i+1] += a[i] >> 21;
        }
        mu = a[i] & 0x7ffl;
        sp_2048_mul_add_98(a+i, m, mu);
        a[i+1] += a[i] >> 21;
        a[i] &= 0x1fffff;
    }

    sp_2048_mont_shift_98(a, a);
    /* Subtract m once if anything is left above bit 2048. */
    sp_2048_cond_sub_98(a, a, m, 0 - ((a[97] >> 11) > 0));
    sp_2048_norm_98(a);
}

/* Multiply two Montgomery form numbers mod the modulus.
 * (r = a * b mod m)
 *
 * r   Result of the multiplication.
 * a   First number to multiply, in Montgomery form.
 * b   Second number to multiply, in Montgomery form.
 * m   Modulus.
 * mp  Montgomery multiplier.
 */
static void sp_2048_mont_mul_98(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit* m, sp_digit mp)
{
    /* Plain product first ... */
    sp_2048_mul_98(r, a, b);
    /* ... then Montgomery reduce it in place. */
    sp_2048_mont_reduce_98(r, m, mp);
}

/* Square a Montgomery form number mod the modulus. (r = a * a mod m)
 *
 * r   Result of the squaring.
 * a   Number to square, in Montgomery form.
 * m   Modulus.
 * mp  Montgomery multiplier.
 */
static void sp_2048_mont_sqr_98(sp_digit* r, sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    /* Plain square first ... */
    sp_2048_sqr_98(r, a);
    /* ... then Montgomery reduce it in place. */
    sp_2048_mont_reduce_98(r, m, mp);
}

/* Multiply a by the scalar b and store the product in r. (r = a * b)
 * 98 digits of a are read; r[0..98] are written.
 *
 * r  A single precision integer that receives the product.
 * a  A single precision integer.
 * b  A scalar.
 */
static void sp_2048_mul_d_98(sp_digit* r, const sp_digit* a, const sp_digit b)
{
#ifdef WOLFSSL_SP_SMALL
    int64_t scalar = b;
    int64_t acc = 0;
    int k = 0;

    /* Running carry: each of r[0..97] is left holding 21 bits. */
    while (k < 98) {
        acc += scalar * a[k];
        r[k] = acc & 0x1fffff;
        acc >>= 21;
        k++;
    }
    r[98] = (sp_digit)acc;
#else
    int64_t scalar = b;
    int64_t x;
    int64_t y;
    int k;

    /* x and y alternate as "previous product" and "current product": each
     * digit of r is the high part of the previous product plus the low
     * 21 bits of the current one. */
    x = scalar * a[0];
    r[0] = x & 0x1fffff;
    for (k = 0; k < 96; k += 8) {
        y = scalar * a[k+1];
        r[k+1] = (sp_digit)(x >> 21) + (y & 0x1fffff);
        x = scalar * a[k+2];
        r[k+2] = (sp_digit)(y >> 21) + (x & 0x1fffff);
        y = scalar * a[k+3];
        r[k+3] = (sp_digit)(x >> 21) + (y & 0x1fffff);
        x = scalar * a[k+4];
        r[k+4] = (sp_digit)(y >> 21) + (x & 0x1fffff);
        y = scalar * a[k+5];
        r[k+5] = (sp_digit)(x >> 21) + (y & 0x1fffff);
        x = scalar * a[k+6];
        r[k+6] = (sp_digit)(y >> 21) + (x & 0x1fffff);
        y = scalar * a[k+7];
        r[k+7] = (sp_digit)(x >> 21) + (y & 0x1fffff);
        x = scalar * a[k+8];
        r[k+8] = (sp_digit)(y >> 21) + (x & 0x1fffff);
    }
    y = scalar * a[97];
    r[97] = (sp_digit)(x >> 21) + (y & 0x1fffff);
    r[98] = (sp_digit)(y >> 21);
#endif /* WOLFSSL_SP_SMALL */
}

/* Add b to a digit by digit when the mask is set.
 * Each digit of b is ANDed with m first, so m = -1 adds b and m = 0 copies
 * a unchanged. No carry is propagated between digits.
 *
 * r  A single precision number that receives the result.
 * a  A single precision number to add to.
 * b  A single precision number to add.
 * m  Mask value to apply to every digit of b.
 */
static void sp_2048_cond_add_98(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int k;

#ifdef WOLFSSL_SP_SMALL
    k = 0;
    while (k < 98) {
        r[k] = a[k] + (b[k] & m);
        k++;
    }
#else
    for (k = 0; k < 96; k += 8) {
        sp_digit*       rk = r + k;
        const sp_digit* ak = a + k;
        const sp_digit* bk = b + k;

        rk[0] = ak[0] + (bk[0] & m);
        rk[1] = ak[1] + (bk[1] & m);
        rk[2] = ak[2] + (bk[2] & m);
        rk[3] = ak[3] + (bk[3] & m);
        rk[4] = ak[4] + (bk[4] & m);
        rk[5] = ak[5] + (bk[5] & m);
        rk[6] = ak[6] + (bk[6] & m);
        rk[7] = ak[7] + (bk[7] & m);
    }
    /* Remaining two digits that do not fill a group of eight. */
    r[96] = a[96] + (b[96] & m);
    r[97] = a[97] + (b[97] & m);
#endif /* WOLFSSL_SP_SMALL */
}

/* Divide d in a and put remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * The working copy of a and the scratch product are taken from the heap when
 * WOLFSSL_SP_SMALL or WOLFSSL_SMALL_STACK is defined and from the stack
 * otherwise.
 *
 * a  Number to be divided. 2 * 98 digits are read.
 * d  Number to divide with.
 * m  Multiplier result. Unused; may be NULL.
 * r  Remainder from the division. 2 * 98 digits are written.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_2048_div_98(sp_digit* a, sp_digit* d, sp_digit* m,
        sp_digit* r)
{
    int i;
    int64_t d1;
    sp_digit div, r1;
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    sp_digit* td;
#else
    sp_digit t1d[196], t2d[196];
#endif
    sp_digit* t1;
    sp_digit* t2;
    int err = MP_OKAY;

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 4 * 98, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td != NULL) {
        t1 = td;
        t2 = td + 2 * 98;
    }
    else
        err = MEMORY_E;
#else
    t1 = t1d;
    t2 = t2d;
#endif

    (void)m;

    if (err == MP_OKAY) {
        /* Quotient digits are estimated from the top digit of d only. */
        div = d[97];
        XMEMCPY(t1, a, sizeof(*t1) * 2 * 98);
        for (i=97; i>=0; i--) {
            /* Estimate a quotient digit from the top two digits of the
             * current remainder and subtract that multiple of d. */
            t1[98 + i] += t1[98 + i - 1] >> 21;
            t1[98 + i - 1] &= 0x1fffff;
            d1 = t1[98 + i];
            d1 <<= 21;
            d1 += t1[98 + i - 1];
            r1 = (sp_digit)(d1 / div);

            sp_2048_mul_d_98(t2, d, r1);
            sp_2048_sub_98(&t1[i], &t1[i], t2);
            t1[98 + i] -= t2[98];
            t1[98 + i] += t1[98 + i - 1] >> 21;
            t1[98 + i - 1] &= 0x1fffff;
            /* Correction step: add back a multiple of d derived from the
             * negated top two digits. */
            r1 = (((-t1[98 + i]) << 21) - t1[98 + i - 1]) / div;
            r1++;
            sp_2048_mul_d_98(t2, d, r1);
            sp_2048_add_98(&t1[i], &t1[i], t2);
            t1[98 + i] += t1[98 + i - 1] >> 21;
            t1[98 + i - 1] &= 0x1fffff;
        }
        t1[98 - 1] += t1[98 - 2] >> 21;
        t1[98 - 2] &= 0x1fffff;
        d1 = t1[98 - 1];
        r1 = (sp_digit)(d1 / div);

        sp_2048_mul_d_98(t2, d, r1);
        sp_2048_sub_98(t1, t1, t2);
        XMEMCPY(r, t1, sizeof(*r) * 2 * 98);
        /* Propagate carries/borrows through every digit below the top one
         * so that the sign of r[97] reflects the sign of the whole value. */
        for (i=0; i<97; i++) {
            r[i+1] += r[i] >> 21;
            r[i] &= 0x1fffff;
        }
        /* A negative remainder is brought back into range by adding d. */
        sp_2048_cond_add_98(r, r, d, 0 - (r[97] < 0));
    }

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

/* Compute the remainder of a divided by m. (r = a mod m)
 *
 * r  A single precision number that receives the reduced result.
 * a  A single precision number that is to be reduced.
 * m  A single precision number that is the modulus to reduce with.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_2048_mod_98(sp_digit* r, sp_digit* a, sp_digit* m)
{
    int ret;

    /* Only the remainder is wanted, so no quotient buffer is passed. */
    ret = sp_2048_div_98(a, m, NULL, r);

    return ret;
}

#if defined(SP_RSA_PRIVATE_EXP_D) || defined(WOLFSSL_HAVE_SP_DH)
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Three implementations are selected at compile time:
 *  - WOLFSSL_SP_SMALL: one exponent bit per iteration using three heap
 *    allocated temporaries; each iteration does one Montgomery multiply and
 *    one Montgomery square, and the operand to square is chosen through
 *    addr_mask rather than a branch.
 *  - WOLFSSL_SP_CACHE_RESISTANT: the same per-bit algorithm with the
 *    temporaries on the stack, or on the heap under WOLFSSL_SMALL_STACK.
 *  - otherwise: a table of the first 32 powers of a is built and the
 *    exponent is consumed 5 bits at a time.
 *
 * r        A single precision number that is the result of the operation.
 *          2 * 98 digits are written.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When non-zero, a is reduced modulo m before it is converted to
 *          Montgomery form.
 * returns 0 on success and MEMORY_E on dynamic memory allocation failure.
 */
static int sp_2048_mod_exp_98(sp_digit* r, sp_digit* a, sp_digit* e, int bits,
    sp_digit* m, int reduceA)
{
#ifdef WOLFSSL_SP_SMALL
    sp_digit* td;
    sp_digit* t[3];
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

    td = (sp_digit*)XMALLOC(sizeof(*td) * 3 * 98 * 2, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        XMEMSET(td, 0, sizeof(*td) * 3 * 98 * 2);

        norm = t[0] = td;
        t[1] = &td[98 * 2];
        t[2] = &td[2 * 98 * 2];

        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_98(norm, m);

        if (reduceA)
            err = sp_2048_mod_98(t[1], a, m);
        else
            XMEMCPY(t[1], a, sizeof(sp_digit) * 98);
    }
    if (err == MP_OKAY) {
        /* Convert the base to Montgomery form. */
        sp_2048_mul_98(t[1], t[1], norm);
        err = sp_2048_mod_98(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        i = bits / 21;
        c = bits % 21;
        n = e[i--] << (21 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1)
                    break;

                n = e[i--];
                c = 21;
            }

            /* Next exponent bit, taken from the top of the 21-bit digit. */
            y = (n >> 20) & 1;
            n <<= 1;

            sp_2048_mont_mul_98(t[y^1], t[0], t[1], m, mp);

            /* Square t[y]; the address is formed with masks, not a branch. */
            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                    sizeof(*t[2]) * 98 * 2);
            sp_2048_mont_sqr_98(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                    sizeof(*t[2]) * 98 * 2);
        }

        /* Convert out of Montgomery form and bring the result below m. */
        sp_2048_mont_reduce_98(t[0], m, mp);
        n = sp_2048_cmp_98(t[0], m);
        sp_2048_cond_sub_98(t[0], t[0], m, (n < 0) - 1);
        XMEMCPY(r, t[0], sizeof(*r) * 98 * 2);

    }

    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);

    return err;
#elif defined(WOLFSSL_SP_CACHE_RESISTANT)
#ifndef WOLFSSL_SMALL_STACK
    sp_digit t[3][196];
#else
    sp_digit* td;
    sp_digit* t[3];
#endif
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    td = (sp_digit*)XMALLOC(sizeof(*td) * 3 * 98 * 2, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        t[0] = td;
        t[1] = &td[98 * 2];
        t[2] = &td[2 * 98 * 2];
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_98(norm, m);

        if (reduceA) {
            err = sp_2048_mod_98(t[1], a, m);
            if (err == MP_OKAY) {
                sp_2048_mul_98(t[1], t[1], norm);
                err = sp_2048_mod_98(t[1], t[1], m);
            }
        }
        else {
            sp_2048_mul_98(t[1], a, norm);
            err = sp_2048_mod_98(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        i = bits / 21;
        c = bits % 21;
        n = e[i--] << (21 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1)
                    break;

                n = e[i--];
                c = 21;
            }

            /* Next exponent bit, taken from the top of the 21-bit digit. */
            y = (n >> 20) & 1;
            n <<= 1;

            sp_2048_mont_mul_98(t[y^1], t[0], t[1], m, mp);

            /* Copy sizes are spelled out in digits: under
             * WOLFSSL_SMALL_STACK t[] holds pointers, so sizeof(t[2])
             * would be the size of a pointer, not of the number. */
            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                    sizeof(sp_digit) * 98 * 2);
            sp_2048_mont_sqr_98(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                    sizeof(sp_digit) * 98 * 2);
        }

        /* Convert out of Montgomery form and bring the result below m. */
        sp_2048_mont_reduce_98(t[0], m, mp);
        n = sp_2048_cmp_98(t[0], m);
        sp_2048_cond_sub_98(t[0], t[0], m, (n < 0) - 1);
        XMEMCPY(r, t[0], sizeof(sp_digit) * 98 * 2);
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#else
#ifndef WOLFSSL_SMALL_STACK
    sp_digit t[32][196];
#else
    sp_digit* t[32];
    sp_digit* td;
#endif
    sp_digit* norm;
    sp_digit rt[196];
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 196, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        for (i=0; i<32; i++)
            t[i] = td + i * 196;
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_98(norm, m);

        if (reduceA) {
            err = sp_2048_mod_98(t[1], a, m);
            if (err == MP_OKAY) {
                sp_2048_mul_98(t[1], t[1], norm);
                err = sp_2048_mod_98(t[1], t[1], m);
            }
        }
        else {
            sp_2048_mul_98(t[1], a, norm);
            err = sp_2048_mod_98(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Table of powers: t[k] = a^k in Montgomery form, k = 2..31.
         * Even entries are squares, odd entries are products of the two
         * entries whose indices sum to k. */
        sp_2048_mont_sqr_98(t[ 2], t[ 1], m, mp);
        sp_2048_mont_mul_98(t[ 3], t[ 2], t[ 1], m, mp);
        sp_2048_mont_sqr_98(t[ 4], t[ 2], m, mp);
        sp_2048_mont_mul_98(t[ 5], t[ 3], t[ 2], m, mp);
        sp_2048_mont_sqr_98(t[ 6], t[ 3], m, mp);
        sp_2048_mont_mul_98(t[ 7], t[ 4], t[ 3], m, mp);
        sp_2048_mont_sqr_98(t[ 8], t[ 4], m, mp);
        sp_2048_mont_mul_98(t[ 9], t[ 5], t[ 4], m, mp);
        sp_2048_mont_sqr_98(t[10], t[ 5], m, mp);
        sp_2048_mont_mul_98(t[11], t[ 6], t[ 5], m, mp);
        sp_2048_mont_sqr_98(t[12], t[ 6], m, mp);
        sp_2048_mont_mul_98(t[13], t[ 7], t[ 6], m, mp);
        sp_2048_mont_sqr_98(t[14], t[ 7], m, mp);
        sp_2048_mont_mul_98(t[15], t[ 8], t[ 7], m, mp);
        sp_2048_mont_sqr_98(t[16], t[ 8], m, mp);
        sp_2048_mont_mul_98(t[17], t[ 9], t[ 8], m, mp);
        sp_2048_mont_sqr_98(t[18], t[ 9], m, mp);
        sp_2048_mont_mul_98(t[19], t[10], t[ 9], m, mp);
        sp_2048_mont_sqr_98(t[20], t[10], m, mp);
        sp_2048_mont_mul_98(t[21], t[11], t[10], m, mp);
        sp_2048_mont_sqr_98(t[22], t[11], m, mp);
        sp_2048_mont_mul_98(t[23], t[12], t[11], m, mp);
        sp_2048_mont_sqr_98(t[24], t[12], m, mp);
        sp_2048_mont_mul_98(t[25], t[13], t[12], m, mp);
        sp_2048_mont_sqr_98(t[26], t[13], m, mp);
        sp_2048_mont_mul_98(t[27], t[14], t[13], m, mp);
        sp_2048_mont_sqr_98(t[28], t[14], m, mp);
        sp_2048_mont_mul_98(t[29], t[15], t[14], m, mp);
        sp_2048_mont_sqr_98(t[30], t[15], m, mp);
        sp_2048_mont_mul_98(t[31], t[16], t[15], m, mp);

        /* Round the bit count up to a whole number of 5-bit windows. */
        bits = ((bits + 4) / 5) * 5;
        i = ((bits + 20) / 21) - 1;
        c = bits % 21;
        if (c == 0)
            c = 21;
        if (i < 98)
            n = e[i--] << (32 - c);
        else {
            n = 0;
            i--;
        }
        if (c < 5) {
            n |= e[i--] << (11 - c);
            c += 21;
        }
        /* Mask the first window as well: when the top bit of n is set the
         * shift of the signed value would otherwise give a negative index. */
        y = (n >> 27) & 0x1f;
        n <<= 5;
        c -= 5;
        XMEMCPY(rt, t[y], sizeof(rt));
        for (; i>=0 || c>=5; ) {
            if (c < 5) {
                n |= e[i--] << (11 - c);
                c += 21;
            }
            y = (n >> 27) & 0x1f;
            n <<= 5;
            c -= 5;

            sp_2048_mont_sqr_98(rt, rt, m, mp);
            sp_2048_mont_sqr_98(rt, rt, m, mp);
            sp_2048_mont_sqr_98(rt, rt, m, mp);
            sp_2048_mont_sqr_98(rt, rt, m, mp);
            sp_2048_mont_sqr_98(rt, rt, m, mp);

            sp_2048_mont_mul_98(rt, rt, t[y], m, mp);
        }

        /* Convert out of Montgomery form and bring the result below m. */
        sp_2048_mont_reduce_98(rt, m, mp);
        n = sp_2048_cmp_98(rt, m);
        sp_2048_cond_sub_98(rt, rt, m, (n < 0) - 1);
        XMEMCPY(r, rt, sizeof(rt));
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#endif
}
#endif /* SP_RSA_PRIVATE_EXP_D || WOLFSSL_HAVE_SP_DH */

#if defined(WOLFSSL_HAVE_SP_RSA) && !defined(SP_RSA_PRIVATE_EXP_D)
/* Apply the mask m to every one of the 49 digits of a. (r[i] = a[i] & m)
 *
 * r  A single precision integer that receives the masked digits.
 * a  A single precision integer.
 * m  Mask to AND against each digit.
 */
static void sp_2048_mask_49(sp_digit* r, sp_digit* a, sp_digit m)
{
    int k;

#ifdef WOLFSSL_SP_SMALL
    k = 0;
    while (k < 49) {
        r[k] = a[k] & m;
        k++;
    }
#else
    for (k = 0; k < 48; k += 8) {
        sp_digit* rk = r + k;
        sp_digit* ak = a + k;

        rk[0] = ak[0] & m;
        rk[1] = ak[1] & m;
        rk[2] = ak[2] & m;
        rk[3] = ak[3] & m;
        rk[4] = ak[4] & m;
        rk[5] = ak[5] & m;
        rk[6] = ak[6] & m;
        rk[7] = ak[7] & m;
    }
    /* One digit left over after six groups of eight. */
    r[48] = a[48] & m;
#endif
}

#endif
#ifdef WOLFSSL_HAVE_SP_RSA
/* RSA public key operation.
 *
 * The exponent must fit in 21 bits and the modulus must be exactly 2048 bits.
 * In the non-small build an exponent of 3 is handled with one square and one
 * multiply, each followed by a modular reduction; every other exponent uses
 * left-to-right square-and-multiply in Montgomery form.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base.
 * em      Public exponent.
 * mm      Modulus.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 256 bytes long.
 * outLen  Number of bytes in result.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long, MP_EXPTMOD_E when the exponent is zero and MEMORY_E
 * when dynamic memory allocation fails.
 */
int sp_RsaPublic_2048(const byte* in, word32 inLen, mp_int* em, mp_int* mm,
    byte* out, word32* outLen)
{
#ifdef WOLFSSL_SP_SMALL
    /* Must start as NULL: the cleanup below tests d even when the parameter
     * checks fail before any allocation is attempted. */
    sp_digit* d = NULL;
    sp_digit* a;
    sp_digit* m;
    sp_digit* r;
    sp_digit* norm;
    sp_digit e[1];
    sp_digit mp;
    int i;
    int err = MP_OKAY;

    if (*outLen < 256)
        err = MP_TO_E;
    if (err == MP_OKAY && (mp_count_bits(em) > 21 || inLen > 256 ||
                                                     mp_count_bits(mm) != 2048))
        err = MP_READ_E;

    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 98 * 5, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Layout of d: a (2 * 98 digits), r (2 * 98 digits), m (98 digits).
         * norm shares storage with r. */
        a = d;
        r = a + 98 * 2;
        m = r + 98 * 2;
        norm = r;

        sp_2048_from_bin(a, 98, in, inLen);
#if DIGIT_BIT >= 21
        e[0] = em->dp[0];
#else
        e[0] = em->dp[0];
        if (em->used > 1)
            e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT;
#endif
        if (e[0] == 0)
            err = MP_EXPTMOD_E;
    }

    if (err == MP_OKAY) {
        sp_2048_from_mp(m, 98, mm);

        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_98(norm, m);
    }
    if (err == MP_OKAY) {
        /* Convert the base to Montgomery form. */
        sp_2048_mul_98(a, a, norm);
        err = sp_2048_mod_98(a, a, m);
    }
    if (err == MP_OKAY) {
        /* Locate the highest set bit of the exponent. */
        for (i=20; i>=0; i--)
            if (e[0] >> i)
                break;

        XMEMCPY(r, a, sizeof(sp_digit) * 98 * 2);
        for (i--; i>=0; i--) {
            sp_2048_mont_sqr_98(r, r, m, mp);

            if (((e[0] >> i) & 1) == 1)
                sp_2048_mont_mul_98(r, r, a, m, mp);
        }
        /* Convert out of Montgomery form and bring the result below m.
         * mp is reused to hold the comparison result. */
        sp_2048_mont_reduce_98(r, m, mp);
        mp = sp_2048_cmp_98(r, m);
        sp_2048_cond_sub_98(r, r, m, (mp < 0) - 1);

        sp_2048_to_bin(r, out);
        *outLen = 256;
    }

    if (d != NULL)
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);

    return err;
#else
#if !defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SMALL_STACK)
    sp_digit ad[196], md[98], rd[196];
#else
    sp_digit* d = NULL;
#endif
    sp_digit* a;
    sp_digit* m;
    sp_digit* r;
    sp_digit e[1];
    int err = MP_OKAY;

    if (*outLen < 256)
        err = MP_TO_E;
    if (err == MP_OKAY && (mp_count_bits(em) > 21 || inLen > 256 ||
                                                     mp_count_bits(mm) != 2048))
        err = MP_READ_E;

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 98 * 5, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Layout of d: a (2 * 98 digits), r (2 * 98 digits), m (98 digits). */
        a = d;
        r = a + 98 * 2;
        m = r + 98 * 2;
    }
#else
    a = ad;
    m = md;
    r = rd;
#endif

    if (err == MP_OKAY) {
        sp_2048_from_bin(a, 98, in, inLen);
#if DIGIT_BIT >= 21
        e[0] = em->dp[0];
#else
        e[0] = em->dp[0];
        if (em->used > 1)
            e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT;
#endif
        if (e[0] == 0)
            err = MP_EXPTMOD_E;
    }
    if (err == MP_OKAY) {
        sp_2048_from_mp(m, 98, mm);

        if (e[0] == 0x3) {
            /* r = a^2 mod m, then r = a * r mod m. */
            if (err == MP_OKAY) {
                sp_2048_sqr_98(r, a);
                err = sp_2048_mod_98(r, r, m);
            }
            if (err == MP_OKAY) {
                sp_2048_mul_98(r, a, r);
                err = sp_2048_mod_98(r, r, m);
            }
        }
        else {
            /* norm shares storage with r. */
            sp_digit* norm = r;
            int i;
            sp_digit mp;

            sp_2048_mont_setup(m, &mp);
            sp_2048_mont_norm_98(norm, m);

            if (err == MP_OKAY) {
                /* Convert the base to Montgomery form. */
                sp_2048_mul_98(a, a, norm);
                err = sp_2048_mod_98(a, a, m);
            }

            if (err == MP_OKAY) {
                /* Locate the highest set bit of the exponent. */
                for (i=20; i>=0; i--)
                    if (e[0] >> i)
                        break;

                XMEMCPY(r, a, sizeof(sp_digit) * 196);
                for (i--; i>=0; i--) {
                    sp_2048_mont_sqr_98(r, r, m, mp);

                    if (((e[0] >> i) & 1) == 1)
                        sp_2048_mont_mul_98(r, r, a, m, mp);
                }
                /* Convert out of Montgomery form and bring the result below
                 * m. mp is reused to hold the comparison result. */
                sp_2048_mont_reduce_98(r, m, mp);
                mp = sp_2048_cmp_98(r, m);
                sp_2048_cond_sub_98(r, r, m, (mp < 0) - 1);
            }
        }
    }

    if (err == MP_OKAY) {
        sp_2048_to_bin(r, out);
        *outLen = 256;
    }

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (d != NULL)
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#endif /* WOLFSSL_SP_SMALL */
}

/* RSA private key operation.
 *
 * When SP_RSA_PRIVATE_EXP_D is defined the result is computed directly as
 * in ^ dm mod mm and the CRT parameters are not used. Otherwise the input is
 * exponentiated modulo pm and modulo qm separately (using dpm and dqm) and
 * the two halves are recombined using qim; dm is then not used.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base.
 * dm      Private exponent.
 * pm      First prime.
 * qm      Second prime.
 * dpm     First prime's CRT exponent.
 * dqm     Second prime's CRT exponent.
 * qim     Inverse of second prime mod p.
 * mm      Modulus.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 256 bytes long.
 * outLen  On entry the size of out in bytes; set to 256 on success.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long and MEMORY_E when dynamic memory allocation fails.
 */
int sp_RsaPrivate_2048(const byte* in, word32 inLen, mp_int* dm,
    mp_int* pm, mp_int* qm, mp_int* dpm, mp_int* dqm, mp_int* qim, mp_int* mm,
    byte* out, word32* outLen)
{
#ifdef SP_RSA_PRIVATE_EXP_D
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    sp_digit* a;
    sp_digit* d = NULL;
    sp_digit* m;
    sp_digit* r;
    int err = MP_OKAY;

    (void)pm;
    (void)qm;
    (void)dpm;
    (void)dqm;
    (void)qim;

    if (*outLen < 256)
        err = MP_TO_E;
    if (err == MP_OKAY && (mp_count_bits(dm) > 2048 || inLen > 256 ||
                                                     mp_count_bits(mm) != 2048))
        err = MP_READ_E;

    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 98 * 4, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }
    if (err == MP_OKAY) {
        /* Buffer layout (98 * 4 digits): d[98] | a and r[98 * 2] | m[98].
         * The base/result is given double width, matching the a[196] array
         * of the stack based variant below, so m must start after it. */
        a = d + 98;
        m = a + 98 * 2;
        r = a;

        sp_2048_from_bin(a, 98, in, inLen);
        sp_2048_from_mp(d, 98, dm);
        sp_2048_from_mp(m, 98, mm);
        err = sp_2048_mod_exp_98(r, a, d, 2048, m, 0);
    }
    if (err == MP_OKAY) {
        sp_2048_to_bin(r, out);
        *outLen = 256;
    }

    if (d != NULL) {
        /* Clear the copy of the private exponent before releasing it. */
        XMEMSET(d, 0, sizeof(sp_digit) * 98);
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }

    return err;
#else
    sp_digit a[196], d[98], m[98];
    sp_digit* r = a;
    int err = MP_OKAY;

    (void)pm;
    (void)qm;
    (void)dpm;
    (void)dqm;
    (void)qim;

    if (*outLen < 256)
        err = MP_TO_E;
    if (err == MP_OKAY && (mp_count_bits(dm) > 2048 || inLen > 256 ||
                                                     mp_count_bits(mm) != 2048))
        err = MP_READ_E;

    if (err == MP_OKAY) {
        sp_2048_from_bin(a, 98, in, inLen);
        sp_2048_from_mp(d, 98, dm);
        sp_2048_from_mp(m, 98, mm);
        err = sp_2048_mod_exp_98(r, a, d, 2048, m, 0);
    }

    if (err == MP_OKAY) {
        sp_2048_to_bin(r, out);
        *outLen = 256;
    }

    /* Clear the copy of the private exponent. */
    XMEMSET(d, 0, sizeof(sp_digit) * 98);

    return err;
#endif /* WOLFSSL_SP_SMALL || defined(WOLFSSL_SMALL_STACK) */
#else
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    sp_digit* t = NULL;
    sp_digit* a;
    sp_digit* p;
    sp_digit* q;
    sp_digit* dp;
    sp_digit* dq;
    sp_digit* qi;
    sp_digit* tmp;
    sp_digit* tmpa;
    sp_digit* tmpb;
    sp_digit* r;
    int err = MP_OKAY;

    (void)dm;
    (void)mm;

    if (*outLen < 256)
        err = MP_TO_E;
    if (err == MP_OKAY && (inLen > 256 || mp_count_bits(mm) != 2048))
        err = MP_READ_E;

    if (err == MP_OKAY) {
        t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 49 * 11, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (t == NULL)
            err = MEMORY_E;
    }
    if (err == MP_OKAY) {
        /* Buffer layout (49 * 11 digits):
         *   a[98 * 2] | p[49] | q[49] | dp/dq/qi[49] | tmpa[98] | tmpb[98]
         * dp, dq and qi share one slot as they are loaded one after another.
         * tmp and r reuse the space of a, which is last read by the second
         * exponentiation. */
        a = t;
        p = a + 98 * 2;
        q = p + 49;
        qi = dq = dp = q + 49;
        tmpa = qi + 49;
        tmpb = tmpa + 98;

        tmp = t;
        r = tmp + 98;

        sp_2048_from_bin(a, 98, in, inLen);
        sp_2048_from_mp(p, 49, pm);
        sp_2048_from_mp(q, 49, qm);
        sp_2048_from_mp(dp, 49, dpm);
        err = sp_2048_mod_exp_49(tmpa, a, dp, 1024, p, 1);
    }
    if (err == MP_OKAY) {
        sp_2048_from_mp(dq, 49, dqm);
        err = sp_2048_mod_exp_49(tmpb, a, dq, 1024, q, 1);
    }
    if (err == MP_OKAY) {
        /* tmpa = (tmpa - tmpb) with p added back under a mask derived from
         * the top digit; presumably the mask is all ones when the difference
         * is negative (assumes sp_digit is a signed 32-bit type). */
        sp_2048_sub_49(tmpa, tmpa, tmpb);
        sp_2048_mask_49(tmp, p, tmpa[48] >> 31);
        sp_2048_add_49(tmpa, tmpa, tmp);

        sp_2048_from_mp(qi, 49, qim);
        sp_2048_mul_49(tmpa, tmpa, qi);
        err = sp_2048_mod_49(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        /* r = tmpb + q * tmpa */
        sp_2048_mul_49(tmpa, q, tmpa);
        sp_2048_add_98(r, tmpb, tmpa);
        sp_2048_norm_98(r);

        sp_2048_to_bin(r, out);
        *outLen = 256;
    }

    if (t != NULL) {
        /* Clear all key material and intermediates before releasing. */
        XMEMSET(t, 0, sizeof(sp_digit) * 49 * 11);
        XFREE(t, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }

    return err;
#else
    sp_digit a[98 * 2];
    sp_digit p[49], q[49], dp[49], dq[49], qi[49];
    sp_digit tmp[98], tmpa[98], tmpb[98];
    sp_digit* r = a;
    int err = MP_OKAY;

    (void)dm;
    (void)mm;

    if (*outLen < 256)
        err = MP_TO_E;
    if (err == MP_OKAY && (inLen > 256 || mp_count_bits(mm) != 2048))
        err = MP_READ_E;

    if (err == MP_OKAY) {
        sp_2048_from_bin(a, 98, in, inLen);
        sp_2048_from_mp(p, 49, pm);
        sp_2048_from_mp(q, 49, qm);
        sp_2048_from_mp(dp, 49, dpm);
        sp_2048_from_mp(dq, 49, dqm);
        sp_2048_from_mp(qi, 49, qim);

        err = sp_2048_mod_exp_49(tmpa, a, dp, 1024, p, 1);
    }
    if (err == MP_OKAY)
        err = sp_2048_mod_exp_49(tmpb, a, dq, 1024, q, 1);

    if (err == MP_OKAY) {
        /* tmpa = (tmpa - tmpb) with p added back under a mask derived from
         * the top digit; presumably the mask is all ones when the difference
         * is negative (assumes sp_digit is a signed 32-bit type). */
        sp_2048_sub_49(tmpa, tmpa, tmpb);
        sp_2048_mask_49(tmp, p, tmpa[48] >> 31);
        sp_2048_add_49(tmpa, tmpa, tmp);
        sp_2048_mul_49(tmpa, tmpa, qi);
        err = sp_2048_mod_49(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        /* r = tmpb + tmpa * q */
        sp_2048_mul_49(tmpa, tmpa, q);
        sp_2048_add_98(r, tmpb, tmpa);
        sp_2048_norm_98(r);

        sp_2048_to_bin(r, out);
        *outLen = 256;
    }

    /* Clear key material and intermediates held on the stack. */
    XMEMSET(tmpa, 0, sizeof(tmpa));
    XMEMSET(tmpb, 0, sizeof(tmpb));
    XMEMSET(p, 0, sizeof(p));
    XMEMSET(q, 0, sizeof(q));
    XMEMSET(dp, 0, sizeof(dp));
    XMEMSET(dq, 0, sizeof(dq));
    XMEMSET(qi, 0, sizeof(qi));

    return err;
#endif /* WOLFSSL_SP_SMALL || defined(WOLFSSL_SMALL_STACK) */
#endif /* SP_RSA_PRIVATE_EXP_D */
}

#endif /* WOLFSSL_HAVE_SP_RSA */
#ifdef WOLFSSL_HAVE_SP_DH
/* Convert an array of sp_digit to an mp_int.
 *
 * The 98 digits of a, each contributing 21 bits, are repacked into digits of
 * DIGIT_BIT bits in r. r is grown to hold 2048 bits before being written.
 *
 * a  A single precision integer.
 * r  A multi-precision integer.
 * returns the result of mp_grow(): MP_OKAY on success.
 */
static int sp_2048_to_mp(sp_digit* a, mp_int* r)
{
    int err;

    err = mp_grow(r, (2048 + DIGIT_BIT - 1) / DIGIT_BIT);
    if (err == MP_OKAY) {
#if DIGIT_BIT == 21
        /* Same digit width: copy straight across. */
        XMEMCPY(r->dp, a, sizeof(sp_digit) * 98);
        r->used = 98;
        mp_clamp(r);
#elif DIGIT_BIT < 21
        /* Output digits are narrower: each input digit spans several. */
        int i, j = 0, s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 98; i++) {
            /* Fill the remainder of the current output digit. */
            r->dp[j] |= a[i] << s;
            r->dp[j] &= (1l << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
            r->dp[++j] = a[i] >> s;
            /* Emit further whole output digits from this input digit. */
            while (s + DIGIT_BIT <= 21) {
                s += DIGIT_BIT;
                r->dp[j] &= (1l << DIGIT_BIT) - 1;
                r->dp[++j] = a[i] >> s;
            }
            /* Bits of the input digit already placed in the last output. */
            s = 21 - s;
        }
        r->used = (2048 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#else
        /* Output digits are wider: several input digits per output digit. */
        int i, j = 0, s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 98; i++) {
            r->dp[j] |= ((mp_digit)a[i]) << s;
            if (s + 21 >= DIGIT_BIT) {
                /* Input digit reaches the end of the output digit: mask and
                 * carry the remaining high bits into the next one. */
    #if DIGIT_BIT < 32
                r->dp[j] &= (1l << DIGIT_BIT) - 1;
    #endif
                s = DIGIT_BIT - s;
                r->dp[++j] = a[i] >> s;
                s = 21 - s;
            }
            else
                s += 21;
        }
        r->used = (2048 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#endif
    }

    return err;
}

/* Perform the modular exponentiation for Diffie-Hellman.
 *
 * Computes res = base ^ exp mod mod. The local copy of the exponent is
 * cleared before returning.
 *
 * base  Base. MP integer.
 * exp   Exponent. MP integer.
 * mod   Modulus. MP integer. Must be exactly 2048 bits long.
 * res   Result. MP integer.
 * returns 0 on success, MP_READ_E if there are too many bytes in an array
 * and MEMORY_E if memory allocation fails.
 */
int sp_ModExp_2048(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res)
{
#ifdef WOLFSSL_SP_SMALL
    int err = MP_OKAY;
    sp_digit* d = NULL;
    sp_digit* b;
    sp_digit* e;
    sp_digit* m;
    sp_digit* r;
    int expBits = mp_count_bits(exp);

    if (mp_count_bits(base) > 2048 || expBits > 2048 ||
                                                   mp_count_bits(mod) != 2048) {
        err = MP_READ_E;
    }

    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(*d) * 98 * 4, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Buffer layout (98 * 4 digits): b and r[98 * 2] | e[98] | m[98]. */
        b = d;
        e = b + 98 * 2;
        m = e + 98;
        r = b;

        sp_2048_from_mp(b, 98, base);
        sp_2048_from_mp(e, 98, exp);
        sp_2048_from_mp(m, 98, mod);

        err = sp_2048_mod_exp_98(r, b, e, expBits, m, 0);
    }

    if (err == MP_OKAY) {
        err = sp_2048_to_mp(r, res);
    }

    /* d is only non-NULL when the pointers above have been assigned. */
    if (d != NULL) {
        XMEMSET(e, 0, sizeof(sp_digit) * 98);
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }
    return err;
#else
#ifndef WOLFSSL_SMALL_STACK
    sp_digit bd[196], ed[98], md[98];
#else
    sp_digit* d = NULL;
#endif
    sp_digit* b;
    sp_digit* e;
    sp_digit* m;
    sp_digit* r;
    int err = MP_OKAY;
    int expBits = mp_count_bits(exp);

    if (mp_count_bits(base) > 2048 || expBits > 2048 ||
                                                   mp_count_bits(mod) != 2048) {
        err = MP_READ_E;
    }

#ifdef WOLFSSL_SMALL_STACK
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(*d) * 98 * 4, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Buffer layout (98 * 4 digits): b and r[98 * 2] | e[98] | m[98]. */
        b = d;
        e = b + 98 * 2;
        m = e + 98;
        r = b;
    }
#else
    r = b = bd;
    e = ed;
    m = md;
#endif

    if (err == MP_OKAY) {
        sp_2048_from_mp(b, 98, base);
        sp_2048_from_mp(e, 98, exp);
        sp_2048_from_mp(m, 98, mod);

        err = sp_2048_mod_exp_98(r, b, e, expBits, m, 0);
    }

    if (err == MP_OKAY) {
        err = sp_2048_to_mp(r, res);
    }

#ifdef WOLFSSL_SMALL_STACK
    /* e is only assigned once the allocation has succeeded, so the exponent
     * must not be cleared through it on the validation/allocation error
     * paths. */
    if (d != NULL) {
        XMEMSET(e, 0, sizeof(sp_digit) * 98);
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }
#else
    XMEMSET(e, 0, sizeof(sp_digit) * 98);
#endif

    return err;
#endif
}

/* Perform the modular exponentiation for Diffie-Hellman.
 *
 * Computes base ^ exp mod mod and writes the result big-endian to out with
 * leading zero bytes removed. The local copy of the exponent is cleared
 * before returning.
 *
 * base     Base.
 * exp      Array of bytes that is the exponent.
 * expLen   Length of data, in bytes, in exponent.
 * mod      Modulus. Must be exactly 2048 bits long.
 * out      Buffer to hold big-endian bytes of exponentiation result.
 *          Must be at least 256 bytes long.
 * outLen   Length, in bytes, of exponentiation result after leading zero
 *          bytes have been stripped.
 * returns 0 on success, MP_READ_E if there are too many bytes in an array
 * and MEMORY_E if memory allocation fails.
 */
int sp_DhExp_2048(mp_int* base, const byte* exp, word32 expLen,
    mp_int* mod, byte* out, word32* outLen)
{
#ifdef WOLFSSL_SP_SMALL
    int err = MP_OKAY;
    sp_digit* d = NULL;
    sp_digit* b;
    sp_digit* e;
    sp_digit* m;
    sp_digit* r;
    word32 i;

    if (mp_count_bits(base) > 2048 || expLen > 256 ||
                                                   mp_count_bits(mod) != 2048) {
        err = MP_READ_E;
    }

    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(*d) * 98 * 4, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Buffer layout (98 * 4 digits): b and r[98 * 2] | e[98] | m[98]. */
        b = d;
        e = b + 98 * 2;
        m = e + 98;
        r = b;

        sp_2048_from_mp(b, 98, base);
        sp_2048_from_bin(e, 98, exp, expLen);
        sp_2048_from_mp(m, 98, mod);

        err = sp_2048_mod_exp_98(r, b, e, expLen * 8, m, 0);
    }

    if (err == MP_OKAY) {
        sp_2048_to_bin(r, out);
        *outLen = 256;
        /* Strip leading zero bytes from the fixed length encoding. */
        for (i=0; i<256 && out[i] == 0; i++) {
        }
        *outLen -= i;
        XMEMMOVE(out, out + i, *outLen);
    }

    /* d is only non-NULL when the pointers above have been assigned. */
    if (d != NULL) {
        XMEMSET(e, 0, sizeof(sp_digit) * 98);
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }
    return err;
#else
#ifndef WOLFSSL_SMALL_STACK
    sp_digit bd[196], ed[98], md[98];
#else
    sp_digit* d = NULL;
#endif
    sp_digit* b;
    sp_digit* e;
    sp_digit* m;
    sp_digit* r;
    word32 i;
    int err = MP_OKAY;

    if (mp_count_bits(base) > 2048 || expLen > 256 ||
                                                   mp_count_bits(mod) != 2048) {
        err = MP_READ_E;
    }

#ifdef WOLFSSL_SMALL_STACK
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(*d) * 98 * 4, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Buffer layout (98 * 4 digits): b and r[98 * 2] | e[98] | m[98]. */
        b = d;
        e = b + 98 * 2;
        m = e + 98;
        r = b;
    }
#else
    r = b = bd;
    e = ed;
    m = md;
#endif

    if (err == MP_OKAY) {
        sp_2048_from_mp(b, 98, base);
        sp_2048_from_bin(e, 98, exp, expLen);
        sp_2048_from_mp(m, 98, mod);

        err = sp_2048_mod_exp_98(r, b, e, expLen * 8, m, 0);
    }

    if (err == MP_OKAY) {
        sp_2048_to_bin(r, out);
        *outLen = 256;
        /* Strip leading zero bytes from the fixed length encoding. */
        for (i=0; i<256 && out[i] == 0; i++) {
        }
        *outLen -= i;
        XMEMMOVE(out, out + i, *outLen);
    }

#ifdef WOLFSSL_SMALL_STACK
    /* e is only assigned once the allocation has succeeded, so the exponent
     * must not be cleared through it on the validation/allocation error
     * paths. */
    if (d != NULL) {
        XMEMSET(e, 0, sizeof(sp_digit) * 98);
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }
#else
    XMEMSET(e, 0, sizeof(sp_digit) * 98);
#endif

    return err;
#endif
}
#endif /* WOLFSSL_HAVE_SP_DH */

#endif /* WOLFSSL_SP_NO_2048 */
#endif /* SP_WORD_SIZE == 32 */

#endif
#if !defined(WOLFSSL_X86_64_BUILD) || !defined(USE_INTEL_SPEEDUP)
#if SP_WORD_SIZE == 32
#ifndef WOLFSSL_SP_NO_3072
/* Read a big endian unsigned byte array into r as 22-bit digits.
 *
 * Bytes are consumed from the end of the array (least significant first).
 * Digits of r that are not reached by the input are set to zero.
 *
 * r    A single precision integer.
 * max  Maximum number of digits to write into r.
 * a    Byte array.
 * n    Number of bytes in array to read.
 */
static void sp_3072_from_bin(sp_digit* r, int max, const byte* a, int n)
{
    int idx = n;      /* Index one past the next byte to consume. */
    int digit = 0;    /* Digit of r currently being filled. */
    int shift = 0;    /* Bits already used in the current digit. */

    r[0] = 0;
    while (--idx >= 0) {
        r[digit] |= ((sp_digit)a[idx]) << shift;
        if (shift < 14) {
            /* Whole byte fitted with room to spare in this digit. */
            shift += 8;
            continue;
        }

        /* Byte reached the top of the digit: spill the rest to the next. */
        r[digit] &= 0x3fffff;
        shift = 22 - shift;
        if (digit + 1 >= max)
            break;
        digit++;
        r[digit] = a[idx] >> shift;
        shift = 8 - shift;
    }

    /* Clear every digit above the last one written. */
    digit++;
    while (digit < max) {
        r[digit] = 0;
        digit++;
    }
}

/* Convert an mp_int to an array of sp_digit.
 *
 * The digits of a (DIGIT_BIT bits each) are repacked into 22-bit digits in
 * r. At most max digits of r are written; any digits of r not reached by the
 * input are set to zero.
 *
 * r    A single precision integer.
 * max  Maximum number of digits to write into r.
 * a    A multi-precision integer.
 */
static void sp_3072_from_mp(sp_digit* r, int max, mp_int* a)
{
#if DIGIT_BIT == 22
    int j;
    int n = a->used;

    /* Never copy more digits than r can hold, as in the other branches. */
    if (n > max)
        n = max;

    XMEMCPY(r, a->dp, sizeof(sp_digit) * n);

    for (j = n; j < max; j++)
        r[j] = 0;
#elif DIGIT_BIT > 22
    /* Input digits are wider: each one spans several output digits. */
    int i, j = 0, s = 0;

    r[0] = 0;
    for (i = 0; i < a->used && j < max; i++) {
        /* Fill the remainder of the current output digit. */
        r[j] |= a->dp[i] << s;
        r[j] &= 0x3fffff;
        s = 22 - s;
        if (j + 1 >= max)
            break;
        r[++j] = a->dp[i] >> s;
        /* Emit further whole output digits from this input digit. */
        while (s + 22 <= DIGIT_BIT) {
            s += 22;
            r[j] &= 0x3fffff;
            if (j + 1 >= max)
                break;
            if (s < DIGIT_BIT)
                r[++j] = a->dp[i] >> s;
            else
                r[++j] = 0;
        }
        s = DIGIT_BIT - s;
    }

    for (j++; j < max; j++)
        r[j] = 0;
#else
    /* Input digits are narrower: several of them fill one output digit. */
    int i, j = 0, s = 0;

    r[0] = 0;
    for (i = 0; i < a->used && j < max; i++) {
        r[j] |= ((sp_digit)a->dp[i]) << s;
        if (s + DIGIT_BIT >= 22) {
            r[j] &= 0x3fffff;
            if (j + 1 >= max)
                break;
            s = 22 - s;
            r[++j] = a->dp[i] >> s;
            s = DIGIT_BIT - s;
        }
        else
            s += DIGIT_BIT;
    }

    for (j++; j < max; j++)
        r[j] = 0;
#endif
}

/* Write r as big endian to byte array.
 * Fixed length number of bytes written: 384
 *
 * Note that r is modified: carries are propagated so that each of the lower
 * 139 digits holds at most 22 bits before the bytes are extracted.
 *
 * r  A single precision integer.
 * a  Byte array.
 */
static void sp_3072_to_bin(sp_digit* r, byte* a)
{
    int i, j, s = 0, b;

    /* Propagate carries so every digit below the top one is 22 bits. */
    for (i=0; i<139; i++) {
        r[i+1] += r[i] >> 22;
        r[i] &= 0x3fffff;
    }
    /* j walks the output from the last (least significant) byte down. */
    j = 3072 / 8 - 1;
    a[j] = 0;
    for (i=0; i<140 && j>=0; i++) {
        /* b counts the bits of r[i] that have been written so far. */
        b = 0;
        /* Top up the partially filled byte; s is its current bit offset. */
        a[j--] |= r[i] << s; b += 8 - s;
        if (j < 0)
            break;
        /* Write further bytes until all 22 bits of the digit are out. */
        while (b < 22) {
            a[j--] = r[i] >> b; b += 8;
            if (j < 0)
                break;
        }
        if (j < 0)
            break;
        /* Bit offset within the last byte written where the next digit
         * starts. */
        s = 8 - (b - 22);
        a[j] = 0;
        /* Step back onto the last byte written so the next digit is ORed
         * into it. */
        if (s != 0)
            j++;
    }
}

#ifndef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * Schoolbook multiplication of two 70 digit numbers. All partial products
 * are accumulated in 64-bit columns first and the carries are propagated
 * afterwards, so r is only written once both inputs have been fully read.
 *
 * r  A single precision integer. Receives 140 digits.
 * a  A single precision integer.
 * b  A single precision integer.
 */
static void sp_3072_mul_70(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    int64_t col[140];
    int row, k;

    XMEMSET(col, 0, sizeof(col));

    /* Accumulate every product a[row] * b[k] into column row + k. */
    for (row = 0; row < 70; row++) {
        const int64_t ai = a[row];

        for (k = 0; k < 70; k++)
            col[row + k] += ai * b[k];
    }

    /* Reduce each column to 22 bits, pushing the excess upwards. */
    k = 0;
    while (k < 139) {
        r[k] = col[k] & 0x3fffff;
        col[k + 1] += col[k] >> 22;
        k++;
    }
    r[139] = (sp_digit)col[139];
}

/* Square a and put result in r. (r = a * a)
 *
 * Each off-diagonal product is computed once and doubled; the diagonal
 * products are added once. Carries are propagated after all columns have
 * been accumulated, so r is only written once a has been fully read.
 *
 * r  A single precision integer. Receives 140 digits.
 * a  A single precision integer.
 */
static void sp_3072_sqr_70(sp_digit* r, const sp_digit* a)
{
    int64_t col[140];
    int row, k;

    XMEMSET(col, 0, sizeof(col));

    for (row = 0; row < 70; row++) {
        const int64_t ai = a[row];

        /* Off-diagonal terms appear twice in the square. */
        for (k = 0; k < row; k++)
            col[row + k] += (ai * a[k]) * 2;
        /* Diagonal term. */
        col[row + row] += ai * a[row];
    }

    /* Reduce each column to 22 bits, pushing the excess upwards. */
    k = 0;
    while (k < 139) {
        r[k] = col[k] & 0x3fffff;
        col[k + 1] += col[k] >> 22;
        k++;
    }
    r[139] = (sp_digit)col[139];
}

/* Add b to a into r. (r = a + b)
 *
 * Digits are added independently in ascending order; no carry is
 * propagated between digits.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_3072_add_70(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int base, k;

    /* Eight groups of eight digits. */
    for (base = 0; base < 64; base += 8) {
        for (k = 0; k < 8; k++)
            r[base + k] = a[base + k] + b[base + k];
    }
    /* Remaining six digits. */
    for (k = 64; k < 70; k++)
        r[k] = a[k] + b[k];

    return 0;
}

/* Add b to a into r. (r = a + b)
 *
 * Digits are added independently in ascending order; no carry is
 * propagated between digits.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_3072_add_140(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int base, k;

    /* Seventeen groups of eight digits. */
    for (base = 0; base < 136; base += 8) {
        for (k = 0; k < 8; k++)
            r[base + k] = a[base + k] + b[base + k];
    }
    /* Remaining four digits. */
    for (k = 136; k < 140; k++)
        r[k] = a[k] + b[k];

    return 0;
}

/* Sub b from a into r. (r = a - b)
 *
 * Digits are subtracted independently in ascending order; no borrow is
 * propagated between digits.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_3072_sub_140(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int base, k;

    /* Seventeen groups of eight digits. */
    for (base = 0; base < 136; base += 8) {
        for (k = 0; k < 8; k++)
            r[base + k] = a[base + k] - b[base + k];
    }
    /* Remaining four digits. */
    for (k = 136; k < 140; k++)
        r[k] = a[k] - b[k];

    return 0;
}

/* Multiply a and b into r. (r = a * b)
 *
 * One level of Karatsuba over 70 digit halves:
 *   low  = aLo * bLo, high = aHi * bHi,
 *   mid  = (aLo + aHi) * (bLo + bHi) - high - low,
 *   r    = low + mid * 2^(70 * 22) + high * 2^(140 * 22).
 * The order of the calls is significant: both sums are taken before any
 * part of r is written, and the high product is stored before the low one.
 *
 * r  A single precision integer. Receives 280 digits.
 * a  A single precision integer.
 * b  A single precision integer.
 */
static void sp_3072_mul_140(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    sp_digit mid[140];
    sp_digit sumB[70];
    sp_digit* const sumA = mid;       /* Shares storage with mid. */
    sp_digit* const low = r;
    sp_digit* const high = r + 140;
    const sp_digit* const aHi = a + 70;
    const sp_digit* const bHi = b + 70;

    sp_3072_add_70(sumA, a, aHi);
    sp_3072_add_70(sumB, b, bHi);
    sp_3072_mul_70(high, aHi, bHi);
    sp_3072_mul_70(low, a, b);
    /* Output overlaps the first input here. */
    sp_3072_mul_70(mid, sumA, sumB);
    sp_3072_sub_140(mid, mid, high);
    sp_3072_sub_140(mid, mid, low);
    sp_3072_add_140(r + 70, r + 70, mid);
}

/* Square a and put result in r. (r = a * a)
 *
 * One level of Karatsuba over 70 digit halves:
 *   low  = aLo^2, high = aHi^2,
 *   mid  = (aLo + aHi)^2 - high - low,
 *   r    = low + mid * 2^(70 * 22) + high * 2^(140 * 22).
 * The order of the calls is significant: the sum is taken before any part
 * of r is written, and the high square is stored before the low one.
 *
 * r  A single precision integer. Receives 280 digits.
 * a  A single precision integer.
 */
static void sp_3072_sqr_140(sp_digit* r, const sp_digit* a)
{
    sp_digit mid[140];
    sp_digit* const sum = mid;        /* Shares storage with mid. */
    sp_digit* const low = r;
    sp_digit* const high = r + 140;
    const sp_digit* const aHi = a + 70;

    sp_3072_add_70(sum, a, aHi);
    sp_3072_sqr_70(high, aHi);
    sp_3072_sqr_70(low, a);
    /* Output overlaps the input here. */
    sp_3072_sqr_70(mid, sum);
    sp_3072_sub_140(mid, mid, high);
    sp_3072_sub_140(mid, mid, low);
    sp_3072_add_140(r + 70, r + 70, mid);
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Add b to a into r. (r = a + b)
 *
 * Digits are added independently in ascending order; no carry is
 * propagated between digits.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_3072_add_140(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int pos = 0;

    do {
        r[pos] = a[pos] + b[pos];
    } while (++pos < 140);

    return 0;
}
#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Sub b from a into r. (r = a - b)
 *
 * Digits are subtracted independently in ascending order; no borrow is
 * propagated between digits.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_3072_sub_140(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int pos = 0;

    do {
        r[pos] = a[pos] - b[pos];
    } while (++pos < 140);

    return 0;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * Product scanning from the most significant column down to the least.
 * The running value keeps the low 22 bits of the previous (higher) column
 * shifted up by 22 bits, so that the carry out of the current column can be
 * added into the two digits above it.
 *
 * r  A single precision integer. Receives 280 digits.
 * a  A single precision integer.
 * b  A single precision integer.
 */
static void sp_3072_mul_140(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    int64_t acc;
    int col, i;

    /* Top column has the single product a[139] * b[139]. */
    acc = ((int64_t)a[139]) * b[139];
    r[279] = (sp_digit)(acc >> 22);
    acc = (acc & 0x3fffff) << 22;

    for (col = 277; col >= 0; col--) {
        /* Range of i for which both i and col - i index a valid digit. */
        const int top = (col < 139) ? col : 139;
        const int bot = (col > 139) ? (col - 139) : 0;

        for (i = top; i >= bot; i--)
            acc += ((int64_t)a[i]) * b[col - i];

        r[col + 2] += acc >> 44;
        r[col + 1] = (acc >> 22) & 0x3fffff;
        acc = (acc & 0x3fffff) << 22;
    }
    r[0] = (sp_digit)(acc >> 22);
}

/* Square a and put result in r. (r = a * a)
 *
 * Product scanning from the most significant column down to the least.
 * Off-diagonal products are computed once and doubled; the diagonal product
 * is added once for columns that have one.
 *
 * r  A single precision integer. Receives 280 digits.
 * a  A single precision integer.
 */
static void sp_3072_sqr_140(sp_digit* r, const sp_digit* a)
{
    int i, j, k;
    int64_t c;

    /* Top column has the single product a[139] * a[139]. */
    c = ((int64_t)a[139]) * a[139];
    r[279] = (sp_digit)(c >> 22);
    /* Keep the low 22 bits, shifted up, to combine with the next column. */
    c = (c & 0x3fffff) << 22;
    for (k = 277; k >= 0; k--) {
        for (i = 139; i >= 0; i--) {
            j = k - i;
            /* Stop once i has come down to (or passed) its partner j: the
             * remaining pairs are mirror images of those already added. */
            if (j >= 140 || i <= j)
                break;
            /* i is still too large for this column. */
            if (j < 0)
                continue;

            c += ((int64_t)a[i]) * a[j] * 2;
        }
        /* Uses the values of i and j left by the loop above: they are equal
         * exactly when the column has a diagonal term. */
        if (i == j)
           c += ((int64_t)a[i]) * a[i];

        /* Carry above 44 bits goes two digits up, the middle 22 bits are the
         * digit above, and the low 22 bits are carried into the next
         * column. */
        r[k + 2] += c >> 44;
        r[k + 1] = (c >> 22) & 0x3fffff;
        c = (c & 0x3fffff) << 22;
    }
    r[0] = (sp_digit)(c >> 22);
}

#endif /* WOLFSSL_SP_SMALL */
#if !defined(SP_RSA_PRIVATE_EXP_D) && defined(WOLFSSL_HAVE_SP_RSA)
#ifdef WOLFSSL_SP_SMALL
/* Add b to a into r. (r = a + b)
 *
 * Digits are added independently in ascending order; no carry is
 * propagated between digits.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_3072_add_70(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int pos = 0;

    do {
        r[pos] = a[pos] + b[pos];
    } while (++pos < 70);

    return 0;
}
#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Sub b from a into r. (r = a - b)
 *
 * Digits are subtracted independently in ascending order; no borrow is
 * propagated between digits.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_3072_sub_70(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int pos = 0;

    do {
        r[pos] = a[pos] - b[pos];
    } while (++pos < 70);

    return 0;
}

#else
/* Sub b from a into r. (r = a - b)
 *
 * Digits are subtracted independently in ascending order; no borrow is
 * propagated between digits.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_3072_sub_70(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int base, k;

    /* Eight groups of eight digits. */
    for (base = 0; base < 64; base += 8) {
        for (k = 0; k < 8; k++)
            r[base + k] = a[base + k] - b[base + k];
    }
    /* Remaining six digits. */
    for (k = 64; k < 70; k++)
        r[k] = a[k] - b[k];

    return 0;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * Product scanning from the most significant column down to the least.
 * The running value keeps the low 22 bits of the previous (higher) column
 * shifted up by 22 bits, so that the carry out of the current column can be
 * added into the two digits above it.
 *
 * r  A single precision integer. Receives 140 digits.
 * a  A single precision integer.
 * b  A single precision integer.
 */
static void sp_3072_mul_70(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    int64_t acc;
    int col, i;

    /* Top column has the single product a[69] * b[69]. */
    acc = ((int64_t)a[69]) * b[69];
    r[139] = (sp_digit)(acc >> 22);
    acc = (acc & 0x3fffff) << 22;

    for (col = 137; col >= 0; col--) {
        /* Range of i for which both i and col - i index a valid digit. */
        const int top = (col < 69) ? col : 69;
        const int bot = (col > 69) ? (col - 69) : 0;

        for (i = top; i >= bot; i--)
            acc += ((int64_t)a[i]) * b[col - i];

        r[col + 2] += acc >> 44;
        r[col + 1] = (acc >> 22) & 0x3fffff;
        acc = (acc & 0x3fffff) << 22;
    }
    r[0] = (sp_digit)(acc >> 22);
}

/* Square a and put result in r. (r = a * a)
 *
 * Product scanning from the most significant column down to the least.
 * Off-diagonal products are computed once and doubled; the diagonal product
 * is added once for columns that have one.
 *
 * r  A single precision integer. Receives 140 digits.
 * a  A single precision integer.
 */
static void sp_3072_sqr_70(sp_digit* r, const sp_digit* a)
{
    int i, j, k;
    int64_t c;

    /* Top column has the single product a[69] * a[69]. */
    c = ((int64_t)a[69]) * a[69];
    r[139] = (sp_digit)(c >> 22);
    /* Keep the low 22 bits, shifted up, to combine with the next column. */
    c = (c & 0x3fffff) << 22;
    for (k = 137; k >= 0; k--) {
        for (i = 69; i >= 0; i--) {
            j = k - i;
            /* Stop once i has come down to (or passed) its partner j: the
             * remaining pairs are mirror images of those already added. */
            if (j >= 70 || i <= j)
                break;
            /* i is still too large for this column. */
            if (j < 0)
                continue;

            c += ((int64_t)a[i]) * a[j] * 2;
        }
        /* Uses the values of i and j left by the loop above: they are equal
         * exactly when the column has a diagonal term. */
        if (i == j)
           c += ((int64_t)a[i]) * a[i];

        /* Carry above 44 bits goes two digits up, the middle 22 bits are the
         * digit above, and the low 22 bits are carried into the next
         * column. */
        r[k + 2] += c >> 44;
        r[k + 1] = (c >> 22) & 0x3fffff;
        c = (c & 0x3fffff) << 22;
    }
    r[0] = (sp_digit)(c >> 22);
}

#endif /* WOLFSSL_SP_SMALL */
#endif /* !SP_RSA_PRIVATE_EXP_D && WOLFSSL_HAVE_SP_RSA */

/* Calculate the bottom digit of -1/a mod 2^n.
 *
 * Only the lowest digit of a is used. The inverse is refined in steps, each
 * one doubling the number of correct low bits (see the per-line notes), and
 * is then reduced to 22 bits and negated.
 *
 * a    A single precision number.
 * rho  Bottom word of inverse.
 */
static void sp_3072_mont_setup(sp_digit* a, sp_digit* rho)
{
    sp_digit x, b;

    b = a[0];
    x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
    x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
    x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
    x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
    /* Only the low 22 bits (one digit) of the inverse are needed. */
    x &= 0x3fffff;

    /* rho = -1/m mod b, with b = 2^22 here: negate x modulo 2^22. */
    *rho = (1L << 22) - x;
}

#if !defined(SP_RSA_PRIVATE_EXP_D) && defined(WOLFSSL_HAVE_SP_RSA)
/* r = 2^n mod m where n is the number of bits to reduce by.
 *
 * r is first set to all ones over 69 digits of 22 bits plus 18 bits in the
 * top digit (2^1536 - 1), then m is subtracted and one is added. A single
 * subtraction is performed, so m is expected to use the full width.
 *
 * r  A single precision number.
 * m  A single precision number.
 */
static void sp_3072_mont_norm_70(sp_digit* r, sp_digit* m)
{
    /* Set r = 2^n - 1. */
#ifdef WOLFSSL_SP_SMALL
    int pos = 0;

    do {
        r[pos] = 0x3fffff;
    } while (++pos < 69);
#else
    int base, k;

    /* Eight groups of eight digits, then the next five. */
    for (base = 0; base < 64; base += 8) {
        for (k = 0; k < 8; k++)
            r[base + k] = 0x3fffff;
    }
    for (k = 64; k < 69; k++)
        r[k] = 0x3fffff;
#endif
    /* Top digit only holds the remaining 18 bits. */
    r[69] = 0x3ffffl;

    /* r = (2^n - 1) mod n */
    sp_3072_sub_70(r, r, m);

    /* Add one so r = 2^n mod m */
    r[0] += 1;
}

/* Compare a with b in constant time.
 *
 * All 70 digits are always visited, from most to least significant. The
 * first non-zero digit difference is latched into r; after that the mask
 * (0 - !r) is zero so later differences are ignored. No branch depends on
 * the digit values.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static sp_digit sp_3072_cmp_70(const sp_digit* a, const sp_digit* b)
{
    sp_digit r = 0;
#ifdef WOLFSSL_SP_SMALL
    int i;

    for (i=69; i>=0; i--)
        r |= (a[i] - b[i]) & (0 - !r);
#else
    int i;

    /* Top six digits first, then eight groups of eight going downwards. */
    r |= (a[69] - b[69]) & (0 - !r);
    r |= (a[68] - b[68]) & (0 - !r);
    r |= (a[67] - b[67]) & (0 - !r);
    r |= (a[66] - b[66]) & (0 - !r);
    r |= (a[65] - b[65]) & (0 - !r);
    r |= (a[64] - b[64]) & (0 - !r);
    for (i = 56; i >= 0; i -= 8) {
        r |= (a[i + 7] - b[i + 7]) & (0 - !r);
        r |= (a[i + 6] - b[i + 6]) & (0 - !r);
        r |= (a[i + 5] - b[i + 5]) & (0 - !r);
        r |= (a[i + 4] - b[i + 4]) & (0 - !r);
        r |= (a[i + 3] - b[i + 3]) & (0 - !r);
        r |= (a[i + 2] - b[i + 2]) & (0 - !r);
        r |= (a[i + 1] - b[i + 1]) & (0 - !r);
        r |= (a[i + 0] - b[i + 0]) & (0 - !r);
    }
#endif /* WOLFSSL_SP_SMALL */

    return r;
}

/* Conditionally subtract b from a using the mask m.
 * m is -1 to subtract and 0 when not.
 *
 * Every digit of b is ANDed with the mask before being subtracted, so the
 * same operations are performed whether or not the subtraction takes
 * effect. No borrow is propagated between digits.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 */
static void sp_3072_cond_sub_70(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
#ifdef WOLFSSL_SP_SMALL
    int pos = 0;

    do {
        r[pos] = a[pos] - (b[pos] & m);
    } while (++pos < 70);
#else
    int base, k;

    /* Eight groups of eight digits. */
    for (base = 0; base < 64; base += 8) {
        for (k = 0; k < 8; k++)
            r[base + k] = a[base + k] - (b[base + k] & m);
    }
    /* Remaining six digits. */
    for (k = 64; k < 70; k++)
        r[k] = a[k] - (b[k] & m);
#endif /* WOLFSSL_SP_SMALL */
}

/* Mul a by scalar b and add into r. (r += a * b)
 *
 * Touches r[0] to r[70]: the carry out of the top product is added into
 * r[70].
 * The two build variants differ in how r is left: the small variant stores
 * each of r[0..69] masked to 22 bits with the carry propagated, while the
 * unrolled variant adds the low and carried high parts of the products into
 * r without masking r itself.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A scalar.
 */
static void sp_3072_mul_add_70(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
#ifdef WOLFSSL_SP_SMALL
    int64_t tb = b;
    int64_t t = 0;
    int i;

    for (i = 0; i < 70; i++) {
        /* Running value: carry in + product + existing digit. */
        t += (tb * a[i]) + r[i];
        r[i] = t & 0x3fffff;
        t >>= 22;
    }
    r[70] += t;
#else
    int64_t tb = b;
    int64_t t[8];
    int i;

    /* t[] is a rotating window of products: each digit of r receives the
     * low 22 bits of its own product plus the high part of the previous
     * one. */
    t[0] = tb * a[0]; r[0] += t[0] & 0x3fffff;
    for (i = 0; i < 64; i += 8) {
        t[1] = tb * a[i+1];
        r[i+1] += (t[0] >> 22) + (t[1] & 0x3fffff);
        t[2] = tb * a[i+2];
        r[i+2] += (t[1] >> 22) + (t[2] & 0x3fffff);
        t[3] = tb * a[i+3];
        r[i+3] += (t[2] >> 22) + (t[3] & 0x3fffff);
        t[4] = tb * a[i+4];
        r[i+4] += (t[3] >> 22) + (t[4] & 0x3fffff);
        t[5] = tb * a[i+5];
        r[i+5] += (t[4] >> 22) + (t[5] & 0x3fffff);
        t[6] = tb * a[i+6];
        r[i+6] += (t[5] >> 22) + (t[6] & 0x3fffff);
        t[7] = tb * a[i+7];
        r[i+7] += (t[6] >> 22) + (t[7] & 0x3fffff);
        /* t[0] is reloaded here so it carries into the next iteration. */
        t[0] = tb * a[i+8];
        r[i+8] += (t[7] >> 22) + (t[0] & 0x3fffff);
    }
    /* Digits 65 to 69; t[0] still holds the product for digit 64. */
    t[1] = tb * a[65]; r[65] += (t[0] >> 22) + (t[1] & 0x3fffff);
    t[2] = tb * a[66]; r[66] += (t[1] >> 22) + (t[2] & 0x3fffff);
    t[3] = tb * a[67]; r[67] += (t[2] >> 22) + (t[3] & 0x3fffff);
    t[4] = tb * a[68]; r[68] += (t[3] >> 22) + (t[4] & 0x3fffff);
    t[5] = tb * a[69]; r[69] += (t[4] >> 22) + (t[5] & 0x3fffff);
    r[70] +=  t[5] >> 22;
#endif /* WOLFSSL_SP_SMALL */
}

/* Normalize the values in each word to 22 bits.
 *
 * Working upwards from the lowest digit, anything above bit 21 of a digit
 * is added to the next digit and the digit is masked to 22 bits. The top
 * digit (a[69]) receives the final carry and is not masked.
 *
 * a  Array of sp_digit to normalize.
 */
static void sp_3072_norm_70(sp_digit* a)
{
#ifdef WOLFSSL_SP_SMALL
    int pos = 0;

    do {
        a[pos + 1] += a[pos] >> 22;
        a[pos] &= 0x3fffff;
    } while (++pos < 69);
#else
    int base, k;

    /* Each group performs nine steps while advancing by eight, so its last
     * step is repeated as the first step of the following group; the
     * repeated step finds the digit already masked and adds a zero carry. */
    for (base = 0; base < 64; base += 8) {
        for (k = 0; k < 9; k++) {
            a[base + k + 1] += a[base + k] >> 22;
            a[base + k] &= 0x3fffff;
        }
    }
    /* Digits 64 to 68, carrying into the top digit. */
    for (k = 64; k < 69; k++) {
        a[k + 1] += a[k] >> 22;
        a[k] &= 0x3fffff;
    }
#endif
}

/* Move the upper half of a 140 digit value down into the low 70 digits.
 *
 * The value is shifted right by 69 whole digits plus 18 bits (1536 bits in
 * total) and the upper 70 digits of r are then cleared. All reads of a[] run
 * ahead of the writes to r[], so r may be the same buffer as a.
 *
 * r  A single precision number.
 * a  A single precision number.
 */
static void sp_3072_mont_shift_70(sp_digit* r, const sp_digit* a)
{
    sp_digit out;   /* Bits waiting to be written to the next result digit. */
    sp_digit in;    /* Current source digit plus the carry of the previous. */
    int k;

    in = a[70];
    out = a[69] >> 18;
#ifdef WOLFSSL_SP_SMALL
    for (k = 0; k < 69; k++) {
        out += (in & 0x3fffff) << 4;
        r[k] = out & 0x3fffff;
        out >>= 22;
        in = a[71 + k] + (in >> 22);
    }
#else
    for (k = 0; k < 64; k += 8) {
        const sp_digit* src = a + k + 71;
        sp_digit* dst = r + k;

        out += (in & 0x3fffff) << 4;
        dst[0] = out & 0x3fffff;
        out >>= 22;
        in = src[0] + (in >> 22);
        out += (in & 0x3fffff) << 4;
        dst[1] = out & 0x3fffff;
        out >>= 22;
        in = src[1] + (in >> 22);
        out += (in & 0x3fffff) << 4;
        dst[2] = out & 0x3fffff;
        out >>= 22;
        in = src[2] + (in >> 22);
        out += (in & 0x3fffff) << 4;
        dst[3] = out & 0x3fffff;
        out >>= 22;
        in = src[3] + (in >> 22);
        out += (in & 0x3fffff) << 4;
        dst[4] = out & 0x3fffff;
        out >>= 22;
        in = src[4] + (in >> 22);
        out += (in & 0x3fffff) << 4;
        dst[5] = out & 0x3fffff;
        out >>= 22;
        in = src[5] + (in >> 22);
        out += (in & 0x3fffff) << 4;
        dst[6] = out & 0x3fffff;
        out >>= 22;
        in = src[6] + (in >> 22);
        out += (in & 0x3fffff) << 4;
        dst[7] = out & 0x3fffff;
        out >>= 22;
        in = src[7] + (in >> 22);
    }
    /* Tail: result digits 64..68 from source digits 135..139. */
    out += (in & 0x3fffff) << 4;
    r[64] = out & 0x3fffff;
    out >>= 22;
    in = a[135] + (in >> 22);
    out += (in & 0x3fffff) << 4;
    r[65] = out & 0x3fffff;
    out >>= 22;
    in = a[136] + (in >> 22);
    out += (in & 0x3fffff) << 4;
    r[66] = out & 0x3fffff;
    out >>= 22;
    in = a[137] + (in >> 22);
    out += (in & 0x3fffff) << 4;
    r[67] = out & 0x3fffff;
    out >>= 22;
    in = a[138] + (in >> 22);
    out += (in & 0x3fffff) << 4;
    r[68] = out & 0x3fffff;
    out >>= 22;
    in = a[139] + (in >> 22);
#endif /* WOLFSSL_SP_SMALL */
    /* Top digit takes everything that is left, unmasked. */
    out += in << 4;
    r[69] = out;
    XMEMSET(&r[70], 0, sizeof(*r) * 70);
}

/* Reduce a double-width number back to 70 digits (1536 bits) using
 * Montgomery reduction.
 *
 * For each of the 70 low digits a multiple of m is added so that the digit
 * becomes zero modulo 2^22 (2^18 for the top digit, as 69 * 22 + 18 = 1536).
 * The result is then shifted down, m is conditionally subtracted once and
 * the digits are normalized.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
static void sp_3072_mont_reduce_70(sp_digit* a, sp_digit* m, sp_digit mp)
{
    int i;
    sp_digit mu;

    for (i=0; i<69; i++) {
        /* Only the low 22 bits of the product are needed, but the product
         * itself can exceed the range of sp_digit (the other helpers in this
         * file widen to int64_t for the same reason), so multiply in 64 bits
         * to avoid signed overflow. */
        mu = (sp_digit)(((int64_t)a[i] * mp) & 0x3fffff);
        sp_3072_mul_add_70(a+i, m, mu);
        a[i+1] += a[i] >> 22;
    }
    /* Top digit only holds 18 bits of the 1536 bit number. */
    mu = (sp_digit)(((int64_t)a[i] * mp) & 0x3ffff);
    sp_3072_mul_add_70(a+i, m, mu);
    a[i+1] += a[i] >> 22;
    a[i] &= 0x3fffff;

    sp_3072_mont_shift_70(a, a);
    /* Subtract m when the shifted result has bits above bit 1535. */
    sp_3072_cond_sub_70(a, a, m, 0 - ((a[69] >> 18) > 0));
    sp_3072_norm_70(a);
}

/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * Computes the full product with sp_3072_mul_70() and then reduces it in
 * place with sp_3072_mont_reduce_70().
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_3072_mont_mul_70(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit* m, sp_digit mp)
{
    sp_3072_mul_70(r, a, b);
    sp_3072_mont_reduce_70(r, m, mp);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * Computes the full square with sp_3072_sqr_70() and then reduces it in
 * place with sp_3072_mont_reduce_70().
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_3072_mont_sqr_70(sp_digit* r, sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    sp_3072_sqr_70(r, a);
    sp_3072_mont_reduce_70(r, m, mp);
}

/* Multiply a by the single digit b and store the product. (r = a * b)
 *
 * Writes 71 digits: r[0..69] plus the final carry in r[70].
 *
 * r  A single precision integer that receives the product.
 * a  A single precision integer of 70 digits.
 * b  A scalar.
 */
static void sp_3072_mul_d_70(sp_digit* r, const sp_digit* a, const sp_digit b)
{
    const int64_t mul = b;
    int k;
#ifdef WOLFSSL_SP_SMALL
    int64_t acc = 0;

    /* Running accumulator: the low 22 bits are stored, the rest carries on. */
    for (k = 0; k < 70; k++) {
        acc += mul * a[k];
        r[k] = acc & 0x3fffff;
        acc >>= 22;
    }
    r[70] = (sp_digit)acc;
#else
    int64_t p[8];

    /* Digit k is the high part of product k-1 plus the low 22 bits of
     * product k; the sum is stored without masking. */
    p[0] = mul * a[0];
    r[0] = p[0] & 0x3fffff;
    for (k = 0; k < 64; k += 8) {
        const sp_digit* ak = a + k;
        sp_digit* rk = r + k;

        p[1] = mul * ak[1];
        rk[1] = (sp_digit)(p[0] >> 22) + (p[1] & 0x3fffff);
        p[2] = mul * ak[2];
        rk[2] = (sp_digit)(p[1] >> 22) + (p[2] & 0x3fffff);
        p[3] = mul * ak[3];
        rk[3] = (sp_digit)(p[2] >> 22) + (p[3] & 0x3fffff);
        p[4] = mul * ak[4];
        rk[4] = (sp_digit)(p[3] >> 22) + (p[4] & 0x3fffff);
        p[5] = mul * ak[5];
        rk[5] = (sp_digit)(p[4] >> 22) + (p[5] & 0x3fffff);
        p[6] = mul * ak[6];
        rk[6] = (sp_digit)(p[5] >> 22) + (p[6] & 0x3fffff);
        p[7] = mul * ak[7];
        rk[7] = (sp_digit)(p[6] >> 22) + (p[7] & 0x3fffff);
        p[0] = mul * ak[8];
        rk[8] = (sp_digit)(p[7] >> 22) + (p[0] & 0x3fffff);
    }
    /* Remaining digits 65..69; p[0] still holds the product for digit 64. */
    p[1] = mul * a[65];
    r[65] = (sp_digit)(p[0] >> 22) + (p[1] & 0x3fffff);
    p[2] = mul * a[66];
    r[66] = (sp_digit)(p[1] >> 22) + (p[2] & 0x3fffff);
    p[3] = mul * a[67];
    r[67] = (sp_digit)(p[2] >> 22) + (p[3] & 0x3fffff);
    p[4] = mul * a[68];
    r[68] = (sp_digit)(p[3] >> 22) + (p[4] & 0x3fffff);
    p[5] = mul * a[69];
    r[69] = (sp_digit)(p[4] >> 22) + (p[5] & 0x3fffff);
    r[70] = (sp_digit)(p[5] >> 22);
#endif /* WOLFSSL_SP_SMALL */
}

/* Add b to a digit by digit when the mask is set. (r = a + (b & m))
 * m is -1 to add and 0 when not. No carries are propagated.
 *
 * r  A single precision number representing conditional add result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply.
 */
static void sp_3072_cond_add_70(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int k;
#ifdef WOLFSSL_SP_SMALL
    for (k = 0; k < 70; k++)
        r[k] = a[k] + (b[k] & m);
#else
    for (k = 0; k < 64; k += 8) {
        r[k]     = a[k]     + (b[k]     & m);
        r[k + 1] = a[k + 1] + (b[k + 1] & m);
        r[k + 2] = a[k + 2] + (b[k + 2] & m);
        r[k + 3] = a[k + 3] + (b[k + 3] & m);
        r[k + 4] = a[k + 4] + (b[k + 4] & m);
        r[k + 5] = a[k + 5] + (b[k + 5] & m);
        r[k + 6] = a[k + 6] + (b[k + 6] & m);
        r[k + 7] = a[k + 7] + (b[k + 7] & m);
    }
    /* Last six digits that do not fill a whole group of eight. */
    r[64] = a[64] + (b[64] & m);
    r[65] = a[65] + (b[65] & m);
    r[66] = a[66] + (b[66] & m);
    r[67] = a[67] + (b[67] & m);
    r[68] = a[68] + (b[68] & m);
    r[69] = a[69] + (b[69] & m);
#endif /* WOLFSSL_SP_SMALL */
}

/* Divide d in a and put remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * The quotient is estimated one digit at a time from the top digit of the
 * divisor; each estimate is subtracted and then corrected by adding back a
 * multiple of d. 2 * 70 digits are read from a and 2 * 70 digits are written
 * to r. The digits of r are not fully normalized on return.
 *
 * a  Number to be divided.
 * d  Number to divide with.
 * m  Multiplier result (unused).
 * r  Remainder from the division.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_3072_div_70(sp_digit* a, sp_digit* d, sp_digit* m,
        sp_digit* r)
{
    int i;
    int64_t d1;
    sp_digit div, r1;
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    sp_digit* td;
#else
    sp_digit t1d[140], t2d[140];
#endif
    sp_digit* t1;
    sp_digit* t2;
    int err = MP_OKAY;

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    td = XMALLOC(sizeof(sp_digit) * 4 * 70, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    if (td != NULL) {
        t1 = td;
        t2 = td + 2 * 70;
    }
    else
        err = MEMORY_E;
#else
    t1 = t1d;
    t2 = t2d;
#endif

    (void)m;

    if (err == MP_OKAY) {
        div = d[69];
        XMEMCPY(t1, a, sizeof(*t1) * 2 * 70);
        for (i=69; i>=0; i--) {
            /* Estimate the quotient digit from the top two digits. */
            t1[70 + i] += t1[70 + i - 1] >> 22;
            t1[70 + i - 1] &= 0x3fffff;
            d1 = t1[70 + i];
            d1 <<= 22;
            d1 += t1[70 + i - 1];
            r1 = (sp_digit)(d1 / div);

            sp_3072_mul_d_70(t2, d, r1);
            sp_3072_sub_70(&t1[i], &t1[i], t2);
            t1[70 + i] -= t2[70];
            t1[70 + i] += t1[70 + i - 1] >> 22;
            t1[70 + i - 1] &= 0x3fffff;
            /* Correct the estimate by adding back a multiple of d. */
            r1 = (((-t1[70 + i]) << 22) - t1[70 + i - 1]) / div;
            r1++;
            sp_3072_mul_d_70(t2, d, r1);
            sp_3072_add_70(&t1[i], &t1[i], t2);
            t1[70 + i] += t1[70 + i - 1] >> 22;
            t1[70 + i - 1] &= 0x3fffff;
        }
        t1[70 - 1] += t1[70 - 2] >> 22;
        t1[70 - 2] &= 0x3fffff;
        d1 = t1[70 - 1];
        r1 = (sp_digit)(d1 / div);

        sp_3072_mul_d_70(t2, d, r1);
        sp_3072_sub_70(t1, t1, t2);
        XMEMCPY(r, t1, sizeof(*r) * 2 * 70);
        /* Carry all the way into the top digit: the sign of r[69] is only
         * the sign of the whole value once the borrow out of digit 68 has
         * been applied to it. */
        for (i=0; i<69; i++) {
            r[i+1] += r[i] >> 22;
            r[i] &= 0x3fffff;
        }
        /* Add d back when the remainder went negative. */
        sp_3072_cond_add_70(r, r, d, 0 - (r[69] < 0));
    }

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

/* Reduce a modulo m into r. (r = a mod m)
 *
 * Thin wrapper around sp_3072_div_70() that passes NULL for the quotient and
 * keeps only the remainder.
 *
 * r  A single precision number that is the reduced result.
 * a  A single precision number that is to be reduced.
 * m  A single precision number that is the modulus to reduce with.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_3072_mod_70(sp_digit* r, sp_digit* a, sp_digit* m)
{
    return sp_3072_div_70(a, m, NULL, r);
}

/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Three implementations are selected at compile time:
 *  - WOLFSSL_SP_SMALL: bit-at-a-time ladder using heap allocated temporaries.
 *  - WOLFSSL_SP_CACHE_RESISTANT: the same ladder with temporaries on the
 *    stack, or on the heap when WOLFSSL_SMALL_STACK is defined.
 *  - otherwise: 5-bit fixed window using a table of 32 precomputed powers.
 * All of them work in Montgomery form and write 2 * 70 digits to r.
 *
 * r        A single precision number that is the result of the operation.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When non-zero, a is reduced modulo m before it is converted to
 *          Montgomery form.
 * returns 0 on success and MEMORY_E on dynamic memory allocation failure.
 */
static int sp_3072_mod_exp_70(sp_digit* r, sp_digit* a, sp_digit* e, int bits,
    sp_digit* m, int reduceA)
{
#ifdef WOLFSSL_SP_SMALL
    sp_digit* td;
    sp_digit* t[3];
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

    td = (sp_digit*)XMALLOC(sizeof(*td) * 3 * 70 * 2, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        XMEMSET(td, 0, sizeof(*td) * 3 * 70 * 2);

        norm = t[0] = td;
        t[1] = &td[70 * 2];
        t[2] = &td[2 * 70 * 2];

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_70(norm, m);

        if (reduceA)
            err = sp_3072_mod_70(t[1], a, m);
        else
            XMEMCPY(t[1], a, sizeof(sp_digit) * 70);
    }
    if (err == MP_OKAY) {
        /* Convert a to Montgomery form. */
        sp_3072_mul_70(t[1], t[1], norm);
        err = sp_3072_mod_70(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        i = bits / 22;
        c = bits % 22;
        n = e[i--] << (22 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1)
                    break;

                n = e[i--];
                c = 22;
            }

            y = (n >> 21) & 1;
            n <<= 1;

            sp_3072_mont_mul_70(t[y^1], t[0], t[1], m, mp);

            /* Pick t[y] through address masks rather than by indexing. */
            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                    sizeof(*t[2]) * 70 * 2);
            sp_3072_mont_sqr_70(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                    sizeof(*t[2]) * 70 * 2);
        }

        /* Convert out of Montgomery form and make sure the result is < m. */
        sp_3072_mont_reduce_70(t[0], m, mp);
        n = sp_3072_cmp_70(t[0], m);
        sp_3072_cond_sub_70(t[0], t[0], m, (n < 0) - 1);
        XMEMCPY(r, t[0], sizeof(*r) * 70 * 2);

    }

    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);

    return err;
#elif defined(WOLFSSL_SP_CACHE_RESISTANT)
#ifndef WOLFSSL_SMALL_STACK
    sp_digit t[3][140];
#else
    sp_digit* td;
    sp_digit* t[3];
#endif
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    td = (sp_digit*)XMALLOC(sizeof(*td) * 3 * 70 * 2, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        t[0] = td;
        t[1] = &td[70 * 2];
        t[2] = &td[2 * 70 * 2];
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_70(norm, m);

        if (reduceA) {
            err = sp_3072_mod_70(t[1], a, m);
            if (err == MP_OKAY) {
                sp_3072_mul_70(t[1], t[1], norm);
                err = sp_3072_mod_70(t[1], t[1], m);
            }
        }
        else {
            sp_3072_mul_70(t[1], a, norm);
            err = sp_3072_mod_70(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        i = bits / 22;
        c = bits % 22;
        n = e[i--] << (22 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1)
                    break;

                n = e[i--];
                c = 22;
            }

            y = (n >> 21) & 1;
            n <<= 1;

            sp_3072_mont_mul_70(t[y^1], t[0], t[1], m, mp);

            /* The copy size is spelled out in digits: with
             * WOLFSSL_SMALL_STACK t[] holds pointers, so sizeof(t[2]) would
             * only be the size of a pointer. */
            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                    sizeof(sp_digit) * 70 * 2);
            sp_3072_mont_sqr_70(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                    sizeof(sp_digit) * 70 * 2);
        }

        /* Convert out of Montgomery form and make sure the result is < m. */
        sp_3072_mont_reduce_70(t[0], m, mp);
        n = sp_3072_cmp_70(t[0], m);
        sp_3072_cond_sub_70(t[0], t[0], m, (n < 0) - 1);
        XMEMCPY(r, t[0], sizeof(sp_digit) * 70 * 2);
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#else
#ifndef WOLFSSL_SMALL_STACK
    sp_digit t[32][140];
#else
    sp_digit* t[32];
    sp_digit* td;
#endif
    sp_digit* norm;
    sp_digit rt[140];
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 140, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        for (i=0; i<32; i++)
            t[i] = td + i * 140;
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_70(norm, m);

        if (reduceA) {
            err = sp_3072_mod_70(t[1], a, m);
            if (err == MP_OKAY) {
                sp_3072_mul_70(t[1], t[1], norm);
                err = sp_3072_mod_70(t[1], t[1], m);
            }
        }
        else {
            sp_3072_mul_70(t[1], a, norm);
            err = sp_3072_mod_70(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Build the table: t[k] = a^k in Montgomery form for k = 2..31. */
        sp_3072_mont_sqr_70(t[ 2], t[ 1], m, mp);
        sp_3072_mont_mul_70(t[ 3], t[ 2], t[ 1], m, mp);
        sp_3072_mont_sqr_70(t[ 4], t[ 2], m, mp);
        sp_3072_mont_mul_70(t[ 5], t[ 3], t[ 2], m, mp);
        sp_3072_mont_sqr_70(t[ 6], t[ 3], m, mp);
        sp_3072_mont_mul_70(t[ 7], t[ 4], t[ 3], m, mp);
        sp_3072_mont_sqr_70(t[ 8], t[ 4], m, mp);
        sp_3072_mont_mul_70(t[ 9], t[ 5], t[ 4], m, mp);
        sp_3072_mont_sqr_70(t[10], t[ 5], m, mp);
        sp_3072_mont_mul_70(t[11], t[ 6], t[ 5], m, mp);
        sp_3072_mont_sqr_70(t[12], t[ 6], m, mp);
        sp_3072_mont_mul_70(t[13], t[ 7], t[ 6], m, mp);
        sp_3072_mont_sqr_70(t[14], t[ 7], m, mp);
        sp_3072_mont_mul_70(t[15], t[ 8], t[ 7], m, mp);
        sp_3072_mont_sqr_70(t[16], t[ 8], m, mp);
        sp_3072_mont_mul_70(t[17], t[ 9], t[ 8], m, mp);
        sp_3072_mont_sqr_70(t[18], t[ 9], m, mp);
        sp_3072_mont_mul_70(t[19], t[10], t[ 9], m, mp);
        sp_3072_mont_sqr_70(t[20], t[10], m, mp);
        sp_3072_mont_mul_70(t[21], t[11], t[10], m, mp);
        sp_3072_mont_sqr_70(t[22], t[11], m, mp);
        sp_3072_mont_mul_70(t[23], t[12], t[11], m, mp);
        sp_3072_mont_sqr_70(t[24], t[12], m, mp);
        sp_3072_mont_mul_70(t[25], t[13], t[12], m, mp);
        sp_3072_mont_sqr_70(t[26], t[13], m, mp);
        sp_3072_mont_mul_70(t[27], t[14], t[13], m, mp);
        sp_3072_mont_sqr_70(t[28], t[14], m, mp);
        sp_3072_mont_mul_70(t[29], t[15], t[14], m, mp);
        sp_3072_mont_sqr_70(t[30], t[15], m, mp);
        sp_3072_mont_mul_70(t[31], t[16], t[15], m, mp);

        /* Round the bit count up to a whole number of 5-bit windows. */
        bits = ((bits + 4) / 5) * 5;
        i = ((bits + 21) / 22) - 1;
        c = bits % 22;
        if (c == 0)
            c = 22;
        if (i < 70)
            n = e[i--] << (32 - c);
        else {
            n = 0;
            i--;
        }
        if (c < 5) {
            n |= e[i--] << (10 - c);
            c += 22;
        }
        /* Mask the window so a set top bit in n can never produce an index
         * outside the 32 entry table. */
        y = (n >> 27) & 0x1f;
        n <<= 5;
        c -= 5;
        XMEMCPY(rt, t[y], sizeof(rt));
        for (; i>=0 || c>=5; ) {
            if (c < 5) {
                n |= e[i--] << (10 - c);
                c += 22;
            }
            y = (n >> 27) & 0x1f;
            n <<= 5;
            c -= 5;

            sp_3072_mont_sqr_70(rt, rt, m, mp);
            sp_3072_mont_sqr_70(rt, rt, m, mp);
            sp_3072_mont_sqr_70(rt, rt, m, mp);
            sp_3072_mont_sqr_70(rt, rt, m, mp);
            sp_3072_mont_sqr_70(rt, rt, m, mp);

            sp_3072_mont_mul_70(rt, rt, t[y], m, mp);
        }

        /* Convert out of Montgomery form and make sure the result is < m. */
        sp_3072_mont_reduce_70(rt, m, mp);
        n = sp_3072_cmp_70(rt, m);
        sp_3072_cond_sub_70(rt, rt, m, (n < 0) - 1);
        XMEMCPY(r, rt, sizeof(rt));
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#endif
}

#endif /* !SP_RSA_PRIVATE_EXP_D && WOLFSSL_HAVE_SP_RSA */

/* Calculate r = 2^n mod m where n is the number of bits to reduce by.
 * Given m must be 3072 bits, a single subtraction is all that is needed.
 *
 * r  A single precision number.
 * m  A single precision number.
 */
static void sp_3072_mont_norm_140(sp_digit* r, sp_digit* m)
{
    int k;

    /* Fill r with ones: r = 2^n - 1. */
#ifdef WOLFSSL_SP_SMALL
    for (k = 0; k < 139; k++)
        r[k] = 0x3fffff;
#else
    for (k = 0; k < 136; k += 8) {
        sp_digit* w = r + k;

        w[0] = 0x3fffff;
        w[1] = 0x3fffff;
        w[2] = 0x3fffff;
        w[3] = 0x3fffff;
        w[4] = 0x3fffff;
        w[5] = 0x3fffff;
        w[6] = 0x3fffff;
        w[7] = 0x3fffff;
    }
    r[136] = 0x3fffff;
    r[137] = 0x3fffff;
    r[138] = 0x3fffff;
#endif
    /* Top digit only holds 14 bits (139 * 22 + 14 = 3072). */
    r[139] = 0x3fff;

    /* r = (2^n - 1) mod m */
    sp_3072_sub_140(r, r, m);

    /* One more gives r = 2^n mod m */
    r[0] += 1;
}

/* Compare a with b without branching on the digit values.
 *
 * Digits are examined from the most significant down; the first non-zero
 * difference is latched and every later difference is masked out.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static sp_digit sp_3072_cmp_140(const sp_digit* a, const sp_digit* b)
{
    sp_digit diff = 0;
    int k;

#ifdef WOLFSSL_SP_SMALL
    for (k = 139; k >= 0; k--)
        diff |= (a[k] - b[k]) & (0 - !diff);
#else
    /* Top four digits first, then the rest in groups of eight. */
    diff |= (a[139] - b[139]) & (0 - !diff);
    diff |= (a[138] - b[138]) & (0 - !diff);
    diff |= (a[137] - b[137]) & (0 - !diff);
    diff |= (a[136] - b[136]) & (0 - !diff);
    for (k = 128; k >= 0; k -= 8) {
        const sp_digit* pa = a + k;
        const sp_digit* pb = b + k;

        diff |= (pa[7] - pb[7]) & (0 - !diff);
        diff |= (pa[6] - pb[6]) & (0 - !diff);
        diff |= (pa[5] - pb[5]) & (0 - !diff);
        diff |= (pa[4] - pb[4]) & (0 - !diff);
        diff |= (pa[3] - pb[3]) & (0 - !diff);
        diff |= (pa[2] - pb[2]) & (0 - !diff);
        diff |= (pa[1] - pb[1]) & (0 - !diff);
        diff |= (pa[0] - pb[0]) & (0 - !diff);
    }
#endif /* WOLFSSL_SP_SMALL */

    return diff;
}

/* Subtract b from a digit by digit when the mask is set. (r = a - (b & m))
 * m is -1 to subtract and 0 when not. No borrows are propagated.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 */
static void sp_3072_cond_sub_140(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int k;
#ifdef WOLFSSL_SP_SMALL
    for (k = 0; k < 140; k++)
        r[k] = a[k] - (b[k] & m);
#else
    for (k = 0; k < 136; k += 8) {
        r[k]     = a[k]     - (b[k]     & m);
        r[k + 1] = a[k + 1] - (b[k + 1] & m);
        r[k + 2] = a[k + 2] - (b[k + 2] & m);
        r[k + 3] = a[k + 3] - (b[k + 3] & m);
        r[k + 4] = a[k + 4] - (b[k + 4] & m);
        r[k + 5] = a[k + 5] - (b[k + 5] & m);
        r[k + 6] = a[k + 6] - (b[k + 6] & m);
        r[k + 7] = a[k + 7] - (b[k + 7] & m);
    }
    /* Last four digits that do not fill a whole group of eight. */
    r[136] = a[136] - (b[136] & m);
    r[137] = a[137] - (b[137] & m);
    r[138] = a[138] - (b[138] & m);
    r[139] = a[139] - (b[139] & m);
#endif /* WOLFSSL_SP_SMALL */
}

/* Multiply a by the single digit b and accumulate into r. (r += a * b)
 *
 * Digits r[0..139] receive the products and the final carry is added to
 * r[140], so r must provide at least 141 digits.
 *
 * r  A single precision integer that is added to.
 * a  A single precision integer of 140 digits.
 * b  A scalar.
 */
static void sp_3072_mul_add_140(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
    const int64_t mul = b;
    int k;
#ifdef WOLFSSL_SP_SMALL
    int64_t acc = 0;

    /* Running accumulator: the low 22 bits are stored, the rest carries on. */
    for (k = 0; k < 140; k++) {
        acc += (mul * a[k]) + r[k];
        r[k] = acc & 0x3fffff;
        acc >>= 22;
    }
    r[140] += acc;
#else
    int64_t p[8];

    /* Each product is split in two: its low 22 bits are added to digit k and
     * its remaining high bits to digit k+1. Digits are not masked here. */
    p[0] = mul * a[0];
    r[0] += p[0] & 0x3fffff;
    for (k = 0; k < 136; k += 8) {
        const sp_digit* ak = a + k;
        sp_digit* rk = r + k;

        p[1] = mul * ak[1];
        rk[1] += (p[0] >> 22) + (p[1] & 0x3fffff);
        p[2] = mul * ak[2];
        rk[2] += (p[1] >> 22) + (p[2] & 0x3fffff);
        p[3] = mul * ak[3];
        rk[3] += (p[2] >> 22) + (p[3] & 0x3fffff);
        p[4] = mul * ak[4];
        rk[4] += (p[3] >> 22) + (p[4] & 0x3fffff);
        p[5] = mul * ak[5];
        rk[5] += (p[4] >> 22) + (p[5] & 0x3fffff);
        p[6] = mul * ak[6];
        rk[6] += (p[5] >> 22) + (p[6] & 0x3fffff);
        p[7] = mul * ak[7];
        rk[7] += (p[6] >> 22) + (p[7] & 0x3fffff);
        p[0] = mul * ak[8];
        rk[8] += (p[7] >> 22) + (p[0] & 0x3fffff);
    }
    /* Remaining digits 137..139; p[0] still holds the product for 136. */
    p[1] = mul * a[137];
    r[137] += (p[0] >> 22) + (p[1] & 0x3fffff);
    p[2] = mul * a[138];
    r[138] += (p[1] >> 22) + (p[2] & 0x3fffff);
    p[3] = mul * a[139];
    r[139] += (p[2] >> 22) + (p[3] & 0x3fffff);
    r[140] += p[3] >> 22;
#endif /* WOLFSSL_SP_SMALL */
}

/* Propagate carries so that digits 0..138 each hold no more than 22 bits.
 *
 * The carry out of digit 138 is added into digit 139, which is left unmasked.
 *
 * a  Array of sp_digit to normalize.
 */
static void sp_3072_norm_140(sp_digit* a)
{
    int k;
#ifdef WOLFSSL_SP_SMALL
    for (k = 0; k < 139; k++) {
        a[k+1] += a[k] >> 22;
        a[k] &= 0x3fffff;
    }
#else
    for (k = 0; k < 136; k += 8) {
        sp_digit* w = a + k;

        w[1] += w[0] >> 22;
        w[0] &= 0x3fffff;
        w[2] += w[1] >> 22;
        w[1] &= 0x3fffff;
        w[3] += w[2] >> 22;
        w[2] &= 0x3fffff;
        w[4] += w[3] >> 22;
        w[3] &= 0x3fffff;
        w[5] += w[4] >> 22;
        w[4] &= 0x3fffff;
        w[6] += w[5] >> 22;
        w[5] &= 0x3fffff;
        w[7] += w[6] >> 22;
        w[6] &= 0x3fffff;
        w[8] += w[7] >> 22;
        w[7] &= 0x3fffff;
        w[9] += w[8] >> 22;
        w[8] &= 0x3fffff;
    }
    /* Tail: digits 136..138 carry into 137..139. */
    a[137] += a[136] >> 22;
    a[136] &= 0x3fffff;
    a[138] += a[137] >> 22;
    a[137] &= 0x3fffff;
    a[139] += a[138] >> 22;
    a[138] &= 0x3fffff;
#endif
}

/* Move the upper half of a 280 digit value down into the low 140 digits.
 *
 * The value is shifted right by 139 whole digits plus 14 bits (3072 bits in
 * total) and the upper 140 digits of r are then cleared. All reads of a[] run
 * ahead of the writes to r[], so r may be the same buffer as a.
 *
 * r  A single precision number.
 * a  A single precision number.
 */
static void sp_3072_mont_shift_140(sp_digit* r, const sp_digit* a)
{
    sp_digit out;   /* Bits waiting to be written to the next result digit. */
    sp_digit in;    /* Current source digit plus the carry of the previous. */
    int k;

    in = a[140];
    out = a[139] >> 14;
#ifdef WOLFSSL_SP_SMALL
    for (k = 0; k < 139; k++) {
        out += (in & 0x3fffff) << 8;
        r[k] = out & 0x3fffff;
        out >>= 22;
        in = a[141 + k] + (in >> 22);
    }
#else
    for (k = 0; k < 136; k += 8) {
        const sp_digit* src = a + k + 141;
        sp_digit* dst = r + k;

        out += (in & 0x3fffff) << 8;
        dst[0] = out & 0x3fffff;
        out >>= 22;
        in = src[0] + (in >> 22);
        out += (in & 0x3fffff) << 8;
        dst[1] = out & 0x3fffff;
        out >>= 22;
        in = src[1] + (in >> 22);
        out += (in & 0x3fffff) << 8;
        dst[2] = out & 0x3fffff;
        out >>= 22;
        in = src[2] + (in >> 22);
        out += (in & 0x3fffff) << 8;
        dst[3] = out & 0x3fffff;
        out >>= 22;
        in = src[3] + (in >> 22);
        out += (in & 0x3fffff) << 8;
        dst[4] = out & 0x3fffff;
        out >>= 22;
        in = src[4] + (in >> 22);
        out += (in & 0x3fffff) << 8;
        dst[5] = out & 0x3fffff;
        out >>= 22;
        in = src[5] + (in >> 22);
        out += (in & 0x3fffff) << 8;
        dst[6] = out & 0x3fffff;
        out >>= 22;
        in = src[6] + (in >> 22);
        out += (in & 0x3fffff) << 8;
        dst[7] = out & 0x3fffff;
        out >>= 22;
        in = src[7] + (in >> 22);
    }
    /* Tail: result digits 136..138 from source digits 277..279. */
    out += (in & 0x3fffff) << 8;
    r[136] = out & 0x3fffff;
    out >>= 22;
    in = a[277] + (in >> 22);
    out += (in & 0x3fffff) << 8;
    r[137] = out & 0x3fffff;
    out >>= 22;
    in = a[278] + (in >> 22);
    out += (in & 0x3fffff) << 8;
    r[138] = out & 0x3fffff;
    out >>= 22;
    in = a[279] + (in >> 22);
#endif /* WOLFSSL_SP_SMALL */
    /* Top digit takes everything that is left, unmasked. */
    out += in << 8;
    r[139] = out;
    XMEMSET(&r[140], 0, sizeof(*r) * 140);
}

/* Reduce the number back to 3072 bits using Montgomery reduction.
 *
 * For each of the 140 low digits a multiple of m is added so that the digit
 * becomes zero modulo 2^22 (2^14 for the top digit, as 139 * 22 + 14 = 3072).
 * The result is then shifted down, m is conditionally subtracted once and
 * the digits are normalized. When mp is 1 the multiplication by mp is
 * skipped.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
static void sp_3072_mont_reduce_140(sp_digit* a, sp_digit* m, sp_digit mp)
{
    int i;
    sp_digit mu;

    if (mp != 1) {
        for (i=0; i<139; i++) {
            /* Only the low 22 bits of the product are needed, but the
             * product itself can exceed the range of sp_digit (the other
             * helpers in this file widen to int64_t for the same reason), so
             * multiply in 64 bits to avoid signed overflow. */
            mu = (sp_digit)(((int64_t)a[i] * mp) & 0x3fffff);
            sp_3072_mul_add_140(a+i, m, mu);
            a[i+1] += a[i] >> 22;
        }
        /* Top digit only holds 14 bits of the 3072 bit number. */
        mu = (sp_digit)(((int64_t)a[i] * mp) & 0x3fff);
        sp_3072_mul_add_140(a+i, m, mu);
        a[i+1] += a[i] >> 22;
        a[i] &= 0x3fffff;
    }
    else {
        for (i=0; i<139; i++) {
            mu = a[i] & 0x3fffff;
            sp_3072_mul_add_140(a+i, m, mu);
            a[i+1] += a[i] >> 22;
        }
        /* Top digit only holds 14 bits of the 3072 bit number. */
        mu = a[i] & 0x3fff;
        sp_3072_mul_add_140(a+i, m, mu);
        a[i+1] += a[i] >> 22;
        a[i] &= 0x3fffff;
    }

    sp_3072_mont_shift_140(a, a);
    /* Subtract m when the shifted result has bits above bit 3071. */
    sp_3072_cond_sub_140(a, a, m, 0 - ((a[139] >> 14) > 0));
    sp_3072_norm_140(a);
}

/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * Computes the full product with sp_3072_mul_140() and then reduces it in
 * place with sp_3072_mont_reduce_140().
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_3072_mont_mul_140(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit* m, sp_digit mp)
{
    sp_3072_mul_140(r, a, b);
    sp_3072_mont_reduce_140(r, m, mp);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * Computes the full square with sp_3072_sqr_140() and then reduces it in
 * place with sp_3072_mont_reduce_140().
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_3072_mont_sqr_140(sp_digit* r, sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    sp_3072_sqr_140(r, a);
    sp_3072_mont_reduce_140(r, m, mp);
}

/* Multiply a by the single digit b and store the product. (r = a * b)
 *
 * Writes 141 digits: r[0..139] plus the final carry in r[140].
 *
 * r  A single precision integer that receives the product.
 * a  A single precision integer of 140 digits.
 * b  A scalar.
 */
static void sp_3072_mul_d_140(sp_digit* r, const sp_digit* a, const sp_digit b)
{
    const int64_t mul = b;
    int k;
#ifdef WOLFSSL_SP_SMALL
    int64_t acc = 0;

    /* Running accumulator: the low 22 bits are stored, the rest carries on. */
    for (k = 0; k < 140; k++) {
        acc += mul * a[k];
        r[k] = acc & 0x3fffff;
        acc >>= 22;
    }
    r[140] = (sp_digit)acc;
#else
    int64_t p[8];

    /* Digit k is the high part of product k-1 plus the low 22 bits of
     * product k; the sum is stored without masking. */
    p[0] = mul * a[0];
    r[0] = p[0] & 0x3fffff;
    for (k = 0; k < 136; k += 8) {
        const sp_digit* ak = a + k;
        sp_digit* rk = r + k;

        p[1] = mul * ak[1];
        rk[1] = (sp_digit)(p[0] >> 22) + (p[1] & 0x3fffff);
        p[2] = mul * ak[2];
        rk[2] = (sp_digit)(p[1] >> 22) + (p[2] & 0x3fffff);
        p[3] = mul * ak[3];
        rk[3] = (sp_digit)(p[2] >> 22) + (p[3] & 0x3fffff);
        p[4] = mul * ak[4];
        rk[4] = (sp_digit)(p[3] >> 22) + (p[4] & 0x3fffff);
        p[5] = mul * ak[5];
        rk[5] = (sp_digit)(p[4] >> 22) + (p[5] & 0x3fffff);
        p[6] = mul * ak[6];
        rk[6] = (sp_digit)(p[5] >> 22) + (p[6] & 0x3fffff);
        p[7] = mul * ak[7];
        rk[7] = (sp_digit)(p[6] >> 22) + (p[7] & 0x3fffff);
        p[0] = mul * ak[8];
        rk[8] = (sp_digit)(p[7] >> 22) + (p[0] & 0x3fffff);
    }
    /* Remaining digits 137..139; p[0] still holds the product for 136. */
    p[1] = mul * a[137];
    r[137] = (sp_digit)(p[0] >> 22) + (p[1] & 0x3fffff);
    p[2] = mul * a[138];
    r[138] = (sp_digit)(p[1] >> 22) + (p[2] & 0x3fffff);
    p[3] = mul * a[139];
    r[139] = (sp_digit)(p[2] >> 22) + (p[3] & 0x3fffff);
    r[140] = (sp_digit)(p[3] >> 22);
#endif /* WOLFSSL_SP_SMALL */
}

/* Add b to a digit by digit when the mask is set. (r = a + (b & m))
 * m is -1 to add and 0 when not. No carries are propagated.
 *
 * r  A single precision number representing conditional add result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply.
 */
static void sp_3072_cond_add_140(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int k;
#ifdef WOLFSSL_SP_SMALL
    for (k = 0; k < 140; k++)
        r[k] = a[k] + (b[k] & m);
#else
    for (k = 0; k < 136; k += 8) {
        r[k]     = a[k]     + (b[k]     & m);
        r[k + 1] = a[k + 1] + (b[k + 1] & m);
        r[k + 2] = a[k + 2] + (b[k + 2] & m);
        r[k + 3] = a[k + 3] + (b[k + 3] & m);
        r[k + 4] = a[k + 4] + (b[k + 4] & m);
        r[k + 5] = a[k + 5] + (b[k + 5] & m);
        r[k + 6] = a[k + 6] + (b[k + 6] & m);
        r[k + 7] = a[k + 7] + (b[k + 7] & m);
    }
    /* Last four digits that do not fill a whole group of eight. */
    r[136] = a[136] + (b[136] & m);
    r[137] = a[137] + (b[137] & m);
    r[138] = a[138] + (b[138] & m);
    r[139] = a[139] + (b[139] & m);
#endif /* WOLFSSL_SP_SMALL */
}

/* Divide d in a and put remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * The quotient is estimated one digit at a time from the top digit of the
 * divisor; each estimate is subtracted and then corrected by adding back a
 * multiple of d. 2 * 140 digits are read from a and 2 * 140 digits are
 * written to r. The digits of r are not fully normalized on return.
 *
 * a  Number to be divided.
 * d  Number to divide with.
 * m  Multiplier result (unused).
 * r  Remainder from the division.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_3072_div_140(sp_digit* a, sp_digit* d, sp_digit* m,
        sp_digit* r)
{
    int i;
    int64_t d1;
    sp_digit div, r1;
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    sp_digit* td;
#else
    sp_digit t1d[280], t2d[280];
#endif
    sp_digit* t1;
    sp_digit* t2;
    int err = MP_OKAY;

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    td = XMALLOC(sizeof(sp_digit) * 4 * 140, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    if (td != NULL) {
        t1 = td;
        t2 = td + 2 * 140;
    }
    else
        err = MEMORY_E;
#else
    t1 = t1d;
    t2 = t2d;
#endif

    (void)m;

    if (err == MP_OKAY) {
        div = d[139];
        XMEMCPY(t1, a, sizeof(*t1) * 2 * 140);
        for (i=139; i>=0; i--) {
            /* Estimate the quotient digit from the top two digits. */
            t1[140 + i] += t1[140 + i - 1] >> 22;
            t1[140 + i - 1] &= 0x3fffff;
            d1 = t1[140 + i];
            d1 <<= 22;
            d1 += t1[140 + i - 1];
            r1 = (sp_digit)(d1 / div);

            sp_3072_mul_d_140(t2, d, r1);
            sp_3072_sub_140(&t1[i], &t1[i], t2);
            t1[140 + i] -= t2[140];
            t1[140 + i] += t1[140 + i - 1] >> 22;
            t1[140 + i - 1] &= 0x3fffff;
            /* Correct the estimate by adding back a multiple of d. */
            r1 = (((-t1[140 + i]) << 22) - t1[140 + i - 1]) / div;
            r1++;
            sp_3072_mul_d_140(t2, d, r1);
            sp_3072_add_140(&t1[i], &t1[i], t2);
            t1[140 + i] += t1[140 + i - 1] >> 22;
            t1[140 + i - 1] &= 0x3fffff;
        }
        t1[140 - 1] += t1[140 - 2] >> 22;
        t1[140 - 2] &= 0x3fffff;
        d1 = t1[140 - 1];
        r1 = (sp_digit)(d1 / div);

        sp_3072_mul_d_140(t2, d, r1);
        sp_3072_sub_140(t1, t1, t2);
        XMEMCPY(r, t1, sizeof(*r) * 2 * 140);
        /* Carry all the way into the top digit: the sign of r[139] is only
         * the sign of the whole value once the borrow out of digit 138 has
         * been applied to it. */
        for (i=0; i<139; i++) {
            r[i+1] += r[i] >> 22;
            r[i] &= 0x3fffff;
        }
        /* Add d back when the remainder went negative. */
        sp_3072_cond_add_140(r, r, d, 0 - (r[139] < 0));
    }

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

/* Modular reduction: r = a mod m.
 * Implemented as a division whose quotient is discarded.
 *
 * r  A single precision number that receives the reduced result.
 * a  A single precision number that is to be reduced.
 * m  A single precision number that is the modulus to reduce with.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_3072_mod_140(sp_digit* r, sp_digit* a, sp_digit* m)
{
    /* No quotient buffer is supplied: only the remainder is wanted. */
    int ret = sp_3072_div_140(a, m, NULL, r);

    return ret;
}

#if defined(SP_RSA_PRIVATE_EXP_D) || defined(WOLFSSL_HAVE_SP_DH)
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Three build variants are provided:
 *  - WOLFSSL_SP_SMALL: bit-at-a-time ladder on heap buffers.
 *  - WOLFSSL_SP_CACHE_RESISTANT: the same ladder, on stack or heap buffers.
 *  - otherwise: a 5-bit fixed window using a table of 32 powers.
 *
 * r        A single precision number that is the result of the operation.
 *          2 * 140 digits are written.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When non-zero, a is reduced modulo m before use.
 * returns 0 on success and MEMORY_E on dynamic memory allocation failure.
 */
static int sp_3072_mod_exp_140(sp_digit* r, sp_digit* a, sp_digit* e, int bits,
    sp_digit* m, int reduceA)
{
#ifdef WOLFSSL_SP_SMALL
    sp_digit* td;
    sp_digit* t[3];
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

    td = (sp_digit*)XMALLOC(sizeof(*td) * 3 * 140 * 2, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        XMEMSET(td, 0, sizeof(*td) * 3 * 140 * 2);

        /* Three working values of 2 * 140 digits; norm shares t[0]. */
        norm = t[0] = td;
        t[1] = &td[140 * 2];
        t[2] = &td[2 * 140 * 2];

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_140(norm, m);

        if (reduceA)
            err = sp_3072_mod_140(t[1], a, m);
        else
            XMEMCPY(t[1], a, sizeof(sp_digit) * 140);
    }
    if (err == MP_OKAY) {
        /* Bring the base into Montgomery form: t[1] = a * norm mod m. */
        sp_3072_mul_140(t[1], t[1], norm);
        err = sp_3072_mod_140(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        i = bits / 22;
        c = bits % 22;
        n = e[i--] << (22 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1)
                    break;

                n = e[i--];
                c = 22;
            }

            /* y is the next exponent bit, taken from the top of n. */
            y = (n >> 21) & 1;
            n <<= 1;

            sp_3072_mont_mul_140(t[y^1], t[0], t[1], m, mp);

            /* Select t[y] via address masks instead of a branch, square it
             * in t[2] and copy it back. */
            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                    sizeof(*t[2]) * 140 * 2);
            sp_3072_mont_sqr_140(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                    sizeof(*t[2]) * 140 * 2);
        }

        sp_3072_mont_reduce_140(t[0], m, mp);
        n = sp_3072_cmp_140(t[0], m);
        /* Subtract m once when the compare result is not negative. */
        sp_3072_cond_sub_140(t[0], t[0], m, (n < 0) - 1);
        XMEMCPY(r, t[0], sizeof(*r) * 140 * 2);

    }

    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);

    return err;
#elif defined(WOLFSSL_SP_CACHE_RESISTANT)
#ifndef WOLFSSL_SMALL_STACK
    sp_digit t[3][280];
#else
    sp_digit* td;
    sp_digit* t[3];
#endif
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    td = (sp_digit*)XMALLOC(sizeof(*td) * 3 * 140 * 2, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        t[0] = td;
        t[1] = &td[140 * 2];
        t[2] = &td[2 * 140 * 2];
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_140(norm, m);

        if (reduceA) {
            err = sp_3072_mod_140(t[1], a, m);
            if (err == MP_OKAY) {
                sp_3072_mul_140(t[1], t[1], norm);
                err = sp_3072_mod_140(t[1], t[1], m);
            }
        }
        else {
            sp_3072_mul_140(t[1], a, norm);
            err = sp_3072_mod_140(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        i = bits / 22;
        c = bits % 22;
        n = e[i--] << (22 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1)
                    break;

                n = e[i--];
                c = 22;
            }

            /* y is the next exponent bit, taken from the top of n. */
            y = (n >> 21) & 1;
            n <<= 1;

            sp_3072_mont_mul_140(t[y^1], t[0], t[1], m, mp);

            /* The copy length is spelled out in digits: with
             * WOLFSSL_SMALL_STACK t[] holds pointers, so sizeof(t[2]) would
             * be the size of a pointer rather than of a 280 digit value. */
            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                    sizeof(sp_digit) * 140 * 2);
            sp_3072_mont_sqr_140(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                    sizeof(sp_digit) * 140 * 2);
        }

        sp_3072_mont_reduce_140(t[0], m, mp);
        n = sp_3072_cmp_140(t[0], m);
        /* Subtract m once when the compare result is not negative. */
        sp_3072_cond_sub_140(t[0], t[0], m, (n < 0) - 1);
        XMEMCPY(r, t[0], sizeof(sp_digit) * 140 * 2);
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#else
#ifndef WOLFSSL_SMALL_STACK
    sp_digit t[32][280];
#else
    sp_digit* t[32];
    sp_digit* td;
#endif
    sp_digit* norm;
    sp_digit rt[280];
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 280, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        for (i=0; i<32; i++)
            t[i] = td + i * 280;
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_140(norm, m);

        if (reduceA) {
            err = sp_3072_mod_140(t[1], a, m);
            if (err == MP_OKAY) {
                sp_3072_mul_140(t[1], t[1], norm);
                err = sp_3072_mod_140(t[1], t[1], m);
            }
        }
        else {
            sp_3072_mul_140(t[1], a, norm);
            err = sp_3072_mod_140(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Build the table: t[k] is the k-th power of the base in Montgomery
         * form, for k = 2..31 (t[0] is norm and t[1] is the base). */
        sp_3072_mont_sqr_140(t[ 2], t[ 1], m, mp);
        sp_3072_mont_mul_140(t[ 3], t[ 2], t[ 1], m, mp);
        sp_3072_mont_sqr_140(t[ 4], t[ 2], m, mp);
        sp_3072_mont_mul_140(t[ 5], t[ 3], t[ 2], m, mp);
        sp_3072_mont_sqr_140(t[ 6], t[ 3], m, mp);
        sp_3072_mont_mul_140(t[ 7], t[ 4], t[ 3], m, mp);
        sp_3072_mont_sqr_140(t[ 8], t[ 4], m, mp);
        sp_3072_mont_mul_140(t[ 9], t[ 5], t[ 4], m, mp);
        sp_3072_mont_sqr_140(t[10], t[ 5], m, mp);
        sp_3072_mont_mul_140(t[11], t[ 6], t[ 5], m, mp);
        sp_3072_mont_sqr_140(t[12], t[ 6], m, mp);
        sp_3072_mont_mul_140(t[13], t[ 7], t[ 6], m, mp);
        sp_3072_mont_sqr_140(t[14], t[ 7], m, mp);
        sp_3072_mont_mul_140(t[15], t[ 8], t[ 7], m, mp);
        sp_3072_mont_sqr_140(t[16], t[ 8], m, mp);
        sp_3072_mont_mul_140(t[17], t[ 9], t[ 8], m, mp);
        sp_3072_mont_sqr_140(t[18], t[ 9], m, mp);
        sp_3072_mont_mul_140(t[19], t[10], t[ 9], m, mp);
        sp_3072_mont_sqr_140(t[20], t[10], m, mp);
        sp_3072_mont_mul_140(t[21], t[11], t[10], m, mp);
        sp_3072_mont_sqr_140(t[22], t[11], m, mp);
        sp_3072_mont_mul_140(t[23], t[12], t[11], m, mp);
        sp_3072_mont_sqr_140(t[24], t[12], m, mp);
        sp_3072_mont_mul_140(t[25], t[13], t[12], m, mp);
        sp_3072_mont_sqr_140(t[26], t[13], m, mp);
        sp_3072_mont_mul_140(t[27], t[14], t[13], m, mp);
        sp_3072_mont_sqr_140(t[28], t[14], m, mp);
        sp_3072_mont_mul_140(t[29], t[15], t[14], m, mp);
        sp_3072_mont_sqr_140(t[30], t[15], m, mp);
        sp_3072_mont_mul_140(t[31], t[16], t[15], m, mp);

        /* Round the bit count up to a whole number of 5-bit windows. */
        bits = ((bits + 4) / 5) * 5;
        i = ((bits + 21) / 22) - 1;
        c = bits % 22;
        if (c == 0)
            c = 22;
        /* i is -1 when bits is 0 and may be 140 after rounding up; in both
         * cases there is no exponent digit to read, so start from zero. */
        if (i >= 0 && i < 140)
            n = e[i--] << (32 - c);
        else {
            n = 0;
            i--;
        }
        if (c < 5) {
            n |= e[i--] << (10 - c);
            c += 22;
        }
        /* Mask the first window exactly as the loop below does, so a set top
         * bit in n cannot yield an index outside 0..31. */
        y = (n >> 27) & 0x1f;
        n <<= 5;
        c -= 5;
        XMEMCPY(rt, t[y], sizeof(rt));
        for (; i>=0 || c>=5; ) {
            if (c < 5) {
                n |= e[i--] << (10 - c);
                c += 22;
            }
            y = (n >> 27) & 0x1f;
            n <<= 5;
            c -= 5;

            /* Five squarings shift the accumulated exponent by one window. */
            sp_3072_mont_sqr_140(rt, rt, m, mp);
            sp_3072_mont_sqr_140(rt, rt, m, mp);
            sp_3072_mont_sqr_140(rt, rt, m, mp);
            sp_3072_mont_sqr_140(rt, rt, m, mp);
            sp_3072_mont_sqr_140(rt, rt, m, mp);

            sp_3072_mont_mul_140(rt, rt, t[y], m, mp);
        }

        sp_3072_mont_reduce_140(rt, m, mp);
        n = sp_3072_cmp_140(rt, m);
        /* Subtract m once when the compare result is not negative. */
        sp_3072_cond_sub_140(rt, rt, m, (n < 0) - 1);
        XMEMCPY(r, rt, sizeof(rt));
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#endif
}
#endif /* SP_RSA_PRIVATE_EXP_D || WOLFSSL_HAVE_SP_DH */

#if defined(WOLFSSL_HAVE_SP_RSA) && !defined(SP_RSA_PRIVATE_EXP_D)
/* Apply a mask to a 70 digit number: r[i] = a[i] AND m for every digit.
 *
 * r  A single precision integer receiving the masked digits.
 * a  A single precision integer to mask.
 * m  Mask to AND against each digit.
 */
static void sp_3072_mask_70(sp_digit* r, sp_digit* a, sp_digit m)
{
#ifdef WOLFSSL_SP_SMALL
    int k = 0;

    while (k < 70) {
        r[k] = a[k] & m;
        k++;
    }
#else
    sp_digit* dst = r;
    sp_digit* src = a;
    int blk;

    /* 8 unrolled groups of 8 digits handle indices 0..63. */
    for (blk = 0; blk < 8; blk++, dst += 8, src += 8) {
        dst[0] = src[0] & m;
        dst[1] = src[1] & m;
        dst[2] = src[2] & m;
        dst[3] = src[3] & m;
        dst[4] = src[4] & m;
        dst[5] = src[5] & m;
        dst[6] = src[6] & m;
        dst[7] = src[7] & m;
    }
    /* The pointers now sit at index 64; finish digits 64..69. */
    dst[0] = src[0] & m;
    dst[1] = src[1] & m;
    dst[2] = src[2] & m;
    dst[3] = src[3] & m;
    dst[4] = src[4] & m;
    dst[5] = src[5] & m;
#endif
}

#endif
#ifdef WOLFSSL_HAVE_SP_RSA
/* RSA public key operation.
 *
 * Computes in^em mod mm. The public exponent must fit in 22 bits and the
 * modulus must be exactly 3072 bits long.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base.
 * em      Public exponent.
 * mm      Modulus.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 384 bytes long.
 * outLen  On entry, the size of out in bytes. On success, set to 384.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long, MP_EXPTMOD_E when the exponent is zero and MEMORY_E
 * when dynamic memory allocation fails.
 */
int sp_RsaPublic_3072(const byte* in, word32 inLen, mp_int* em, mp_int* mm,
    byte* out, word32* outLen)
{
#ifdef WOLFSSL_SP_SMALL
    /* Must start as NULL: the cleanup at the end tests d even when a
     * parameter check fails before any allocation is attempted. */
    sp_digit* d = NULL;
    sp_digit* a;
    sp_digit* m;
    sp_digit* r;
    sp_digit* norm;
    sp_digit e[1];
    sp_digit mp;
    int i;
    int err = MP_OKAY;

    if (*outLen < 384)
        err = MP_TO_E;
    if (err == MP_OKAY && (mp_count_bits(em) > 22 || inLen > 384 ||
                                                     mp_count_bits(mm) != 3072))
        err = MP_READ_E;

    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 140 * 5, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Layout: a (2 * 140), r (2 * 140), m (140); norm shares r. */
        a = d;
        r = a + 140 * 2;
        m = r + 140 * 2;
        norm = r;

        sp_3072_from_bin(a, 140, in, inLen);
#if DIGIT_BIT >= 22
        e[0] = em->dp[0];
#else
        e[0] = em->dp[0];
        if (em->used > 1)
            e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT;
#endif
        if (e[0] == 0)
            err = MP_EXPTMOD_E;
    }

    if (err == MP_OKAY) {
        sp_3072_from_mp(m, 140, mm);

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_140(norm, m);
    }
    if (err == MP_OKAY) {
        /* Bring the base into Montgomery form. */
        sp_3072_mul_140(a, a, norm);
        err = sp_3072_mod_140(a, a, m);
    }
    if (err == MP_OKAY) {
        /* Find the highest set bit of the exponent. */
        for (i=21; i>=0; i--)
            if (e[0] >> i)
                break;

        /* Square and multiply over the remaining bits, high to low. */
        XMEMCPY(r, a, sizeof(sp_digit) * 140 * 2);
        for (i--; i>=0; i--) {
            sp_3072_mont_sqr_140(r, r, m, mp);

            if (((e[0] >> i) & 1) == 1)
                sp_3072_mont_mul_140(r, r, a, m, mp);
        }
        sp_3072_mont_reduce_140(r, m, mp);
        /* mp is reused to hold the compare result. */
        mp = sp_3072_cmp_140(r, m);
        sp_3072_cond_sub_140(r, r, m, (mp < 0) - 1);

        sp_3072_to_bin(r, out);
        *outLen = 384;
    }

    if (d != NULL)
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);

    return err;
#else
#if !defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SMALL_STACK)
    sp_digit ad[280], md[140], rd[280];
#else
    sp_digit* d = NULL;
#endif
    sp_digit* a;
    sp_digit* m;
    sp_digit* r;
    sp_digit e[1];
    int err = MP_OKAY;

    if (*outLen < 384)
        err = MP_TO_E;
    if (err == MP_OKAY && (mp_count_bits(em) > 22 || inLen > 384 ||
                                                     mp_count_bits(mm) != 3072))
        err = MP_READ_E;

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 140 * 5, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Layout: a (2 * 140), r (2 * 140), m (140). */
        a = d;
        r = a + 140 * 2;
        m = r + 140 * 2;
    }
#else
    a = ad;
    m = md;
    r = rd;
#endif

    if (err == MP_OKAY) {
        sp_3072_from_bin(a, 140, in, inLen);
#if DIGIT_BIT >= 22
        e[0] = em->dp[0];
#else
        e[0] = em->dp[0];
        if (em->used > 1)
            e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT;
#endif
        if (e[0] == 0)
            err = MP_EXPTMOD_E;
    }
    if (err == MP_OKAY) {
        sp_3072_from_mp(m, 140, mm);

        if (e[0] == 0x3) {
            /* Exponent 3: one square and one multiply, each followed by a
             * plain modular reduction. */
            if (err == MP_OKAY) {
                sp_3072_sqr_140(r, a);
                err = sp_3072_mod_140(r, r, m);
            }
            if (err == MP_OKAY) {
                sp_3072_mul_140(r, a, r);
                err = sp_3072_mod_140(r, r, m);
            }
        }
        else {
            sp_digit* norm = r;
            int i;
            sp_digit mp;

            sp_3072_mont_setup(m, &mp);
            sp_3072_mont_norm_140(norm, m);

            if (err == MP_OKAY) {
                /* Bring the base into Montgomery form. */
                sp_3072_mul_140(a, a, norm);
                err = sp_3072_mod_140(a, a, m);
            }

            if (err == MP_OKAY) {
                /* Find the highest set bit of the exponent. */
                for (i=21; i>=0; i--)
                    if (e[0] >> i)
                        break;

                /* Square and multiply over the remaining bits. */
                XMEMCPY(r, a, sizeof(sp_digit) * 280);
                for (i--; i>=0; i--) {
                    sp_3072_mont_sqr_140(r, r, m, mp);

                    if (((e[0] >> i) & 1) == 1)
                        sp_3072_mont_mul_140(r, r, a, m, mp);
                }
                sp_3072_mont_reduce_140(r, m, mp);
                /* mp is reused to hold the compare result. */
                mp = sp_3072_cmp_140(r, m);
                sp_3072_cond_sub_140(r, r, m, (mp < 0) - 1);
            }
        }
    }

    if (err == MP_OKAY) {
        sp_3072_to_bin(r, out);
        *outLen = 384;
    }

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (d != NULL)
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#endif /* WOLFSSL_SP_SMALL */
}

/* RSA private key operation.
 *
 * With SP_RSA_PRIVATE_EXP_D the result is in^dm mod mm and the CRT
 * parameters are ignored. Otherwise the CRT parameters are used and dm is
 * ignored. Buffers holding key material are cleared before returning.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base.
 * dm      Private exponent.
 * pm      First prime.
 * qm      Second prime.
 * dpm     First prime's CRT exponent.
 * dqm     Second prime's CRT exponent.
 * qim     Inverse of second prime mod p.
 * mm      Modulus.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 384 bytes long.
 * outLen  On entry, the size of out in bytes. On success, set to 384.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long and MEMORY_E when dynamic memory allocation fails.
 */
int sp_RsaPrivate_3072(const byte* in, word32 inLen, mp_int* dm,
    mp_int* pm, mp_int* qm, mp_int* dpm, mp_int* dqm, mp_int* qim, mp_int* mm,
    byte* out, word32* outLen)
{
#ifdef SP_RSA_PRIVATE_EXP_D
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    sp_digit* a;
    sp_digit* d = NULL;
    sp_digit* m;
    sp_digit* r;
    int err = MP_OKAY;

    (void)pm;
    (void)qm;
    (void)dpm;
    (void)dqm;
    (void)qim;

    if (*outLen < 384)
        err = MP_TO_E;
    if (err == MP_OKAY && (mp_count_bits(dm) > 3072 || inLen > 384 ||
                                                     mp_count_bits(mm) != 3072))
        err = MP_READ_E;

    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 140 * 4, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }
    if (err == MP_OKAY) {
        /* Layout: d (140), a (140), m (140); the result r shares a. */
        a = d + 140;
        m = a + 140;
        r = a;

        sp_3072_from_bin(a, 140, in, inLen);
        sp_3072_from_mp(d, 140, dm);
        sp_3072_from_mp(m, 140, mm);
        err = sp_3072_mod_exp_140(r, a, d, 3072, m, 0);
    }
    if (err == MP_OKAY) {
        sp_3072_to_bin(r, out);
        *outLen = 384;
    }

    if (d != NULL) {
        /* Clear the private exponent before releasing the buffer. */
        XMEMSET(d, 0, sizeof(sp_digit) * 140);
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }

    return err;
#else
    sp_digit a[280], d[140], m[140];
    sp_digit* r = a;
    int err = MP_OKAY;

    (void)pm;
    (void)qm;
    (void)dpm;
    (void)dqm;
    (void)qim;

    if (*outLen < 384)
        err = MP_TO_E;
    if (err == MP_OKAY && (mp_count_bits(dm) > 3072 || inLen > 384 ||
                                                     mp_count_bits(mm) != 3072))
        err = MP_READ_E;

    if (err == MP_OKAY) {
        sp_3072_from_bin(a, 140, in, inLen);
        sp_3072_from_mp(d, 140, dm);
        sp_3072_from_mp(m, 140, mm);
        err = sp_3072_mod_exp_140(r, a, d, 3072, m, 0);
    }

    if (err == MP_OKAY) {
        sp_3072_to_bin(r, out);
        *outLen = 384;
    }

    /* Clear the private exponent. */
    XMEMSET(d, 0, sizeof(sp_digit) * 140);

    return err;
#endif /* WOLFSSL_SP_SMALL || defined(WOLFSSL_SMALL_STACK) */
#else
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    sp_digit* t = NULL;
    sp_digit* a;
    sp_digit* p;
    sp_digit* q;
    sp_digit* dp;
    sp_digit* dq;
    sp_digit* qi;
    sp_digit* tmp;
    sp_digit* tmpa;
    sp_digit* tmpb;
    sp_digit* r;
    int err = MP_OKAY;

    (void)dm;
    (void)mm;

    if (*outLen < 384)
        err = MP_TO_E;
    if (err == MP_OKAY && (inLen > 384 || mp_count_bits(mm) != 3072))
        err = MP_READ_E;

    if (err == MP_OKAY) {
        t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 70 * 11, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (t == NULL)
            err = MEMORY_E;
    }
    if (err == MP_OKAY) {
        /* Layout: a (2 * 140), p (70), q (70), one shared 70 digit slot for
         * dp, dq and qi (loaded one after another), tmpa (140), tmpb (140).
         * tmp and r reuse the space of a once a is no longer needed. */
        a = t;
        p = a + 140 * 2;
        q = p + 70;
        qi = dq = dp = q + 70;
        tmpa = qi + 70;
        tmpb = tmpa + 140;

        tmp = t;
        r = tmp + 140;

        sp_3072_from_bin(a, 140, in, inLen);
        sp_3072_from_mp(p, 70, pm);
        sp_3072_from_mp(q, 70, qm);
        sp_3072_from_mp(dp, 70, dpm);
        /* tmpa = a^dp mod p */
        err = sp_3072_mod_exp_70(tmpa, a, dp, 1536, p, 1);
    }
    if (err == MP_OKAY) {
        sp_3072_from_mp(dq, 70, dqm);
        /* tmpb = a^dq mod q */
        err = sp_3072_mod_exp_70(tmpb, a, dq, 1536, q, 1);
    }
    if (err == MP_OKAY) {
        /* tmpa = (tmpa - tmpb) + (p when the difference is negative), then
         * multiplied by qi and reduced mod p. */
        sp_3072_sub_70(tmpa, tmpa, tmpb);
        sp_3072_mask_70(tmp, p, tmpa[69] >> 31);
        sp_3072_add_70(tmpa, tmpa, tmp);

        sp_3072_from_mp(qi, 70, qim);
        sp_3072_mul_70(tmpa, tmpa, qi);
        err = sp_3072_mod_70(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        /* r = tmpb + q * tmpa */
        sp_3072_mul_70(tmpa, q, tmpa);
        sp_3072_add_140(r, tmpb, tmpa);
        sp_3072_norm_140(r);

        sp_3072_to_bin(r, out);
        *outLen = 384;
    }

    if (t != NULL) {
        /* Clear all key material and intermediates before release. */
        XMEMSET(t, 0, sizeof(sp_digit) * 70 * 11);
        XFREE(t, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }

    return err;
#else
    sp_digit a[140 * 2];
    sp_digit p[70], q[70], dp[70], dq[70], qi[70];
    sp_digit tmp[140], tmpa[140], tmpb[140];
    sp_digit* r = a;
    int err = MP_OKAY;

    (void)dm;
    (void)mm;

    if (*outLen < 384)
        err = MP_TO_E;
    if (err == MP_OKAY && (inLen > 384 || mp_count_bits(mm) != 3072))
        err = MP_READ_E;

    if (err == MP_OKAY) {
        sp_3072_from_bin(a, 140, in, inLen);
        sp_3072_from_mp(p, 70, pm);
        sp_3072_from_mp(q, 70, qm);
        sp_3072_from_mp(dp, 70, dpm);
        sp_3072_from_mp(dq, 70, dqm);
        sp_3072_from_mp(qi, 70, qim);

        /* tmpa = a^dp mod p */
        err = sp_3072_mod_exp_70(tmpa, a, dp, 1536, p, 1);
    }
    if (err == MP_OKAY)
        /* tmpb = a^dq mod q */
        err = sp_3072_mod_exp_70(tmpb, a, dq, 1536, q, 1);

    if (err == MP_OKAY) {
        /* tmpa = (tmpa - tmpb) + (p when the difference is negative), then
         * multiplied by qi and reduced mod p. */
        sp_3072_sub_70(tmpa, tmpa, tmpb);
        sp_3072_mask_70(tmp, p, tmpa[69] >> 31);
        sp_3072_add_70(tmpa, tmpa, tmp);
        sp_3072_mul_70(tmpa, tmpa, qi);
        err = sp_3072_mod_70(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        /* r = tmpb + tmpa * q */
        sp_3072_mul_70(tmpa, tmpa, q);
        sp_3072_add_140(r, tmpb, tmpa);
        sp_3072_norm_140(r);

        sp_3072_to_bin(r, out);
        *outLen = 384;
    }

    /* tmp may hold a masked copy of p, so it is cleared together with the
     * other buffers that carry key material or intermediates. */
    XMEMSET(tmp, 0, sizeof(tmp));
    XMEMSET(tmpa, 0, sizeof(tmpa));
    XMEMSET(tmpb, 0, sizeof(tmpb));
    XMEMSET(p, 0, sizeof(p));
    XMEMSET(q, 0, sizeof(q));
    XMEMSET(dp, 0, sizeof(dp));
    XMEMSET(dq, 0, sizeof(dq));
    XMEMSET(qi, 0, sizeof(qi));

    return err;
#endif /* WOLFSSL_SP_SMALL || defined(WOLFSSL_SMALL_STACK) */
#endif /* SP_RSA_PRIVATE_EXP_D */
}

#endif /* WOLFSSL_HAVE_SP_RSA */
#ifdef WOLFSSL_HAVE_SP_DH
/* Convert an array of sp_digit to an mp_int.
 *
 * The 140 digits of a, each contributing 22 bits, are repacked into digits
 * of DIGIT_BIT bits in r. The variant used depends on how DIGIT_BIT compares
 * with 22.
 *
 * a  A single precision integer. 140 digits are read.
 * r  A multi-precision integer. Grown to hold at least 3072 bits.
 * returns the result of mp_grow; MP_OKAY when the conversion was performed.
 */
static int sp_3072_to_mp(sp_digit* a, mp_int* r)
{
    int err;

    err = mp_grow(r, (3072 + DIGIT_BIT - 1) / DIGIT_BIT);
    if (err == MP_OKAY) {
#if DIGIT_BIT == 22
        /* Same digit width: copy the digits straight across. */
        XMEMCPY(r->dp, a, sizeof(sp_digit) * 140);
        r->used = 140;
        mp_clamp(r);
#elif DIGIT_BIT < 22
        /* Narrower mp digits: each source digit spills into several
         * destination digits. j indexes r->dp and s tracks the bit offset. */
        int i, j = 0, s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 140; i++) {
            r->dp[j] |= a[i] << s;
            r->dp[j] &= (1l << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
            r->dp[++j] = a[i] >> s;
            while (s + DIGIT_BIT <= 22) {
                s += DIGIT_BIT;
                r->dp[j] &= (1l << DIGIT_BIT) - 1;
                r->dp[++j] = a[i] >> s;
            }
            s = 22 - s;
        }
        r->used = (3072 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#else
        /* Wider mp digits: several source digits are gathered into each
         * destination digit. j indexes r->dp and s is the bit offset at
         * which the next source digit is placed. */
        int i, j = 0, s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 140; i++) {
            r->dp[j] |= ((mp_digit)a[i]) << s;
            if (s + 22 >= DIGIT_BIT) {
                /* The current destination digit is full: mask it where the
                 * preprocessor check requires, then start the next digit
                 * with the bits that did not fit. */
    #if DIGIT_BIT < 32
                r->dp[j] &= (1l << DIGIT_BIT) - 1;
    #endif
                s = DIGIT_BIT - s;
                r->dp[++j] = a[i] >> s;
                s = 22 - s;
            }
            else
                s += 22;
        }
        r->used = (3072 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#endif
    }

    return err;
}

/* Perform the modular exponentiation for Diffie-Hellman.
 *
 * Computes res = base^exp mod mod. The base and exponent may be at most
 * 3072 bits long and the modulus must be exactly 3072 bits long.
 *
 * base  Base. MP integer.
 * exp   Exponent. MP integer.
 * mod   Modulus. MP integer.
 * res   Result. MP integer.
 * returns 0 on success, MP_READ_E if there are too many bytes in an array
 * and MEMORY_E if memory allocation fails.
 */
int sp_ModExp_3072(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res)
{
#ifdef WOLFSSL_SP_SMALL
    int err = MP_OKAY;
    sp_digit* d = NULL;
    sp_digit* b;
    sp_digit* e;
    sp_digit* m;
    sp_digit* r;
    int expBits = mp_count_bits(exp);

    if (mp_count_bits(base) > 3072 || expBits > 3072 ||
                                                   mp_count_bits(mod) != 3072) {
        err = MP_READ_E;
    }

    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(*d) * 140 * 4, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Layout: b (2 * 140), e (140), m (140); the result r shares b. */
        b = d;
        e = b + 140 * 2;
        m = e + 140;
        r = b;

        sp_3072_from_mp(b, 140, base);
        sp_3072_from_mp(e, 140, exp);
        sp_3072_from_mp(m, 140, mod);

        err = sp_3072_mod_exp_140(r, b, e, expBits, m, 0);
    }

    if (err == MP_OKAY) {
        err = sp_3072_to_mp(r, res);
    }

    if (d != NULL) {
        /* Clear the exponent before releasing the buffer. */
        XMEMSET(e, 0, sizeof(sp_digit) * 140);
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }
    return err;
#else
#ifndef WOLFSSL_SMALL_STACK
    sp_digit bd[280], ed[140], md[140];
#else
    sp_digit* d = NULL;
#endif
    sp_digit* b;
    sp_digit* e;
    sp_digit* m;
    sp_digit* r;
    int err = MP_OKAY;
    int expBits = mp_count_bits(exp);

    if (mp_count_bits(base) > 3072 || expBits > 3072 ||
                                                   mp_count_bits(mod) != 3072) {
        err = MP_READ_E;
    }

#ifdef WOLFSSL_SMALL_STACK
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(*d) * 140 * 4, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Layout: b (2 * 140), e (140), m (140); the result r shares b. */
        b = d;
        e = b + 140 * 2;
        m = e + 140;
        r = b;
    }
#else
    r = b = bd;
    e = ed;
    m = md;
#endif

    if (err == MP_OKAY) {
        sp_3072_from_mp(b, 140, base);
        sp_3072_from_mp(e, 140, exp);
        sp_3072_from_mp(m, 140, mod);

        err = sp_3072_mod_exp_140(r, b, e, expBits, m, 0);
    }

    if (err == MP_OKAY) {
        err = sp_3072_to_mp(r, res);
    }

#ifdef WOLFSSL_SMALL_STACK
    /* e only points into the heap buffer once the allocation succeeded, so
     * the exponent is cleared only in that case. */
    if (d != NULL) {
        XMEMSET(e, 0, sizeof(sp_digit) * 140);
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }
#else
    /* Clear the exponent held on the stack. */
    XMEMSET(e, 0, sizeof(sp_digit) * 140);
#endif

    return err;
#endif
}

/* Perform the modular exponentiation for Diffie-Hellman.
 *
 * Computes base^exp mod mod with the exponent supplied as big-endian bytes.
 * Leading zero bytes are stripped from the encoded result.
 *
 * base     Base.
 * exp      Array of bytes that is the exponent.
 * expLen   Length of data, in bytes, in exponent.
 * mod      Modulus.
 * out      Buffer to hold big-endian bytes of exponentiation result.
 *          Must be at least 384 bytes long.
 * outLen   Length, in bytes, of exponentiation result. Set to 384 less the
 *          number of leading zero bytes removed.
 * returns 0 on success, MP_READ_E if there are too many bytes in an array
 * and MEMORY_E if memory allocation fails.
 */
int sp_DhExp_3072(mp_int* base, const byte* exp, word32 expLen,
    mp_int* mod, byte* out, word32* outLen)
{
#ifdef WOLFSSL_SP_SMALL
    int err = MP_OKAY;
    sp_digit* d = NULL;
    sp_digit* b;
    sp_digit* e;
    sp_digit* m;
    sp_digit* r;
    word32 i;

    if (mp_count_bits(base) > 3072 || expLen > 384 ||
                                                   mp_count_bits(mod) != 3072) {
        err = MP_READ_E;
    }

    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(*d) * 140 * 4, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Layout: b (2 * 140), e (140), m (140); the result r shares b. */
        b = d;
        e = b + 140 * 2;
        m = e + 140;
        r = b;

        sp_3072_from_mp(b, 140, base);
        sp_3072_from_bin(e, 140, exp, expLen);
        sp_3072_from_mp(m, 140, mod);

        err = sp_3072_mod_exp_140(r, b, e, expLen * 8, m, 0);
    }

    if (err == MP_OKAY) {
        sp_3072_to_bin(r, out);
        *outLen = 384;
        /* Count the leading zero bytes and shift the rest to the front. */
        for (i=0; i<384 && out[i] == 0; i++) {
        }
        *outLen -= i;
        XMEMMOVE(out, out + i, *outLen);
    }

    if (d != NULL) {
        /* Clear the exponent before releasing the buffer. */
        XMEMSET(e, 0, sizeof(sp_digit) * 140);
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }
    return err;
#else
#ifndef WOLFSSL_SMALL_STACK
    sp_digit bd[280], ed[140], md[140];
#else
    sp_digit* d = NULL;
#endif
    sp_digit* b;
    sp_digit* e;
    sp_digit* m;
    sp_digit* r;
    word32 i;
    int err = MP_OKAY;

    if (mp_count_bits(base) > 3072 || expLen > 384 ||
                                                   mp_count_bits(mod) != 3072) {
        err = MP_READ_E;
    }

#ifdef WOLFSSL_SMALL_STACK
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(*d) * 140 * 4, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Layout: b (2 * 140), e (140), m (140); the result r shares b. */
        b = d;
        e = b + 140 * 2;
        m = e + 140;
        r = b;
    }
#else
    r = b = bd;
    e = ed;
    m = md;
#endif

    if (err == MP_OKAY) {
        sp_3072_from_mp(b, 140, base);
        sp_3072_from_bin(e, 140, exp, expLen);
        sp_3072_from_mp(m, 140, mod);

        err = sp_3072_mod_exp_140(r, b, e, expLen * 8, m, 0);
    }

    if (err == MP_OKAY) {
        sp_3072_to_bin(r, out);
        *outLen = 384;
        /* Count the leading zero bytes and shift the rest to the front. */
        for (i=0; i<384 && out[i] == 0; i++) {
        }
        *outLen -= i;
        XMEMMOVE(out, out + i, *outLen);
    }

#ifdef WOLFSSL_SMALL_STACK
    /* e only points into the heap buffer once the allocation succeeded, so
     * the exponent is cleared only in that case. */
    if (d != NULL) {
        XMEMSET(e, 0, sizeof(sp_digit) * 140);
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }
#else
    /* Clear the exponent held on the stack. */
    XMEMSET(e, 0, sizeof(sp_digit) * 140);
#endif

    return err;
#endif
}
#endif /* WOLFSSL_HAVE_SP_DH */

#endif /* WOLFSSL_SP_NO_3072 */
#endif /* SP_WORD_SIZE == 32 */

#endif
#if !defined(WOLFSSL_X86_64_BUILD) || !defined(USE_INTEL_SPEEDUP)
#if SP_WORD_SIZE == 64
#ifndef WOLFSSL_SP_NO_2048
/* Read a big endian unsigned byte array into r as 57-bit digits.
 * Bytes are consumed from the end of the array (least significant first).
 * Conversion stops once max digits are filled; unused digits are zeroed.
 *
 * r    A single precision integer.
 * max  Number of digits available in r.
 * a    Byte array.
 * n    Number of bytes in array to read.
 */
static void sp_2048_from_bin(sp_digit* r, int max, const byte* a, int n)
{
    int idx = 0;      /* digit of r currently being filled */
    int shift = 0;    /* bit offset of the next byte within that digit */
    int pos = n - 1;  /* byte being consumed */

    r[0] = 0;
    while (pos >= 0) {
        r[idx] |= ((sp_digit)a[pos]) << shift;
        if (shift < 49) {
            /* The whole byte fitted; the next one goes 8 bits higher. */
            shift += 8;
        }
        else {
            /* The byte reached or crossed bit 57: trim this digit and carry
             * the overflowing bits into the next one. */
            r[idx] &= 0x1ffffffffffffffl;
            shift = 57 - shift;
            if (idx + 1 >= max)
                break;
            idx++;
            r[idx] = a[pos] >> shift;
            shift = 8 - shift;
        }
        pos--;
    }

    /* Zero every digit above the last one written. */
    for (idx++; idx < max; idx++)
        r[idx] = 0;
}

/* Convert an mp_int to an array of sp_digit.
 *
 * The DIGIT_BIT wide digits of a are repacked into 57-bit digits in r. The
 * variant used depends on how DIGIT_BIT compares with 57. Digits of r above
 * the converted value are set to zero.
 *
 * r    A single precision integer.
 * max  Number of digits available in r.
 * a    A multi-precision integer.
 */
static void sp_2048_from_mp(sp_digit* r, int max, mp_int* a)
{
#if DIGIT_BIT == 57
    int j;

    /* Same digit width: copy the used digits straight across.
     * NOTE(review): a->used is not checked against max here - presumably
     * callers guarantee the value fits; confirm. */
    XMEMCPY(r, a->dp, sizeof(sp_digit) * a->used);

    for (j = a->used; j < max; j++)
        r[j] = 0;
#elif DIGIT_BIT > 57
    /* Wider mp digits: each source digit spills into more than one
     * destination digit. j indexes r and s tracks the bit offset. */
    int i, j = 0, s = 0;

    r[0] = 0;
    for (i = 0; i < a->used && j < max; i++) {
        r[j] |= a->dp[i] << s;
        r[j] &= 0x1ffffffffffffffl;
        s = 57 - s;
        /* Stop once the last available digit of r has been filled. */
        if (j + 1 >= max)
            break;
        r[++j] = a->dp[i] >> s;
        while (s + 57 <= DIGIT_BIT) {
            s += 57;
            r[j] &= 0x1ffffffffffffffl;
            if (j + 1 >= max)
                break;
            if (s < DIGIT_BIT)
                r[++j] = a->dp[i] >> s;
            else
                r[++j] = 0;
        }
        s = DIGIT_BIT - s;
    }

    for (j++; j < max; j++)
        r[j] = 0;
#else
    /* Narrower mp digits: several source digits are gathered into each
     * destination digit. s is the bit offset for the next source digit. */
    int i, j = 0, s = 0;

    r[0] = 0;
    for (i = 0; i < a->used && j < max; i++) {
        r[j] |= ((sp_digit)a->dp[i]) << s;
        if (s + DIGIT_BIT >= 57) {
            /* Destination digit is full: trim it to 57 bits and carry the
             * remaining bits of this source digit into the next one. */
            r[j] &= 0x1ffffffffffffffl;
            if (j + 1 >= max)
                break;
            s = 57 - s;
            r[++j] = a->dp[i] >> s;
            s = DIGIT_BIT - s;
        }
        else
            s += DIGIT_BIT;
    }

    for (j++; j < max; j++)
        r[j] = 0;
#endif
}

/* Write r as big endian to byte array.
 * Fixed length number of bytes written: 256
 *
 * The digits of r are normalized in place first (carries are pushed upwards
 * so that digits 0..34 each hold 57 bits), so r is modified by this call.
 *
 * r  A single precision integer (36 digits are read).
 * a  Byte array.
 */
static void sp_2048_to_bin(sp_digit* r, byte* a)
{
    int i, j, s = 0, b;

    /* Propagate carries so the low 35 digits are each within 57 bits. */
    for (i=0; i<35; i++) {
        r[i+1] += r[i] >> 57;
        r[i] &= 0x1ffffffffffffffl;
    }
    /* j walks the output from the last (least significant) byte backwards. */
    j = 2048 / 8 - 1;
    a[j] = 0;
    for (i=0; i<36 && j>=0; i++) {
        /* b is the number of bits of r[i] emitted so far; s is the number of
         * bits already occupied in the byte the previous digit ended in. */
        b = 0;
        /* Top up the partially filled byte with the low bits of r[i]. */
        a[j--] |= r[i] << s; b += 8 - s;
        if (j < 0)
            break;
        /* Emit the rest of the 57 bits one byte at a time. */
        while (b < 57) {
            a[j--] = r[i] >> b; b += 8;
            if (j < 0)
                break;
        }
        if (j < 0)
            break;
        /* b - 57 bits of the last byte written are still free. */
        s = 8 - (b - 57);
        a[j] = 0;
        /* Step back onto the partially filled byte for the next digit. */
        if (s != 0)
            j++;
    }
}

#ifndef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * Fully unrolled schoolbook multiplication of two 9 digit numbers. Every
 * column sum is computed into a local before any digit of r is stored, so
 * the result may overlap the inputs (sp_2048_mul_18 relies on this).
 *
 * r  A single precision integer (18 digits are written).
 * a  A single precision integer (9 digits are read).
 * b  A single precision integer (9 digits are read).
 */
static void sp_2048_mul_9(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    /* tN is the sum of all products a[i] * b[j] with i + j == N. */
    int128_t t0   = ((int128_t)a[ 0]) * b[ 0];
    int128_t t1   = ((int128_t)a[ 0]) * b[ 1]
                 + ((int128_t)a[ 1]) * b[ 0];
    int128_t t2   = ((int128_t)a[ 0]) * b[ 2]
                 + ((int128_t)a[ 1]) * b[ 1]
                 + ((int128_t)a[ 2]) * b[ 0];
    int128_t t3   = ((int128_t)a[ 0]) * b[ 3]
                 + ((int128_t)a[ 1]) * b[ 2]
                 + ((int128_t)a[ 2]) * b[ 1]
                 + ((int128_t)a[ 3]) * b[ 0];
    int128_t t4   = ((int128_t)a[ 0]) * b[ 4]
                 + ((int128_t)a[ 1]) * b[ 3]
                 + ((int128_t)a[ 2]) * b[ 2]
                 + ((int128_t)a[ 3]) * b[ 1]
                 + ((int128_t)a[ 4]) * b[ 0];
    int128_t t5   = ((int128_t)a[ 0]) * b[ 5]
                 + ((int128_t)a[ 1]) * b[ 4]
                 + ((int128_t)a[ 2]) * b[ 3]
                 + ((int128_t)a[ 3]) * b[ 2]
                 + ((int128_t)a[ 4]) * b[ 1]
                 + ((int128_t)a[ 5]) * b[ 0];
    int128_t t6   = ((int128_t)a[ 0]) * b[ 6]
                 + ((int128_t)a[ 1]) * b[ 5]
                 + ((int128_t)a[ 2]) * b[ 4]
                 + ((int128_t)a[ 3]) * b[ 3]
                 + ((int128_t)a[ 4]) * b[ 2]
                 + ((int128_t)a[ 5]) * b[ 1]
                 + ((int128_t)a[ 6]) * b[ 0];
    int128_t t7   = ((int128_t)a[ 0]) * b[ 7]
                 + ((int128_t)a[ 1]) * b[ 6]
                 + ((int128_t)a[ 2]) * b[ 5]
                 + ((int128_t)a[ 3]) * b[ 4]
                 + ((int128_t)a[ 4]) * b[ 3]
                 + ((int128_t)a[ 5]) * b[ 2]
                 + ((int128_t)a[ 6]) * b[ 1]
                 + ((int128_t)a[ 7]) * b[ 0];
    int128_t t8   = ((int128_t)a[ 0]) * b[ 8]
                 + ((int128_t)a[ 1]) * b[ 7]
                 + ((int128_t)a[ 2]) * b[ 6]
                 + ((int128_t)a[ 3]) * b[ 5]
                 + ((int128_t)a[ 4]) * b[ 4]
                 + ((int128_t)a[ 5]) * b[ 3]
                 + ((int128_t)a[ 6]) * b[ 2]
                 + ((int128_t)a[ 7]) * b[ 1]
                 + ((int128_t)a[ 8]) * b[ 0];
    int128_t t9   = ((int128_t)a[ 1]) * b[ 8]
                 + ((int128_t)a[ 2]) * b[ 7]
                 + ((int128_t)a[ 3]) * b[ 6]
                 + ((int128_t)a[ 4]) * b[ 5]
                 + ((int128_t)a[ 5]) * b[ 4]
                 + ((int128_t)a[ 6]) * b[ 3]
                 + ((int128_t)a[ 7]) * b[ 2]
                 + ((int128_t)a[ 8]) * b[ 1];
    int128_t t10  = ((int128_t)a[ 2]) * b[ 8]
                 + ((int128_t)a[ 3]) * b[ 7]
                 + ((int128_t)a[ 4]) * b[ 6]
                 + ((int128_t)a[ 5]) * b[ 5]
                 + ((int128_t)a[ 6]) * b[ 4]
                 + ((int128_t)a[ 7]) * b[ 3]
                 + ((int128_t)a[ 8]) * b[ 2];
    int128_t t11  = ((int128_t)a[ 3]) * b[ 8]
                 + ((int128_t)a[ 4]) * b[ 7]
                 + ((int128_t)a[ 5]) * b[ 6]
                 + ((int128_t)a[ 6]) * b[ 5]
                 + ((int128_t)a[ 7]) * b[ 4]
                 + ((int128_t)a[ 8]) * b[ 3];
    int128_t t12  = ((int128_t)a[ 4]) * b[ 8]
                 + ((int128_t)a[ 5]) * b[ 7]
                 + ((int128_t)a[ 6]) * b[ 6]
                 + ((int128_t)a[ 7]) * b[ 5]
                 + ((int128_t)a[ 8]) * b[ 4];
    int128_t t13  = ((int128_t)a[ 5]) * b[ 8]
                 + ((int128_t)a[ 6]) * b[ 7]
                 + ((int128_t)a[ 7]) * b[ 6]
                 + ((int128_t)a[ 8]) * b[ 5];
    int128_t t14  = ((int128_t)a[ 6]) * b[ 8]
                 + ((int128_t)a[ 7]) * b[ 7]
                 + ((int128_t)a[ 8]) * b[ 6];
    int128_t t15  = ((int128_t)a[ 7]) * b[ 8]
                 + ((int128_t)a[ 8]) * b[ 7];
    int128_t t16  = ((int128_t)a[ 8]) * b[ 8];

    /* Carry the bits above 57 of each column into the next column and store
     * the low 57 bits as the result digit. */
    t1   += t0  >> 57; r[ 0] = t0  & 0x1ffffffffffffffl;
    t2   += t1  >> 57; r[ 1] = t1  & 0x1ffffffffffffffl;
    t3   += t2  >> 57; r[ 2] = t2  & 0x1ffffffffffffffl;
    t4   += t3  >> 57; r[ 3] = t3  & 0x1ffffffffffffffl;
    t5   += t4  >> 57; r[ 4] = t4  & 0x1ffffffffffffffl;
    t6   += t5  >> 57; r[ 5] = t5  & 0x1ffffffffffffffl;
    t7   += t6  >> 57; r[ 6] = t6  & 0x1ffffffffffffffl;
    t8   += t7  >> 57; r[ 7] = t7  & 0x1ffffffffffffffl;
    t9   += t8  >> 57; r[ 8] = t8  & 0x1ffffffffffffffl;
    t10  += t9  >> 57; r[ 9] = t9  & 0x1ffffffffffffffl;
    t11  += t10 >> 57; r[10] = t10 & 0x1ffffffffffffffl;
    t12  += t11 >> 57; r[11] = t11 & 0x1ffffffffffffffl;
    t13  += t12 >> 57; r[12] = t12 & 0x1ffffffffffffffl;
    t14  += t13 >> 57; r[13] = t13 & 0x1ffffffffffffffl;
    t15  += t14 >> 57; r[14] = t14 & 0x1ffffffffffffffl;
    t16  += t15 >> 57; r[15] = t15 & 0x1ffffffffffffffl;
    /* The top column supplies the last two digits. */
    r[17] = (sp_digit)(t16 >> 57);
                       r[16] = t16 & 0x1ffffffffffffffl;
}

/* Square a and put result in r. (r = a * a)
 *
 * Fully unrolled squaring of a 9 digit number. Cross products a[i] * a[j]
 * with i != j are added once and doubled; the a[i] * a[i] terms are added
 * once. Every column is computed into a local before any digit of r is
 * stored, so r may overlap a (sp_2048_sqr_18 relies on this).
 *
 * r  A single precision integer (18 digits are written).
 * a  A single precision integer (9 digits are read).
 */
static void sp_2048_sqr_9(sp_digit* r, const sp_digit* a)
{
    /* tN is the sum of all products a[i] * a[j] with i + j == N. */
    int128_t t0   =  ((int128_t)a[ 0]) * a[ 0];
    int128_t t1   = (((int128_t)a[ 0]) * a[ 1]) * 2;
    int128_t t2   = (((int128_t)a[ 0]) * a[ 2]) * 2
                 +  ((int128_t)a[ 1]) * a[ 1];
    int128_t t3   = (((int128_t)a[ 0]) * a[ 3]
                 +  ((int128_t)a[ 1]) * a[ 2]) * 2;
    int128_t t4   = (((int128_t)a[ 0]) * a[ 4]
                 +  ((int128_t)a[ 1]) * a[ 3]) * 2
                 +  ((int128_t)a[ 2]) * a[ 2];
    int128_t t5   = (((int128_t)a[ 0]) * a[ 5]
                 +  ((int128_t)a[ 1]) * a[ 4]
                 +  ((int128_t)a[ 2]) * a[ 3]) * 2;
    int128_t t6   = (((int128_t)a[ 0]) * a[ 6]
                 +  ((int128_t)a[ 1]) * a[ 5]
                 +  ((int128_t)a[ 2]) * a[ 4]) * 2
                 +  ((int128_t)a[ 3]) * a[ 3];
    int128_t t7   = (((int128_t)a[ 0]) * a[ 7]
                 +  ((int128_t)a[ 1]) * a[ 6]
                 +  ((int128_t)a[ 2]) * a[ 5]
                 +  ((int128_t)a[ 3]) * a[ 4]) * 2;
    int128_t t8   = (((int128_t)a[ 0]) * a[ 8]
                 +  ((int128_t)a[ 1]) * a[ 7]
                 +  ((int128_t)a[ 2]) * a[ 6]
                 +  ((int128_t)a[ 3]) * a[ 5]) * 2
                 +  ((int128_t)a[ 4]) * a[ 4];
    int128_t t9   = (((int128_t)a[ 1]) * a[ 8]
                 +  ((int128_t)a[ 2]) * a[ 7]
                 +  ((int128_t)a[ 3]) * a[ 6]
                 +  ((int128_t)a[ 4]) * a[ 5]) * 2;
    int128_t t10  = (((int128_t)a[ 2]) * a[ 8]
                 +  ((int128_t)a[ 3]) * a[ 7]
                 +  ((int128_t)a[ 4]) * a[ 6]) * 2
                 +  ((int128_t)a[ 5]) * a[ 5];
    int128_t t11  = (((int128_t)a[ 3]) * a[ 8]
                 +  ((int128_t)a[ 4]) * a[ 7]
                 +  ((int128_t)a[ 5]) * a[ 6]) * 2;
    int128_t t12  = (((int128_t)a[ 4]) * a[ 8]
                 +  ((int128_t)a[ 5]) * a[ 7]) * 2
                 +  ((int128_t)a[ 6]) * a[ 6];
    int128_t t13  = (((int128_t)a[ 5]) * a[ 8]
                 +  ((int128_t)a[ 6]) * a[ 7]) * 2;
    int128_t t14  = (((int128_t)a[ 6]) * a[ 8]) * 2
                 +  ((int128_t)a[ 7]) * a[ 7];
    int128_t t15  = (((int128_t)a[ 7]) * a[ 8]) * 2;
    int128_t t16  =  ((int128_t)a[ 8]) * a[ 8];

    /* Carry the bits above 57 of each column into the next column and store
     * the low 57 bits as the result digit. */
    t1   += t0  >> 57; r[ 0] = t0  & 0x1ffffffffffffffl;
    t2   += t1  >> 57; r[ 1] = t1  & 0x1ffffffffffffffl;
    t3   += t2  >> 57; r[ 2] = t2  & 0x1ffffffffffffffl;
    t4   += t3  >> 57; r[ 3] = t3  & 0x1ffffffffffffffl;
    t5   += t4  >> 57; r[ 4] = t4  & 0x1ffffffffffffffl;
    t6   += t5  >> 57; r[ 5] = t5  & 0x1ffffffffffffffl;
    t7   += t6  >> 57; r[ 6] = t6  & 0x1ffffffffffffffl;
    t8   += t7  >> 57; r[ 7] = t7  & 0x1ffffffffffffffl;
    t9   += t8  >> 57; r[ 8] = t8  & 0x1ffffffffffffffl;
    t10  += t9  >> 57; r[ 9] = t9  & 0x1ffffffffffffffl;
    t11  += t10 >> 57; r[10] = t10 & 0x1ffffffffffffffl;
    t12  += t11 >> 57; r[11] = t11 & 0x1ffffffffffffffl;
    t13  += t12 >> 57; r[12] = t12 & 0x1ffffffffffffffl;
    t14  += t13 >> 57; r[13] = t13 & 0x1ffffffffffffffl;
    t15  += t14 >> 57; r[14] = t14 & 0x1ffffffffffffffl;
    t16  += t15 >> 57; r[15] = t15 & 0x1ffffffffffffffl;
    /* The top column supplies the last two digits. */
    r[17] = (sp_digit)(t16 >> 57);
                       r[16] = t16 & 0x1ffffffffffffffl;
}

/* Digit-wise addition of two 9 digit numbers. (r = a + b)
 * No carry is propagated between digits; the sums are left unnormalized.
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_2048_add_9(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int idx;

    for (idx = 0; idx < 9; idx++)
        r[idx] = a[idx] + b[idx];

    return 0;
}

/* Digit-wise addition of two 18 digit numbers. (r = a + b)
 * No carry is propagated between digits; the sums are left unnormalized.
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_2048_add_18(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit* out = r;
    const sp_digit* x = a;
    const sp_digit* y = b;
    int blocks;

    /* Two unrolled groups of eight digits cover digits 0..15. */
    for (blocks = 2; blocks > 0; blocks--) {
        out[0] = x[0] + y[0];
        out[1] = x[1] + y[1];
        out[2] = x[2] + y[2];
        out[3] = x[3] + y[3];
        out[4] = x[4] + y[4];
        out[5] = x[5] + y[5];
        out[6] = x[6] + y[6];
        out[7] = x[7] + y[7];
        out += 8;
        x += 8;
        y += 8;
    }
    /* Digits 16 and 17. */
    out[0] = x[0] + y[0];
    out[1] = x[1] + y[1];

    return 0;
}

/* Digit-wise subtraction of two 18 digit numbers. (r = a - b)
 * No borrow is propagated between digits; the differences are left
 * unnormalized.
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_2048_sub_18(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit* out = r;
    const sp_digit* x = a;
    const sp_digit* y = b;
    int blocks;

    /* Two unrolled groups of eight digits cover digits 0..15. */
    for (blocks = 2; blocks > 0; blocks--) {
        out[0] = x[0] - y[0];
        out[1] = x[1] - y[1];
        out[2] = x[2] - y[2];
        out[3] = x[3] - y[3];
        out[4] = x[4] - y[4];
        out[5] = x[5] - y[5];
        out[6] = x[6] - y[6];
        out[7] = x[7] - y[7];
        out += 8;
        x += 8;
        y += 8;
    }
    /* Digits 16 and 17. */
    out[0] = x[0] - y[0];
    out[1] = x[1] - y[1];

    return 0;
}

/* Multiply two 18 digit numbers into r. (r = a * b)
 *
 * One level of Karatsuba: three 9 digit multiplications (low halves, high
 * halves, and sums of halves) are combined into the 36 digit product.
 *
 * r  A single precision integer (36 digits are written).
 * a  A single precision integer.
 * b  A single precision integer.
 */
static void sp_2048_mul_18(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    sp_digit cross[18];             /* (aLo+aHi)*(bLo+bHi), then middle term */
    sp_digit bSum[9];               /* bLo + bHi */
    sp_digit* aSum = cross;         /* aLo + aHi, shares storage with cross */
    sp_digit* low = r;              /* aLo * bLo */
    sp_digit* high = r + 18;        /* aHi * bHi */
    const sp_digit* aHigh = a + 9;
    const sp_digit* bHigh = b + 9;

    /* The sums are formed before r is written. */
    sp_2048_add_9(aSum, a, aHigh);
    sp_2048_add_9(bSum, b, bHigh);
    sp_2048_mul_9(high, aHigh, bHigh);
    sp_2048_mul_9(low, a, b);
    sp_2048_mul_9(cross, aSum, bSum);
    /* middle = cross - high - low */
    sp_2048_sub_18(cross, cross, high);
    sp_2048_sub_18(cross, cross, low);
    /* Add the middle term in at digit 9. */
    sp_2048_add_18(r + 9, r + 9, cross);
}

/* Square an 18 digit number into r. (r = a * a)
 *
 * One level of Karatsuba: three 9 digit squarings (low half, high half, and
 * sum of halves) are combined into the 36 digit result.
 *
 * r  A single precision integer (36 digits are written).
 * a  A single precision integer.
 */
static void sp_2048_sqr_18(sp_digit* r, const sp_digit* a)
{
    sp_digit cross[18];             /* (aLo+aHi)^2, then the middle term */
    sp_digit* aSum = cross;         /* aLo + aHi, shares storage with cross */
    sp_digit* low = r;              /* aLo^2 */
    sp_digit* high = r + 18;        /* aHi^2 */
    const sp_digit* aHigh = a + 9;

    /* The sum is formed before r is written. */
    sp_2048_add_9(aSum, a, aHigh);
    sp_2048_sqr_9(high, aHigh);
    sp_2048_sqr_9(low, a);
    sp_2048_sqr_9(cross, aSum);
    /* middle = cross - high - low */
    sp_2048_sub_18(cross, cross, high);
    sp_2048_sub_18(cross, cross, low);
    /* Add the middle term in at digit 9. */
    sp_2048_add_18(r + 9, r + 9, cross);
}

/* Digit-wise addition of two 36 digit numbers. (r = a + b)
 * No carry is propagated between digits; the sums are left unnormalized.
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_2048_add_36(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit* out = r;
    const sp_digit* x = a;
    const sp_digit* y = b;
    int blocks;

    /* Four unrolled groups of eight digits cover digits 0..31. */
    for (blocks = 4; blocks > 0; blocks--) {
        out[0] = x[0] + y[0];
        out[1] = x[1] + y[1];
        out[2] = x[2] + y[2];
        out[3] = x[3] + y[3];
        out[4] = x[4] + y[4];
        out[5] = x[5] + y[5];
        out[6] = x[6] + y[6];
        out[7] = x[7] + y[7];
        out += 8;
        x += 8;
        y += 8;
    }
    /* Digits 32 to 35. */
    out[0] = x[0] + y[0];
    out[1] = x[1] + y[1];
    out[2] = x[2] + y[2];
    out[3] = x[3] + y[3];

    return 0;
}

/* Digit-wise subtraction of two 36 digit numbers. (r = a - b)
 * No borrow is propagated between digits; the differences are left
 * unnormalized.
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_2048_sub_36(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit* out = r;
    const sp_digit* x = a;
    const sp_digit* y = b;
    int blocks;

    /* Four unrolled groups of eight digits cover digits 0..31. */
    for (blocks = 4; blocks > 0; blocks--) {
        out[0] = x[0] - y[0];
        out[1] = x[1] - y[1];
        out[2] = x[2] - y[2];
        out[3] = x[3] - y[3];
        out[4] = x[4] - y[4];
        out[5] = x[5] - y[5];
        out[6] = x[6] - y[6];
        out[7] = x[7] - y[7];
        out += 8;
        x += 8;
        y += 8;
    }
    /* Digits 32 to 35. */
    out[0] = x[0] - y[0];
    out[1] = x[1] - y[1];
    out[2] = x[2] - y[2];
    out[3] = x[3] - y[3];

    return 0;
}

/* Multiply two 36 digit numbers into r. (r = a * b)
 *
 * One level of Karatsuba: three 18 digit multiplications (low halves, high
 * halves, and sums of halves) are combined into the 72 digit product.
 *
 * r  A single precision integer (72 digits are written).
 * a  A single precision integer.
 * b  A single precision integer.
 */
static void sp_2048_mul_36(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    sp_digit cross[36];             /* (aLo+aHi)*(bLo+bHi), then middle term */
    sp_digit bSum[18];              /* bLo + bHi */
    sp_digit* aSum = cross;         /* aLo + aHi, shares storage with cross */
    sp_digit* low = r;              /* aLo * bLo */
    sp_digit* high = r + 36;        /* aHi * bHi */
    const sp_digit* aHigh = a + 18;
    const sp_digit* bHigh = b + 18;

    /* The sums are formed before r is written. */
    sp_2048_add_18(aSum, a, aHigh);
    sp_2048_add_18(bSum, b, bHigh);
    sp_2048_mul_18(high, aHigh, bHigh);
    sp_2048_mul_18(low, a, b);
    sp_2048_mul_18(cross, aSum, bSum);
    /* middle = cross - high - low */
    sp_2048_sub_36(cross, cross, high);
    sp_2048_sub_36(cross, cross, low);
    /* Add the middle term in at digit 18. */
    sp_2048_add_36(r + 18, r + 18, cross);
}

/* Square a 36 digit number into r. (r = a * a)
 *
 * One level of Karatsuba: three 18 digit squarings (low half, high half, and
 * sum of halves) are combined into the 72 digit result.
 *
 * r  A single precision integer (72 digits are written).
 * a  A single precision integer.
 */
static void sp_2048_sqr_36(sp_digit* r, const sp_digit* a)
{
    sp_digit cross[36];             /* (aLo+aHi)^2, then the middle term */
    sp_digit* aSum = cross;         /* aLo + aHi, shares storage with cross */
    sp_digit* low = r;              /* aLo^2 */
    sp_digit* high = r + 36;        /* aHi^2 */
    const sp_digit* aHigh = a + 18;

    /* The sum is formed before r is written. */
    sp_2048_add_18(aSum, a, aHigh);
    sp_2048_sqr_18(high, aHigh);
    sp_2048_sqr_18(low, a);
    sp_2048_sqr_18(cross, aSum);
    /* middle = cross - high - low */
    sp_2048_sub_36(cross, cross, high);
    sp_2048_sub_36(cross, cross, low);
    /* Add the middle term in at digit 18. */
    sp_2048_add_36(r + 18, r + 18, cross);
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Digit-wise addition of two 36 digit numbers. (r = a + b)
 * No carry is propagated between digits; the sums are left unnormalized.
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_2048_add_36(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int idx = 0;

    while (idx < 36) {
        r[idx] = a[idx] + b[idx];
        idx++;
    }

    return 0;
}
#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Digit-wise subtraction of two 36 digit numbers. (r = a - b)
 * No borrow is propagated between digits; the differences are left
 * unnormalized.
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_2048_sub_36(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int idx = 0;

    while (idx < 36) {
        r[idx] = a[idx] - b[idx];
        idx++;
    }

    return 0;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Multiply two 36 digit numbers into r. (r = a * b)
 *
 * Column-wise multiplication working from the most significant column down.
 * The accumulator keeps the previous column's low 57 bits shifted up by 57,
 * so after adding a column its bits from 57 upwards belong to the digit
 * above the column and bits from 114 upwards are a carry into the digit
 * above that.
 *
 * r  A single precision integer (72 digits are written).
 * a  A single precision integer.
 * b  A single precision integer.
 */
static void sp_2048_mul_36(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    int col, i, first, last;
    int128_t acc;

    /* Top column (70) has the single product a[35] * b[35]. */
    acc = ((int128_t)a[35]) * b[35];
    r[71] = (sp_digit)(acc >> 57);
    acc = (acc & 0x1ffffffffffffffl) << 57;
    for (col = 69; col >= 0; col--) {
        /* Digits of a that pair with a valid digit of b in this column. */
        first = (col < 35) ? col : 35;
        last = (col > 35) ? (col - 35) : 0;
        for (i = first; i >= last; i--)
            acc += ((int128_t)a[i]) * b[col - i];

        r[col + 2] += acc >> 114;
        r[col + 1] = (acc >> 57) & 0x1ffffffffffffffl;
        acc = (acc & 0x1ffffffffffffffl) << 57;
    }
    r[0] = (sp_digit)(acc >> 57);
}

/* Square a 36 digit number into r. (r = a * a)
 *
 * Column-wise squaring working from the most significant column down.
 * Each pair a[i] * a[j] with i > j is added doubled; even columns also get
 * the single square of the middle digit.
 *
 * r  A single precision integer (72 digits are written).
 * a  A single precision integer.
 */
static void sp_2048_sqr_36(sp_digit* r, const sp_digit* a)
{
    int col, i, first, half;
    int128_t acc;

    /* Top column (70) has the single product a[35] * a[35]. */
    acc = ((int128_t)a[35]) * a[35];
    r[71] = (sp_digit)(acc >> 57);
    acc = (acc & 0x1ffffffffffffffl) << 57;
    for (col = 69; col >= 0; col--) {
        half = col / 2;
        first = (col < 35) ? col : 35;
        /* Pairs with i strictly above the middle of the column. */
        for (i = first; i > half; i--)
            acc += ((int128_t)a[i]) * a[col - i] * 2;
        /* Even column: the middle digit is multiplied by itself once. */
        if ((col & 1) == 0)
            acc += ((int128_t)a[half]) * a[half];

        r[col + 2] += acc >> 114;
        r[col + 1] = (acc >> 57) & 0x1ffffffffffffffl;
        acc = (acc & 0x1ffffffffffffffl) << 57;
    }
    r[0] = (sp_digit)(acc >> 57);
}

#endif /* WOLFSSL_SP_SMALL */
#if !defined(SP_RSA_PRIVATE_EXP_D) && defined(WOLFSSL_HAVE_SP_RSA)
#ifdef WOLFSSL_SP_SMALL
/* Digit-wise addition of two 18 digit numbers. (r = a + b)
 * No carry is propagated between digits; the sums are left unnormalized.
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_2048_add_18(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int idx = 0;

    while (idx < 18) {
        r[idx] = a[idx] + b[idx];
        idx++;
    }

    return 0;
}
#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Digit-wise subtraction of two 18 digit numbers. (r = a - b)
 * No borrow is propagated between digits; the differences are left
 * unnormalized.
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_2048_sub_18(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int idx = 0;

    while (idx < 18) {
        r[idx] = a[idx] - b[idx];
        idx++;
    }

    return 0;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Multiply two 18 digit numbers into r. (r = a * b)
 *
 * Column-wise multiplication working from the most significant column down.
 * The accumulator keeps the previous column's low 57 bits shifted up by 57,
 * so after adding a column its bits from 57 upwards belong to the digit
 * above the column and bits from 114 upwards are a carry into the digit
 * above that.
 *
 * r  A single precision integer (36 digits are written).
 * a  A single precision integer.
 * b  A single precision integer.
 */
static void sp_2048_mul_18(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    int col, i, first, last;
    int128_t acc;

    /* Top column (34) has the single product a[17] * b[17]. */
    acc = ((int128_t)a[17]) * b[17];
    r[35] = (sp_digit)(acc >> 57);
    acc = (acc & 0x1ffffffffffffffl) << 57;
    for (col = 33; col >= 0; col--) {
        /* Digits of a that pair with a valid digit of b in this column. */
        first = (col < 17) ? col : 17;
        last = (col > 17) ? (col - 17) : 0;
        for (i = first; i >= last; i--)
            acc += ((int128_t)a[i]) * b[col - i];

        r[col + 2] += acc >> 114;
        r[col + 1] = (acc >> 57) & 0x1ffffffffffffffl;
        acc = (acc & 0x1ffffffffffffffl) << 57;
    }
    r[0] = (sp_digit)(acc >> 57);
}

/* Square an 18 digit number into r. (r = a * a)
 *
 * Column-wise squaring working from the most significant column down.
 * Each pair a[i] * a[j] with i > j is added doubled; even columns also get
 * the single square of the middle digit.
 *
 * r  A single precision integer (36 digits are written).
 * a  A single precision integer.
 */
static void sp_2048_sqr_18(sp_digit* r, const sp_digit* a)
{
    int col, i, first, half;
    int128_t acc;

    /* Top column (34) has the single product a[17] * a[17]. */
    acc = ((int128_t)a[17]) * a[17];
    r[35] = (sp_digit)(acc >> 57);
    acc = (acc & 0x1ffffffffffffffl) << 57;
    for (col = 33; col >= 0; col--) {
        half = col / 2;
        first = (col < 17) ? col : 17;
        /* Pairs with i strictly above the middle of the column. */
        for (i = first; i > half; i--)
            acc += ((int128_t)a[i]) * a[col - i] * 2;
        /* Even column: the middle digit is multiplied by itself once. */
        if ((col & 1) == 0)
            acc += ((int128_t)a[half]) * a[half];

        r[col + 2] += acc >> 114;
        r[col + 1] = (acc >> 57) & 0x1ffffffffffffffl;
        acc = (acc & 0x1ffffffffffffffl) << 57;
    }
    r[0] = (sp_digit)(acc >> 57);
}

#endif /* WOLFSSL_SP_SMALL */
#endif /* !SP_RSA_PRIVATE_EXP_D && WOLFSSL_HAVE_SP_RSA */

/* Calculate the bottom digit of -1/a mod 2^n.
 *
 * The inverse of a[0] is found by Newton iteration, each step doubling the
 * number of correct low bits, and is then reduced to 57 bits and negated.
 * a[0] is expected to be odd (an even value has no inverse mod 2^n).
 *
 * a    A single precision number.
 * rho  Bottom word of inverse.
 */
static void sp_2048_mont_setup(sp_digit* a, sp_digit* rho)
{
    /* Unsigned arithmetic: the iteration relies on wrap-around mod 2^64,
     * which is only well defined for unsigned types. */
    word64 x, b;

    b = (word64)a[0];
    x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
    x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
    x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
    x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
    x *= 2 - b * x;               /* here x*a==1 mod 2**64 */
    x &= 0x1ffffffffffffffl;

    /* rho = -1/m mod b */
    /* Shift an sp_digit rather than a long: long may be only 32 bits wide. */
    *rho = ((sp_digit)1 << 57) - (sp_digit)x;
}

#if !defined(SP_RSA_PRIVATE_EXP_D) && defined(WOLFSSL_HAVE_SP_RSA)
/* r = 2^n mod m where n is the number of bits to reduce by.
 * The 18 digit value built here is 1024 one bits (17 digits of 57 bits plus
 * 55 bits in the top digit), so a single subtraction of m is used.
 * NOTE(review): this presumes m occupies the full 1024 bits - confirm with
 * the callers.
 *
 * r  A single precision number.
 * m  A single precision number.
 */
static void sp_2048_mont_norm_18(sp_digit* r, sp_digit* m)
{
    /* Set r = 2^n - 1. */
#ifdef WOLFSSL_SP_SMALL
    int idx = 0;

    while (idx < 17) {
        r[idx] = 0x1ffffffffffffffl;
        idx++;
    }
#else
    sp_digit* p = r;
    sp_digit* const stop = r + 16;

    /* Digits 0..15 in unrolled groups of eight. */
    while (p != stop) {
        p[0] = 0x1ffffffffffffffl;
        p[1] = 0x1ffffffffffffffl;
        p[2] = 0x1ffffffffffffffl;
        p[3] = 0x1ffffffffffffffl;
        p[4] = 0x1ffffffffffffffl;
        p[5] = 0x1ffffffffffffffl;
        p[6] = 0x1ffffffffffffffl;
        p[7] = 0x1ffffffffffffffl;
        p += 8;
    }
    /* Digit 16. */
    p[0] = 0x1ffffffffffffffl;
#endif
    /* The top digit only holds the remaining 55 bits. */
    r[17] = 0x7fffffffffffffl;

    /* r = (2^n - 1) mod m */
    sp_2048_sub_18(r, r, m);

    /* Add one so r = 2^n mod m */
    r[0] += 1;
}

/* Compare a with b in constant time.
 *
 * All 18 digits are always visited, from the most significant down. The
 * difference of the first pair of digits that differ is captured; once it
 * is non-zero the mask (0 - !diff) becomes zero and later digits no longer
 * contribute.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static sp_digit sp_2048_cmp_18(const sp_digit* a, const sp_digit* b)
{
    sp_digit diff = 0;
#ifdef WOLFSSL_SP_SMALL
    int idx = 18;

    while (idx-- > 0)
        diff |= (a[idx] - b[idx]) & (0 - !diff);
#else
    const sp_digit* x = a + 16;
    const sp_digit* y = b + 16;

    /* Digits 17 and 16 first. */
    diff |= (x[1] - y[1]) & (0 - !diff);
    diff |= (x[0] - y[0]) & (0 - !diff);
    /* Then digits 15..0 in unrolled groups of eight. */
    do {
        x -= 8;
        y -= 8;
        diff |= (x[7] - y[7]) & (0 - !diff);
        diff |= (x[6] - y[6]) & (0 - !diff);
        diff |= (x[5] - y[5]) & (0 - !diff);
        diff |= (x[4] - y[4]) & (0 - !diff);
        diff |= (x[3] - y[3]) & (0 - !diff);
        diff |= (x[2] - y[2]) & (0 - !diff);
        diff |= (x[1] - y[1]) & (0 - !diff);
        diff |= (x[0] - y[0]) & (0 - !diff);
    } while (x != a);
#endif /* WOLFSSL_SP_SMALL */

    return diff;
}

/* Conditionally subtract b from a using the mask m.
 * m is -1 to subtract and 0 when not. Each digit of b is ANDed with the
 * mask, so the same operations are executed either way.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 */
static void sp_2048_cond_sub_18(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
#ifdef WOLFSSL_SP_SMALL
    int idx = 0;

    while (idx < 18) {
        r[idx] = a[idx] - (b[idx] & m);
        idx++;
    }
#else
    sp_digit* out = r;
    const sp_digit* x = a;
    const sp_digit* y = b;
    int blocks;

    /* Two unrolled groups of eight digits cover digits 0..15. */
    for (blocks = 2; blocks > 0; blocks--) {
        out[0] = x[0] - (y[0] & m);
        out[1] = x[1] - (y[1] & m);
        out[2] = x[2] - (y[2] & m);
        out[3] = x[3] - (y[3] & m);
        out[4] = x[4] - (y[4] & m);
        out[5] = x[5] - (y[5] & m);
        out[6] = x[6] - (y[6] & m);
        out[7] = x[7] - (y[7] & m);
        out += 8;
        x += 8;
        y += 8;
    }
    /* Digits 16 and 17. */
    out[0] = x[0] - (y[0] & m);
    out[1] = x[1] - (y[1] & m);
#endif /* WOLFSSL_SP_SMALL */
}

/* Mul a by scalar b and add into r. (r += a * b)
 *
 * The small build carries as it goes and leaves r[0..17] masked to 57 bits.
 * The unrolled build adds the low 57 bits of each product and the high bits
 * of the previous product into each digit without masking r.
 *
 * r  A single precision integer (19 digits are updated).
 * a  A single precision integer (18 digits are read).
 * b  A scalar.
 */
static void sp_2048_mul_add_18(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
#ifdef WOLFSSL_SP_SMALL
    int128_t scalar = b;
    int128_t carry = 0;
    int idx = 0;

    while (idx < 18) {
        carry += (scalar * a[idx]) + r[idx];
        r[idx] = carry & 0x1ffffffffffffffl;
        carry >>= 57;
        idx++;
    }
    r[18] += carry;
#else
    int128_t scalar = b;
    int128_t p;     /* Products for even steps (and digit 0). */
    int128_t q;     /* Products for odd steps. */
    int i;

    p = scalar * a[0]; r[0] += p & 0x1ffffffffffffffl;
    /* Two unrolled groups of eight: digits 1..8 and 9..16. p and q alternate
     * as "current" and "previous" product. */
    for (i = 0; i < 16; i += 8) {
        q = scalar * a[i+1];
        r[i+1] += (p >> 57) + (q & 0x1ffffffffffffffl);
        p = scalar * a[i+2];
        r[i+2] += (q >> 57) + (p & 0x1ffffffffffffffl);
        q = scalar * a[i+3];
        r[i+3] += (p >> 57) + (q & 0x1ffffffffffffffl);
        p = scalar * a[i+4];
        r[i+4] += (q >> 57) + (p & 0x1ffffffffffffffl);
        q = scalar * a[i+5];
        r[i+5] += (p >> 57) + (q & 0x1ffffffffffffffl);
        p = scalar * a[i+6];
        r[i+6] += (q >> 57) + (p & 0x1ffffffffffffffl);
        q = scalar * a[i+7];
        r[i+7] += (p >> 57) + (q & 0x1ffffffffffffffl);
        p = scalar * a[i+8];
        r[i+8] += (q >> 57) + (p & 0x1ffffffffffffffl);
    }
    /* Digit 17 and the final carry into digit 18. */
    q = scalar * a[17];
    r[17] += (p >> 57) + (q & 0x1ffffffffffffffl);
    r[18] +=  q >> 57;
#endif /* WOLFSSL_SP_SMALL */
}

/* Normalize the values in each word to 57 bits.
 *
 * Carries are pushed upwards from digit 0 to digit 16; digit 17 receives the
 * last carry and is not masked.
 *
 * a  Array of sp_digit to normalize (18 digits).
 */
static void sp_2048_norm_18(sp_digit* a)
{
#ifdef WOLFSSL_SP_SMALL
    int i;
    for (i = 0; i < 17; i++) {
        a[i+1] += a[i] >> 57;
        a[i] &= 0x1ffffffffffffffl;
    }
#else
    int i;
    /* Exactly eight carry steps per stride of eight: digits 0..15. */
    for (i = 0; i < 16; i += 8) {
        a[i+1] += a[i+0] >> 57; a[i+0] &= 0x1ffffffffffffffl;
        a[i+2] += a[i+1] >> 57; a[i+1] &= 0x1ffffffffffffffl;
        a[i+3] += a[i+2] >> 57; a[i+2] &= 0x1ffffffffffffffl;
        a[i+4] += a[i+3] >> 57; a[i+3] &= 0x1ffffffffffffffl;
        a[i+5] += a[i+4] >> 57; a[i+4] &= 0x1ffffffffffffffl;
        a[i+6] += a[i+5] >> 57; a[i+5] &= 0x1ffffffffffffffl;
        a[i+7] += a[i+6] >> 57; a[i+6] &= 0x1ffffffffffffffl;
        a[i+8] += a[i+7] >> 57; a[i+7] &= 0x1ffffffffffffffl;
    }
    /* Digit 16 carries into the top digit. */
    a[16+1] += a[16] >> 57;
    a[16] &= 0x1ffffffffffffffl;
#endif
}

/* Shift the result in the high 1024 bits down to the bottom.
 *
 * Bit 1024 is bit 55 of digit 17, so the top bits of a[17] are combined
 * with a[18..35] shifted left by two to rebuild 57-bit digits. The upper 18
 * digits of r are cleared afterwards. sp_2048_mont_reduce_18 calls this
 * with r == a.
 *
 * r  A single precision number (36 digits are written).
 * a  A single precision number (digits 17..35 are read).
 */
static void sp_2048_mont_shift_18(sp_digit* r, const sp_digit* a)
{
#ifdef WOLFSSL_SP_SMALL
    word64 acc;
    int k = 0;

    acc = a[17] >> 55;
    while (k < 17) {
        acc += a[18 + k] << 2;
        r[k] = acc & 0x1ffffffffffffffl;
        acc >>= 57;
        k++;
    }
    acc += a[35] << 2;
    r[17] = acc;
#else
    word64 acc;
    const sp_digit* src = a + 18;
    sp_digit* dst = r;
    int blocks;

    acc  = a[17] >> 55;
    /* Two unrolled groups of eight produce r[0..15] from a[18..33]. */
    for (blocks = 2; blocks > 0; blocks--) {
        acc += src[0] << 2; dst[0] = acc & 0x1ffffffffffffffl; acc >>= 57;
        acc += src[1] << 2; dst[1] = acc & 0x1ffffffffffffffl; acc >>= 57;
        acc += src[2] << 2; dst[2] = acc & 0x1ffffffffffffffl; acc >>= 57;
        acc += src[3] << 2; dst[3] = acc & 0x1ffffffffffffffl; acc >>= 57;
        acc += src[4] << 2; dst[4] = acc & 0x1ffffffffffffffl; acc >>= 57;
        acc += src[5] << 2; dst[5] = acc & 0x1ffffffffffffffl; acc >>= 57;
        acc += src[6] << 2; dst[6] = acc & 0x1ffffffffffffffl; acc >>= 57;
        acc += src[7] << 2; dst[7] = acc & 0x1ffffffffffffffl; acc >>= 57;
        src += 8;
        dst += 8;
    }
    /* a[34] -> r[16], a[35] -> r[17] (top digit is not masked). */
    acc += src[0] << 2; dst[0] = acc & 0x1ffffffffffffffl; acc >>= 57;
    acc += src[1] << 2; dst[1] = acc;
#endif /* WOLFSSL_SP_SMALL */
    XMEMSET(&r[18], 0, sizeof(*r) * 18);
}

/* Reduce the number back to 1024 bits (18 digits) using Montgomery
 * reduction.
 *
 * For each digit a multiple of m is added that clears that digit, the high
 * half is then shifted down, m is conditionally subtracted once and the
 * result is normalized.
 *
 * a   A single precision number to reduce in place (36 digits).
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
static void sp_2048_mont_reduce_18(sp_digit* a, sp_digit* m, sp_digit mp)
{
    int i;
    sp_digit mu;

    for (i=0; i<17; i++) {
        /* Only the low 57 bits of the product are needed. Multiply as
         * unsigned so that the discarded high bits wrap instead of
         * overflowing a signed type. */
        mu = (sp_digit)(((word64)a[i] * (word64)mp) & 0x1ffffffffffffffl);
        sp_2048_mul_add_18(a+i, m, mu);
        a[i+1] += a[i] >> 57;
    }
    /* The top digit only holds 55 bits of the 1024 bit value. */
    mu = (sp_digit)(((word64)a[i] * (word64)mp) & 0x7fffffffffffffl);
    sp_2048_mul_add_18(a+i, m, mu);
    a[i+1] += a[i] >> 57;
    a[i] &= 0x1ffffffffffffffl;

    sp_2048_mont_shift_18(a, a);
    /* Subtract m when the result has a bit set at or above bit 1024. */
    sp_2048_cond_sub_18(a, a, m, 0 - ((a[17] >> 55) > 0));
    sp_2048_norm_18(a);
}

/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * The full product is written to r and then reduced in place.
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_2048_mont_mul_18(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit* m, sp_digit mp)
{
    sp_2048_mul_18(r, a, b);
    sp_2048_mont_reduce_18(r, m, mp);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * The full square is written to r and then reduced in place.
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_2048_mont_sqr_18(sp_digit* r, sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    sp_2048_sqr_18(r, a);
    sp_2048_mont_reduce_18(r, m, mp);
}

/* Multiply a by scalar b into r. (r = a * b)
 *
 * The small build carries as it goes and masks r[0..17] to 57 bits. The
 * unrolled build stores, for each digit, the low 57 bits of its product
 * plus the high bits of the previous product, without masking the sum.
 *
 * r  A single precision integer (19 digits are written).
 * a  A single precision integer (18 digits are read).
 * b  A scalar.
 */
static void sp_2048_mul_d_18(sp_digit* r, const sp_digit* a, const sp_digit b)
{
#ifdef WOLFSSL_SP_SMALL
    int128_t scalar = b;
    int128_t carry = 0;
    int idx = 0;

    while (idx < 18) {
        carry += scalar * a[idx];
        r[idx] = carry & 0x1ffffffffffffffl;
        carry >>= 57;
        idx++;
    }
    r[18] = (sp_digit)carry;
#else
    int128_t scalar = b;
    int128_t p;     /* Products for even steps (and digit 0). */
    int128_t q;     /* Products for odd steps. */
    int i;

    p = scalar * a[0]; r[0] = p & 0x1ffffffffffffffl;
    /* Two unrolled groups of eight: digits 1..8 and 9..16. p and q alternate
     * as "current" and "previous" product. */
    for (i = 0; i < 16; i += 8) {
        q = scalar * a[i+1];
        r[i+1] = (sp_digit)(p >> 57) + (q & 0x1ffffffffffffffl);
        p = scalar * a[i+2];
        r[i+2] = (sp_digit)(q >> 57) + (p & 0x1ffffffffffffffl);
        q = scalar * a[i+3];
        r[i+3] = (sp_digit)(p >> 57) + (q & 0x1ffffffffffffffl);
        p = scalar * a[i+4];
        r[i+4] = (sp_digit)(q >> 57) + (p & 0x1ffffffffffffffl);
        q = scalar * a[i+5];
        r[i+5] = (sp_digit)(p >> 57) + (q & 0x1ffffffffffffffl);
        p = scalar * a[i+6];
        r[i+6] = (sp_digit)(q >> 57) + (p & 0x1ffffffffffffffl);
        q = scalar * a[i+7];
        r[i+7] = (sp_digit)(p >> 57) + (q & 0x1ffffffffffffffl);
        p = scalar * a[i+8];
        r[i+8] = (sp_digit)(q >> 57) + (p & 0x1ffffffffffffffl);
    }
    /* Digit 17 and the final carry into digit 18. */
    q = scalar * a[17];
    r[17] = (sp_digit)(p >> 57) + (q & 0x1ffffffffffffffl);
    r[18] =  (sp_digit)(q >> 57);
#endif /* WOLFSSL_SP_SMALL */
}

/* Conditionally add a and b using the mask m.
 * m is -1 to add and 0 when not. Each digit of b is ANDed with the mask, so
 * the same operations are executed either way.
 *
 * r  A single precision number representing conditional add result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply.
 */
static void sp_2048_cond_add_18(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
#ifdef WOLFSSL_SP_SMALL
    int idx = 0;

    while (idx < 18) {
        r[idx] = a[idx] + (b[idx] & m);
        idx++;
    }
#else
    sp_digit* out = r;
    const sp_digit* x = a;
    const sp_digit* y = b;
    int blocks;

    /* Two unrolled groups of eight digits cover digits 0..15. */
    for (blocks = 2; blocks > 0; blocks--) {
        out[0] = x[0] + (y[0] & m);
        out[1] = x[1] + (y[1] & m);
        out[2] = x[2] + (y[2] & m);
        out[3] = x[3] + (y[3] & m);
        out[4] = x[4] + (y[4] & m);
        out[5] = x[5] + (y[5] & m);
        out[6] = x[6] + (y[6] & m);
        out[7] = x[7] + (y[7] & m);
        out += 8;
        x += 8;
        y += 8;
    }
    /* Digits 16 and 17. */
    out[0] = x[0] + (y[0] & m);
    out[1] = x[1] + (y[1] & m);
#endif /* WOLFSSL_SP_SMALL */
}

/* Divide d in a and put remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * Works on a copy of a, estimating one quotient digit at a time from the top
 * two digits of the running remainder and the top digit of d, subtracting
 * the estimate times d and then adding back a correction.
 *
 * a  Number to be divided (2 * 18 digits are read).
 * d  Number to divide with (18 digits; d[17] is used as the divisor for the
 *    digit estimates and so must be non-zero).
 * m  Multiplier result. Unused.
 * r  Remainder from the division (2 * 18 digits are written).
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_2048_div_18(sp_digit* a, sp_digit* d, sp_digit* m,
        sp_digit* r)
{
    int i;
    int128_t d1;
    sp_digit div, r1;
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    sp_digit* td = NULL;
#else
    sp_digit t1d[36], t2d[36];
#endif
    sp_digit* t1 = NULL;
    sp_digit* t2 = NULL;
    int err = MP_OKAY;

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    td = XMALLOC(sizeof(sp_digit) * 4 * 18, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    if (td != NULL) {
        t1 = td;
        t2 = td + 2 * 18;
    }
    else
        err = MEMORY_E;
#else
    t1 = t1d;
    t2 = t2d;
#endif

    (void)m;

    if (err == MP_OKAY) {
        div = d[17];
        XMEMCPY(t1, a, sizeof(*t1) * 2 * 18);
        for (i=17; i>=0; i--) {
            /* Normalize the top two digits and estimate a quotient digit. */
            t1[18 + i] += t1[18 + i - 1] >> 57;
            t1[18 + i - 1] &= 0x1ffffffffffffffl;
            d1 = t1[18 + i];
            d1 <<= 57;
            d1 += t1[18 + i - 1];
            r1 = (sp_digit)(d1 / div);

            /* Subtract estimate * d from the running remainder. */
            sp_2048_mul_d_18(t2, d, r1);
            sp_2048_sub_18(&t1[i], &t1[i], t2);
            t1[18 + i] -= t2[18];
            t1[18 + i] += t1[18 + i - 1] >> 57;
            t1[18 + i - 1] &= 0x1ffffffffffffffl;
            /* Add back a multiple of d derived from the top two digits. */
            r1 = (((-t1[18 + i]) << 57) - t1[18 + i - 1]) / div;
            r1++;
            sp_2048_mul_d_18(t2, d, r1);
            sp_2048_add_18(&t1[i], &t1[i], t2);
            t1[18 + i] += t1[18 + i - 1] >> 57;
            t1[18 + i - 1] &= 0x1ffffffffffffffl;
        }
        /* Last estimate uses only the top digit of the low half. */
        t1[18 - 1] += t1[18 - 2] >> 57;
        t1[18 - 2] &= 0x1ffffffffffffffl;
        d1 = t1[18 - 1];
        r1 = (sp_digit)(d1 / div);

        sp_2048_mul_d_18(t2, d, r1);
        sp_2048_sub_18(t1, t1, t2);
        XMEMCPY(r, t1, sizeof(*r) * 2 * 18);
        /* Propagate carries/borrows through all of digits 0..16 so that the
         * sign of r[17] reflects the sign of the whole value. */
        for (i=0; i<17; i++) {
            r[i+1] += r[i] >> 57;
            r[i] &= 0x1ffffffffffffffl;
        }
        /* A negative remainder is brought back into range by adding d. */
        sp_2048_cond_add_18(r, r, d, 0 - (r[17] < 0));
    }

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

/* Compute the remainder of a divided by m. (r = a mod m)
 *
 * Thin wrapper around sp_2048_div_18() that discards the quotient.
 *
 * r  Receives the reduced value.
 * a  Number to reduce.
 * m  Modulus to reduce by.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_2048_mod_18(sp_digit* r, sp_digit* a, sp_digit* m)
{
    /* Only the remainder is wanted, so no quotient buffer is passed. */
    int ret = sp_2048_div_18(a, m, NULL, r);

    return ret;
}

/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Three implementations are selected at compile time:
 *  - WOLFSSL_SP_SMALL: bit-at-a-time using three heap allocated temporaries.
 *  - WOLFSSL_SP_CACHE_RESISTANT: the same bit-at-a-time loop with the
 *    temporaries on the stack unless WOLFSSL_SMALL_STACK is defined.
 *  - otherwise: 5-bit fixed window using a table of 32 entries.
 *
 * The result is written as 2 * 18 digits, so r must be at least that large.
 *
 * r        A single precision number that is the result of the operation.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When non-zero, a is reduced modulo m before it is used.
 * returns 0 on success and MEMORY_E on dynamic memory allocation failure.
 */
static int sp_2048_mod_exp_18(sp_digit* r, sp_digit* a, sp_digit* e, int bits,
    sp_digit* m, int reduceA)
{
#ifdef WOLFSSL_SP_SMALL
    sp_digit* td;
    sp_digit* t[3];
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

    td = (sp_digit*)XMALLOC(sizeof(*td) * 3 * 18 * 2, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        XMEMSET(td, 0, sizeof(*td) * 3 * 18 * 2);

        /* Three temporaries of 2 * 18 digits each; norm shares t[0]. */
        norm = t[0] = td;
        t[1] = &td[18 * 2];
        t[2] = &td[2 * 18 * 2];

        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_18(norm, m);

        if (reduceA)
            err = sp_2048_mod_18(t[1], a, m);
        else
            XMEMCPY(t[1], a, sizeof(sp_digit) * 18);
    }
    if (err == MP_OKAY) {
        /* t[1] = (a * norm) mod m */
        sp_2048_mul_18(t[1], t[1], norm);
        err = sp_2048_mod_18(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        /* Start at the top exponent digit with its used bits left aligned
         * in a 57 bit field. */
        i = bits / 57;
        c = bits % 57;
        n = e[i--] << (57 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1)
                    break;

                n = e[i--];
                c = 57;
            }

            /* Next exponent bit, most significant first. */
            y = (n >> 56) & 1;
            n <<= 1;

            /* t[y^1] = t[0] * t[1] */
            sp_2048_mont_mul_18(t[y^1], t[0], t[1], m, mp);

            /* t[y] = t[y]^2 via t[2]; the address of t[y] is built with
             * addr_mask rather than by indexing with y. */
            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                    sizeof(*t[2]) * 18 * 2);
            sp_2048_mont_sqr_18(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                    sizeof(*t[2]) * 18 * 2);
        }

        /* Final reduction; subtract m when the result is not below m. */
        sp_2048_mont_reduce_18(t[0], m, mp);
        n = sp_2048_cmp_18(t[0], m);
        sp_2048_cond_sub_18(t[0], t[0], m, (n < 0) - 1);
        XMEMCPY(r, t[0], sizeof(*r) * 18 * 2);

    }

    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);

    return err;
#elif defined(WOLFSSL_SP_CACHE_RESISTANT)
#ifndef WOLFSSL_SMALL_STACK
    sp_digit t[3][36];
#else
    sp_digit* td;
    sp_digit* t[3];
#endif
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    td = (sp_digit*)XMALLOC(sizeof(*td) * 3 * 18 * 2, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        t[0] = td;
        t[1] = &td[18 * 2];
        t[2] = &td[2 * 18 * 2];
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_18(norm, m);

        if (reduceA) {
            err = sp_2048_mod_18(t[1], a, m);
            if (err == MP_OKAY) {
                sp_2048_mul_18(t[1], t[1], norm);
                err = sp_2048_mod_18(t[1], t[1], m);
            }
        }
        else {
            sp_2048_mul_18(t[1], a, norm);
            err = sp_2048_mod_18(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        i = bits / 57;
        c = bits % 57;
        n = e[i--] << (57 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1)
                    break;

                n = e[i--];
                c = 57;
            }

            /* Next exponent bit, most significant first. */
            y = (n >> 56) & 1;
            n <<= 1;

            /* t[y^1] = t[0] * t[1] */
            sp_2048_mont_mul_18(t[y^1], t[0], t[1], m, mp);

            /* Copy sizes are written in terms of the element type: with
             * WOLFSSL_SMALL_STACK t[] holds pointers, so sizeof(t[2]) would
             * be the size of a pointer and not of the 2 * 18 digit buffer. */
            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                    sizeof(*t[2]) * 18 * 2);
            sp_2048_mont_sqr_18(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                    sizeof(*t[2]) * 18 * 2);
        }

        /* Final reduction; subtract m when the result is not below m. */
        sp_2048_mont_reduce_18(t[0], m, mp);
        n = sp_2048_cmp_18(t[0], m);
        sp_2048_cond_sub_18(t[0], t[0], m, (n < 0) - 1);
        XMEMCPY(r, t[0], sizeof(*r) * 18 * 2);
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#else
#ifndef WOLFSSL_SMALL_STACK
    sp_digit t[32][36];
#else
    sp_digit* t[32];
    sp_digit* td;
#endif
    sp_digit* norm;
    sp_digit rt[36];
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 36, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        for (i=0; i<32; i++)
            t[i] = td + i * 36;
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_18(norm, m);

        if (reduceA) {
            err = sp_2048_mod_18(t[1], a, m);
            if (err == MP_OKAY) {
                sp_2048_mul_18(t[1], t[1], norm);
                err = sp_2048_mod_18(t[1], t[1], m);
            }
        }
        else {
            sp_2048_mul_18(t[1], a, norm);
            err = sp_2048_mod_18(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Fill the table: t[k] is the k-th power of t[1], built from
         * squarings (even k) and multiplications (odd k). */
        sp_2048_mont_sqr_18(t[ 2], t[ 1], m, mp);
        sp_2048_mont_mul_18(t[ 3], t[ 2], t[ 1], m, mp);
        sp_2048_mont_sqr_18(t[ 4], t[ 2], m, mp);
        sp_2048_mont_mul_18(t[ 5], t[ 3], t[ 2], m, mp);
        sp_2048_mont_sqr_18(t[ 6], t[ 3], m, mp);
        sp_2048_mont_mul_18(t[ 7], t[ 4], t[ 3], m, mp);
        sp_2048_mont_sqr_18(t[ 8], t[ 4], m, mp);
        sp_2048_mont_mul_18(t[ 9], t[ 5], t[ 4], m, mp);
        sp_2048_mont_sqr_18(t[10], t[ 5], m, mp);
        sp_2048_mont_mul_18(t[11], t[ 6], t[ 5], m, mp);
        sp_2048_mont_sqr_18(t[12], t[ 6], m, mp);
        sp_2048_mont_mul_18(t[13], t[ 7], t[ 6], m, mp);
        sp_2048_mont_sqr_18(t[14], t[ 7], m, mp);
        sp_2048_mont_mul_18(t[15], t[ 8], t[ 7], m, mp);
        sp_2048_mont_sqr_18(t[16], t[ 8], m, mp);
        sp_2048_mont_mul_18(t[17], t[ 9], t[ 8], m, mp);
        sp_2048_mont_sqr_18(t[18], t[ 9], m, mp);
        sp_2048_mont_mul_18(t[19], t[10], t[ 9], m, mp);
        sp_2048_mont_sqr_18(t[20], t[10], m, mp);
        sp_2048_mont_mul_18(t[21], t[11], t[10], m, mp);
        sp_2048_mont_sqr_18(t[22], t[11], m, mp);
        sp_2048_mont_mul_18(t[23], t[12], t[11], m, mp);
        sp_2048_mont_sqr_18(t[24], t[12], m, mp);
        sp_2048_mont_mul_18(t[25], t[13], t[12], m, mp);
        sp_2048_mont_sqr_18(t[26], t[13], m, mp);
        sp_2048_mont_mul_18(t[27], t[14], t[13], m, mp);
        sp_2048_mont_sqr_18(t[28], t[14], m, mp);
        sp_2048_mont_mul_18(t[29], t[15], t[14], m, mp);
        sp_2048_mont_sqr_18(t[30], t[15], m, mp);
        sp_2048_mont_mul_18(t[31], t[16], t[15], m, mp);

        /* Round the bit count up to a whole number of 5 bit windows and
         * left align the top exponent digit in the 64 bit word n. */
        bits = ((bits + 4) / 5) * 5;
        i = ((bits + 56) / 57) - 1;
        c = bits % 57;
        if (c == 0)
            c = 57;
        if (i < 18)
            n = e[i--] << (64 - c);
        else {
            n = 0;
            i--;
        }
        if (c < 5) {
            n |= e[i--] << (7 - c);
            c += 57;
        }
        /* n is signed: mask after shifting so that a set top bit cannot
         * produce a negative table index. */
        y = (int)((n >> 59) & 0x1f);
        n <<= 5;
        c -= 5;
        XMEMCPY(rt, t[y], sizeof(rt));
        for (; i>=0 || c>=5; ) {
            if (c < 5) {
                /* Refill n with the next exponent digit. */
                n |= e[i--] << (7 - c);
                c += 57;
            }
            y = (n >> 59) & 0x1f;
            n <<= 5;
            c -= 5;

            /* Five squarings followed by a multiply by the table entry. */
            sp_2048_mont_sqr_18(rt, rt, m, mp);
            sp_2048_mont_sqr_18(rt, rt, m, mp);
            sp_2048_mont_sqr_18(rt, rt, m, mp);
            sp_2048_mont_sqr_18(rt, rt, m, mp);
            sp_2048_mont_sqr_18(rt, rt, m, mp);

            sp_2048_mont_mul_18(rt, rt, t[y], m, mp);
        }

        /* Final reduction; subtract m when the result is not below m. */
        sp_2048_mont_reduce_18(rt, m, mp);
        n = sp_2048_cmp_18(rt, m);
        sp_2048_cond_sub_18(rt, rt, m, (n < 0) - 1);
        XMEMCPY(r, rt, sizeof(rt));
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#endif
}

#endif /* !SP_RSA_PRIVATE_EXP_D && WOLFSSL_HAVE_SP_RSA */

/* Calculate the Montgomery normalizer. (r = 2^n mod m)
 * n is the number of bits to reduce by. As m must be 2048 bits, a single
 * subtraction is all that is needed.
 *
 * r  A single precision number that receives the result.
 * m  A single precision number that is the modulus.
 */
static void sp_2048_mont_norm_36(sp_digit* r, sp_digit* m)
{
    int k;

    /* Start with r = 2^n - 1: the low 35 digits are all ones... */
#ifdef WOLFSSL_SP_SMALL
    for (k = 34; k >= 0; k--)
        r[k] = 0x1ffffffffffffffl;
#else
    for (k = 0; k < 32; k += 8) {
        sp_digit* out = r + k;

        out[0] = 0x1ffffffffffffffl;
        out[1] = 0x1ffffffffffffffl;
        out[2] = 0x1ffffffffffffffl;
        out[3] = 0x1ffffffffffffffl;
        out[4] = 0x1ffffffffffffffl;
        out[5] = 0x1ffffffffffffffl;
        out[6] = 0x1ffffffffffffffl;
        out[7] = 0x1ffffffffffffffl;
    }
    r[32] = 0x1ffffffffffffffl;
    r[33] = 0x1ffffffffffffffl;
    r[34] = 0x1ffffffffffffffl;
#endif
    /* ...and the top digit holds the remaining 53 bits. */
    r[35] = 0x1fffffffffffffl;

    /* r = (2^n - 1) mod m */
    sp_2048_sub_36(r, r, m);

    /* One more gives r = 2^n mod m */
    r[0] += 1;
}

/* Constant time comparison of two 36 digit numbers.
 *
 * Digits are visited from most to least significant. The first non-zero
 * digit difference is kept and all later differences are masked away.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static sp_digit sp_2048_cmp_36(const sp_digit* a, const sp_digit* b)
{
    sp_digit res = 0;
    int k;

#ifdef WOLFSSL_SP_SMALL
    k = 35;
    do {
        /* Mask is all ones only while no difference has been recorded. */
        res |= (a[k] - b[k]) & (0 - !res);
    } while (--k >= 0);
#else
    /* Top four digits, then four groups of eight working downwards. */
    res |= (a[35] - b[35]) & (0 - !res);
    res |= (a[34] - b[34]) & (0 - !res);
    res |= (a[33] - b[33]) & (0 - !res);
    res |= (a[32] - b[32]) & (0 - !res);
    for (k = 24; k >= 0; k -= 8) {
        const sp_digit* x = a + k;
        const sp_digit* y = b + k;

        res |= (x[7] - y[7]) & (0 - !res);
        res |= (x[6] - y[6]) & (0 - !res);
        res |= (x[5] - y[5]) & (0 - !res);
        res |= (x[4] - y[4]) & (0 - !res);
        res |= (x[3] - y[3]) & (0 - !res);
        res |= (x[2] - y[2]) & (0 - !res);
        res |= (x[1] - y[1]) & (0 - !res);
        res |= (x[0] - y[0]) & (0 - !res);
    }
#endif /* WOLFSSL_SP_SMALL */

    return res;
}

/* Masked subtraction of two 36 digit numbers. (r = a - (b & m))
 *
 * Every digit of b is ANDed with m before it is subtracted, so m = -1
 * subtracts b and m = 0 leaves a unchanged. No borrow is propagated between
 * digits.
 *
 * r  Receives the 36 digit result.
 * a  Number to subtract from.
 * b  Number that is subtracted when selected by the mask.
 * m  Mask applied to every digit of b (-1 to subtract, 0 to skip).
 */
static void sp_2048_cond_sub_36(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int k;

#ifdef WOLFSSL_SP_SMALL
    /* Compact form: one digit per iteration. */
    k = 0;
    while (k < 36) {
        r[k] = a[k] - (b[k] & m);
        k++;
    }
#else
    /* Four groups of eight digits, followed by the remaining four digits. */
    for (k = 0; k < 32; k += 8) {
        sp_digit* out = r + k;
        const sp_digit* x = a + k;
        const sp_digit* y = b + k;

        out[0] = x[0] - (y[0] & m);
        out[1] = x[1] - (y[1] & m);
        out[2] = x[2] - (y[2] & m);
        out[3] = x[3] - (y[3] & m);
        out[4] = x[4] - (y[4] & m);
        out[5] = x[5] - (y[5] & m);
        out[6] = x[6] - (y[6] & m);
        out[7] = x[7] - (y[7] & m);
    }
    r[32] = a[32] - (b[32] & m);
    r[33] = a[33] - (b[33] & m);
    r[34] = a[34] - (b[34] & m);
    r[35] = a[35] - (b[35] & m);
#endif /* WOLFSSL_SP_SMALL */
}

/* Multiply a by the scalar b and accumulate into r. (r += a * b)
 *
 * 36 digits of a are consumed and 37 digits of r are updated: the last
 * carry is added into r[36].
 *
 * r  A single precision integer that is accumulated into.
 * a  A single precision integer.
 * b  A scalar.
 */
static void sp_2048_mul_add_36(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
    const int128_t scalar = b;
    int k;
#ifdef WOLFSSL_SP_SMALL
    int128_t acc = 0;

    /* Running accumulator: product plus existing digit plus carry. */
    for (k = 0; k < 36; k++) {
        acc += (scalar * a[k]) + r[k];
        r[k] = acc & 0x1ffffffffffffffl;
        acc >>= 57;
    }
    r[36] += acc;
#else
    /* Two products are live at any time: the previous one supplies the
     * carry (its bits above 57) and the current one the low 57 bits. */
    int128_t lo;
    int128_t hi;

    lo = scalar * a[0];
    r[0] += lo & 0x1ffffffffffffffl;
    for (k = 0; k < 32; k += 8) {
        hi = scalar * a[k+1];
        r[k+1] += (lo >> 57) + (hi & 0x1ffffffffffffffl);
        lo = scalar * a[k+2];
        r[k+2] += (hi >> 57) + (lo & 0x1ffffffffffffffl);
        hi = scalar * a[k+3];
        r[k+3] += (lo >> 57) + (hi & 0x1ffffffffffffffl);
        lo = scalar * a[k+4];
        r[k+4] += (hi >> 57) + (lo & 0x1ffffffffffffffl);
        hi = scalar * a[k+5];
        r[k+5] += (lo >> 57) + (hi & 0x1ffffffffffffffl);
        lo = scalar * a[k+6];
        r[k+6] += (hi >> 57) + (lo & 0x1ffffffffffffffl);
        hi = scalar * a[k+7];
        r[k+7] += (lo >> 57) + (hi & 0x1ffffffffffffffl);
        lo = scalar * a[k+8];
        r[k+8] += (hi >> 57) + (lo & 0x1ffffffffffffffl);
    }
    hi = scalar * a[33];
    r[33] += (lo >> 57) + (hi & 0x1ffffffffffffffl);
    lo = scalar * a[34];
    r[34] += (hi >> 57) + (lo & 0x1ffffffffffffffl);
    hi = scalar * a[35];
    r[35] += (lo >> 57) + (hi & 0x1ffffffffffffffl);
    r[36] +=  hi >> 57;
#endif /* WOLFSSL_SP_SMALL */
}

/* Normalize the digits so that each of the low 35 holds 57 bits.
 *
 * The bits above 57 of each digit are carried into the next digit up.
 * The top digit (a[35]) receives the last carry and is not masked.
 *
 * a  Array of sp_digit to normalize in place.
 */
static void sp_2048_norm_36(sp_digit* a)
{
    int k;

#ifdef WOLFSSL_SP_SMALL
    k = 0;
    while (k < 35) {
        a[k+1] += a[k] >> 57;
        a[k] &= 0x1ffffffffffffffl;
        k++;
    }
#else
    for (k = 0; k < 32; k += 8) {
        sp_digit* d = a + k;

        d[1] += d[0] >> 57;
        d[0] &= 0x1ffffffffffffffl;
        d[2] += d[1] >> 57;
        d[1] &= 0x1ffffffffffffffl;
        d[3] += d[2] >> 57;
        d[2] &= 0x1ffffffffffffffl;
        d[4] += d[3] >> 57;
        d[3] &= 0x1ffffffffffffffl;
        d[5] += d[4] >> 57;
        d[4] &= 0x1ffffffffffffffl;
        d[6] += d[5] >> 57;
        d[5] &= 0x1ffffffffffffffl;
        d[7] += d[6] >> 57;
        d[6] &= 0x1ffffffffffffffl;
        d[8] += d[7] >> 57;
        d[7] &= 0x1ffffffffffffffl;
        /* Ninth step overlaps with the first step of the next group. */
        d[9] += d[8] >> 57;
        d[8] &= 0x1ffffffffffffffl;
    }
    a[33] += a[32] >> 57;
    a[32] &= 0x1ffffffffffffffl;
    a[34] += a[33] >> 57;
    a[33] &= 0x1ffffffffffffffl;
    a[35] += a[34] >> 57;
    a[34] &= 0x1ffffffffffffffl;
#endif
}

/* Shift the result in the high 2048 bits down to the bottom.
 *
 * The value is taken from bit 53 of a[35] upwards (35 * 57 + 53 = 2048) and
 * re-packed into 57 bit digits r[0..35]. Digits r[36..71] are then zeroed.
 * Each output digit is written only after the input digits it depends on
 * have been read (the caller in this file passes r == a).
 *
 * r  A single precision number (72 digits are written).
 * a  A single precision number (72 digits are read).
 */
static void sp_2048_mont_shift_36(sp_digit* r, const sp_digit* a)
{
#ifdef WOLFSSL_SP_SMALL
    int i;
    sp_digit n, s;

    /* n: bits of a[35] above bit 53; s: next source digit plus carry. */
    s = a[36];
    n = a[35] >> 53;
    for (i = 0; i < 35; i++) {
        /* Low 57 bits of s sit 4 bits (57 - 53) up in the output digit. */
        n += (s & 0x1ffffffffffffffl) << 4;
        r[i] = n & 0x1ffffffffffffffl;
        n >>= 57;
        /* Fetch the next source digit, adding the carry out of s. */
        s = a[37 + i] + (s >> 57);
    }
    /* Top output digit takes whatever is left, unmasked. */
    n += s << 4;
    r[35] = n;
#else
    sp_digit n, s;
    int i;

    /* Same sequence as the small variant, eight output digits per pass. */
    s = a[36]; n = a[35] >> 53;
    for (i = 0; i < 32; i += 8) {
        n += (s & 0x1ffffffffffffffl) << 4; r[i+0] = n & 0x1ffffffffffffffl;
        n >>= 57; s = a[i+37] + (s >> 57);
        n += (s & 0x1ffffffffffffffl) << 4; r[i+1] = n & 0x1ffffffffffffffl;
        n >>= 57; s = a[i+38] + (s >> 57);
        n += (s & 0x1ffffffffffffffl) << 4; r[i+2] = n & 0x1ffffffffffffffl;
        n >>= 57; s = a[i+39] + (s >> 57);
        n += (s & 0x1ffffffffffffffl) << 4; r[i+3] = n & 0x1ffffffffffffffl;
        n >>= 57; s = a[i+40] + (s >> 57);
        n += (s & 0x1ffffffffffffffl) << 4; r[i+4] = n & 0x1ffffffffffffffl;
        n >>= 57; s = a[i+41] + (s >> 57);
        n += (s & 0x1ffffffffffffffl) << 4; r[i+5] = n & 0x1ffffffffffffffl;
        n >>= 57; s = a[i+42] + (s >> 57);
        n += (s & 0x1ffffffffffffffl) << 4; r[i+6] = n & 0x1ffffffffffffffl;
        n >>= 57; s = a[i+43] + (s >> 57);
        n += (s & 0x1ffffffffffffffl) << 4; r[i+7] = n & 0x1ffffffffffffffl;
        n >>= 57; s = a[i+44] + (s >> 57);
    }
    /* Remaining output digits 32..34, then the unmasked top digit. */
    n += (s & 0x1ffffffffffffffl) << 4; r[32] = n & 0x1ffffffffffffffl;
    n >>= 57; s = a[69] + (s >> 57);
    n += (s & 0x1ffffffffffffffl) << 4; r[33] = n & 0x1ffffffffffffffl;
    n >>= 57; s = a[70] + (s >> 57);
    n += (s & 0x1ffffffffffffffl) << 4; r[34] = n & 0x1ffffffffffffffl;
    n >>= 57; s = a[71] + (s >> 57);
    n += s << 4;              r[35] = n;
#endif /* WOLFSSL_SP_SMALL */
    /* Clear the upper half of the result. */
    XMEMSET(&r[36], 0, sizeof(*r) * 36);
}

/* Montgomery reduce a double width number back to 2048 bits, in place.
 *
 * For each of the 36 low digits a multiple of m is added at that digit
 * position and the carry is pushed into the next digit. The upper part is
 * then shifted down, m is conditionally subtracted and the digits are
 * normalized.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
static void sp_2048_mont_reduce_36(sp_digit* a, sp_digit* m, sp_digit mp)
{
    sp_digit* cur = a;
    sp_digit* const top = a + 35;
    sp_digit factor;

    if (mp == 1) {
        /* Multiplier of one: the digit itself is the factor. */
        for (; cur < top; cur++) {
            factor = cur[0] & 0x1ffffffffffffffl;
            sp_2048_mul_add_36(cur, m, factor);
            cur[1] += cur[0] >> 57;
        }
        /* Top digit only carries 53 bits. */
        factor = cur[0] & 0x1fffffffffffffl;
    }
    else {
        for (; cur < top; cur++) {
            factor = (cur[0] * mp) & 0x1ffffffffffffffl;
            sp_2048_mul_add_36(cur, m, factor);
            cur[1] += cur[0] >> 57;
        }
        /* Top digit only carries 53 bits. */
        factor = (cur[0] * mp) & 0x1fffffffffffffl;
    }
    /* Last digit: same step, and this digit is also masked to 57 bits. */
    sp_2048_mul_add_36(cur, m, factor);
    cur[1] += cur[0] >> 57;
    cur[0] &= 0x1ffffffffffffffl;

    sp_2048_mont_shift_36(a, a);
    /* Subtract m when any bit above bit 53 of the top digit is set. */
    sp_2048_cond_sub_36(a, a, m, 0 - ((a[35] >> 53) > 0));
    sp_2048_norm_36(a);
}

/* Montgomery multiplication of two numbers in Montgomery form.
 * (r = a * b mod m)
 *
 * The full product is written to r and then Montgomery reduced in place.
 *
 * r   Result of the multiplication.
 * a   First number to multiply, in Montgomery form.
 * b   Second number to multiply, in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_2048_mont_mul_36(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit* m, sp_digit mp)
{
    /* Double width product first... */
    sp_2048_mul_36(r, a, b);
    /* ...then bring it back into range. */
    sp_2048_mont_reduce_36(r, m, mp);
}

/* Montgomery squaring of a number in Montgomery form. (r = a * a mod m)
 *
 * The full square is written to r and then Montgomery reduced in place.
 *
 * r   Result of the squaring.
 * a   Number to square, in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_2048_mont_sqr_36(sp_digit* r, sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    /* Double width square first... */
    sp_2048_sqr_36(r, a);
    /* ...then bring it back into range. */
    sp_2048_mont_reduce_36(r, m, mp);
}

/* Multiply a by the scalar b and store the product. (r = a * b)
 *
 * 36 digits of a are consumed and 37 digits of r are written: the last
 * carry is stored in r[36].
 *
 * r  A single precision integer that receives the product.
 * a  A single precision integer.
 * b  A scalar.
 */
static void sp_2048_mul_d_36(sp_digit* r, const sp_digit* a, const sp_digit b)
{
    const int128_t scalar = b;
    int k;
#ifdef WOLFSSL_SP_SMALL
    int128_t acc = 0;

    /* Running accumulator: product plus carry from the previous digit. */
    for (k = 0; k < 36; k++) {
        acc += scalar * a[k];
        r[k] = acc & 0x1ffffffffffffffl;
        acc >>= 57;
    }
    r[36] = (sp_digit)acc;
#else
    /* Two products are live at any time: the previous one supplies the
     * carry (its bits above 57) and the current one the low 57 bits. */
    int128_t lo;
    int128_t hi;

    lo = scalar * a[0];
    r[0] = lo & 0x1ffffffffffffffl;
    for (k = 0; k < 32; k += 8) {
        hi = scalar * a[k+1];
        r[k+1] = (sp_digit)(lo >> 57) + (hi & 0x1ffffffffffffffl);
        lo = scalar * a[k+2];
        r[k+2] = (sp_digit)(hi >> 57) + (lo & 0x1ffffffffffffffl);
        hi = scalar * a[k+3];
        r[k+3] = (sp_digit)(lo >> 57) + (hi & 0x1ffffffffffffffl);
        lo = scalar * a[k+4];
        r[k+4] = (sp_digit)(hi >> 57) + (lo & 0x1ffffffffffffffl);
        hi = scalar * a[k+5];
        r[k+5] = (sp_digit)(lo >> 57) + (hi & 0x1ffffffffffffffl);
        lo = scalar * a[k+6];
        r[k+6] = (sp_digit)(hi >> 57) + (lo & 0x1ffffffffffffffl);
        hi = scalar * a[k+7];
        r[k+7] = (sp_digit)(lo >> 57) + (hi & 0x1ffffffffffffffl);
        lo = scalar * a[k+8];
        r[k+8] = (sp_digit)(hi >> 57) + (lo & 0x1ffffffffffffffl);
    }
    hi = scalar * a[33];
    r[33] = (sp_digit)(lo >> 57) + (hi & 0x1ffffffffffffffl);
    lo = scalar * a[34];
    r[34] = (sp_digit)(hi >> 57) + (lo & 0x1ffffffffffffffl);
    hi = scalar * a[35];
    r[35] = (sp_digit)(lo >> 57) + (hi & 0x1ffffffffffffffl);
    r[36] =  (sp_digit)(hi >> 57);
#endif /* WOLFSSL_SP_SMALL */
}

/* Masked addition of two 36 digit numbers. (r = a + (b & m))
 *
 * Every digit of b is ANDed with m before it is added, so m = -1 adds b and
 * m = 0 leaves a unchanged. No carry is propagated between digits.
 *
 * r  Receives the 36 digit result.
 * a  First operand.
 * b  Operand that is added when selected by the mask.
 * m  Mask applied to every digit of b (-1 to add, 0 to skip).
 */
static void sp_2048_cond_add_36(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int k;

#ifdef WOLFSSL_SP_SMALL
    /* Compact form: one digit per iteration. */
    k = 0;
    while (k < 36) {
        r[k] = a[k] + (b[k] & m);
        k++;
    }
#else
    /* Four groups of eight digits, followed by the remaining four digits. */
    for (k = 0; k < 32; k += 8) {
        sp_digit* out = r + k;
        const sp_digit* x = a + k;
        const sp_digit* y = b + k;

        out[0] = x[0] + (y[0] & m);
        out[1] = x[1] + (y[1] & m);
        out[2] = x[2] + (y[2] & m);
        out[3] = x[3] + (y[3] & m);
        out[4] = x[4] + (y[4] & m);
        out[5] = x[5] + (y[5] & m);
        out[6] = x[6] + (y[6] & m);
        out[7] = x[7] + (y[7] & m);
    }
    r[32] = a[32] + (b[32] & m);
    r[33] = a[33] + (b[33] & m);
    r[34] = a[34] + (b[34] & m);
    r[35] = a[35] + (b[35] & m);
#endif /* WOLFSSL_SP_SMALL */
}

/* Divide d in a and put remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * The dividend is read as 2 * 36 digits and the remainder buffer is written
 * as 2 * 36 digits, so both a and r must be at least that large.
 * When WOLFSSL_SP_SMALL or WOLFSSL_SMALL_STACK is defined the working buffers
 * are taken from the heap, otherwise they live on the stack.
 *
 * a  Number to be divided.
 * d  Number to divide with.
 * m  Multiplier result (unused - may be NULL).
 * r  Remainder from the division.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_2048_div_36(sp_digit* a, sp_digit* d, sp_digit* m,
        sp_digit* r)
{
    int i;
    int128_t d1;
    sp_digit div, r1;
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    sp_digit* td;
#else
    sp_digit t1d[72], t2d[72];
#endif
    sp_digit* t1;
    sp_digit* t2;
    int err = MP_OKAY;

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    /* One allocation of 4 * 36 digits split into two halves: t1 and t2. */
    td = XMALLOC(sizeof(sp_digit) * 4 * 36, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    if (td != NULL) {
        t1 = td;
        t2 = td + 2 * 36;
    }
    else
        err = MEMORY_E;
#else
    t1 = t1d;
    t2 = t2d;
#endif

    /* The quotient is never produced. */
    (void)m;

    if (err == MP_OKAY) {
        /* Top digit of the divisor is used for every quotient estimate. */
        div = d[35];
        /* t1 is the working copy of the dividend (2 * 36 digits). */
        XMEMCPY(t1, a, sizeof(*t1) * 2 * 36);
        for (i=35; i>=0; i--) {
            /* Carry the digit below into the current top digit. */
            t1[36 + i] += t1[36 + i - 1] >> 57;
            t1[36 + i - 1] &= 0x1ffffffffffffffl;
            /* Estimate a quotient digit from the top two digits. */
            d1 = t1[36 + i];
            d1 <<= 57;
            d1 += t1[36 + i - 1];
            r1 = (sp_digit)(d1 / div);

            /* Subtract d * r1 (37 digits in t2) from the current window. */
            sp_2048_mul_d_36(t2, d, r1);
            sp_2048_sub_36(&t1[i], &t1[i], t2);
            t1[36 + i] -= t2[36];
            t1[36 + i] += t1[36 + i - 1] >> 57;
            t1[36 + i - 1] &= 0x1ffffffffffffffl;
            /* Correction: derive a second multiple from the negated top two
             * digits, add one, and add d times that value back in. */
            r1 = (((-t1[36 + i]) << 57) - t1[36 + i - 1]) / div;
            r1++;
            sp_2048_mul_d_36(t2, d, r1);
            sp_2048_add_36(&t1[i], &t1[i], t2);
            t1[36 + i] += t1[36 + i - 1] >> 57;
            t1[36 + i - 1] &= 0x1ffffffffffffffl;
        }
        /* Final step using only the top digit of the low half. */
        t1[36 - 1] += t1[36 - 2] >> 57;
        t1[36 - 2] &= 0x1ffffffffffffffl;
        d1 = t1[36 - 1];
        r1 = (sp_digit)(d1 / div);

        sp_2048_mul_d_36(t2, d, r1);
        sp_2048_sub_36(t1, t1, t2);
        /* Copy out the whole working buffer (2 * 36 digits). */
        XMEMCPY(r, t1, sizeof(*r) * 2 * 36);
        /* Propagate carries through digits 0..33 into digit 34. */
        for (i=0; i<34; i++) {
            r[i+1] += r[i] >> 57;
            r[i] &= 0x1ffffffffffffffl;
        }
        /* Mask is all ones when the top digit is negative: add d back. */
        sp_2048_cond_add_36(r, r, d, 0 - (r[35] < 0));
    }

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

/* Compute the remainder of a divided by m. (r = a mod m)
 *
 * Thin wrapper around sp_2048_div_36() that discards the quotient.
 *
 * r  Receives the reduced value.
 * a  Number to reduce.
 * m  Modulus to reduce by.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_2048_mod_36(sp_digit* r, sp_digit* a, sp_digit* m)
{
    /* Only the remainder is wanted, so no quotient buffer is passed. */
    int ret = sp_2048_div_36(a, m, NULL, r);

    return ret;
}

#if defined(SP_RSA_PRIVATE_EXP_D) || defined(WOLFSSL_HAVE_SP_DH)
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * r     A single precision number that is the result of the operation.
 * a     A single precision number being exponentiated.
 * e     A single precision number that is the exponent.
 * bits  The number of bits in the exponent.
 * m     A single precision number that is the modulus.
 * returns 0 on success and MEMORY_E on dynamic memory allocation failure.
 */
static int sp_2048_mod_exp_36(sp_digit* r, sp_digit* a, sp_digit* e, int bits,
    sp_digit* m, int reduceA)
{
#ifdef WOLFSSL_SP_SMALL
    sp_digit* td;
    sp_digit* t[3];
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

    td = (sp_digit*)XMALLOC(sizeof(*td) * 3 * 36 * 2, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        XMEMSET(td, 0, sizeof(*td) * 3 * 36 * 2);

        norm = t[0] = td;
        t[1] = &td[36 * 2];
        t[2] = &td[2 * 36 * 2];

        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_36(norm, m);

        if (reduceA)
            err = sp_2048_mod_36(t[1], a, m);
        else
            XMEMCPY(t[1], a, sizeof(sp_digit) * 36);
    }
    if (err == MP_OKAY) {
        sp_2048_mul_36(t[1], t[1], norm);
        err = sp_2048_mod_36(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        i = bits / 57;
        c = bits % 57;
        n = e[i--] << (57 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1)
                    break;

                n = e[i--];
                c = 57;
            }

            y = (n >> 56) & 1;
            n <<= 1;

            sp_2048_mont_mul_36(t[y^1], t[0], t[1], m, mp);

            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                    sizeof(*t[2]) * 36 * 2);
            sp_2048_mont_sqr_36(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                    sizeof(*t[2]) * 36 * 2);
        }

        sp_2048_mont_reduce_36(t[0], m, mp);
        n = sp_2048_cmp_36(t[0], m);
        sp_2048_cond_sub_36(t[0], t[0], m, (n < 0) - 1);
        XMEMCPY(r, t[0], sizeof(*r) * 36 * 2);

    }

    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);

    return err;
#elif defined(WOLFSSL_SP_CACHE_RESISTANT)
#ifndef WOLFSSL_SMALL_STACK
    sp_digit t[3][72];
#else
    sp_digit* td;
    sp_digit* t[3];
#endif
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    td = (sp_digit*)XMALLOC(sizeof(*td) * 3 * 36 * 2, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        t[0] = td;
        t[1] = &td[36 * 2];
        t[2] = &td[2 * 36 * 2];
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_36(norm, m);

        if (reduceA) {
            err = sp_2048_mod_36(t[1], a, m);
            if (err == MP_OKAY) {
                sp_2048_mul_36(t[1], t[1], norm);
                err = sp_2048_mod_36(t[1], t[1], m);
            }
        }
        else {
            sp_2048_mul_36(t[1], a, norm);
            err = sp_2048_mod_36(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        i = bits / 57;
        c = bits % 57;
        n = e[i--] << (57 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1)
                    break;

                n = e[i--];
                c = 57;
            }

            y = (n >> 56) & 1;
            n <<= 1;

            sp_2048_mont_mul_36(t[y^1], t[0], t[1], m, mp);

            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                 ((size_t)t[1] & addr_mask[y])), sizeof(t[2]));
            sp_2048_mont_sqr_36(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                           ((size_t)t[1] & addr_mask[y])), t[2], sizeof(t[2]));
        }

        sp_2048_mont_reduce_36(t[0], m, mp);
        n = sp_2048_cmp_36(t[0], m);
        sp_2048_cond_sub_36(t[0], t[0], m, (n < 0) - 1);
        XMEMCPY(r, t[0], sizeof(t[0]));
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#else
#ifndef WOLFSSL_SMALL_STACK
    sp_digit t[32][72];
#else
    sp_digit* t[32];
    sp_digit* td;
#endif
    sp_digit* norm;
    sp_digit rt[72];
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 72, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        for (i=0; i<32; i++)
            t[i] = td + i * 72;
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_36(norm, m);

        if (reduceA) {
            err = sp_2048_mod_36(t[1], a, m);
            if (err == MP_OKAY) {
                sp_2048_mul_36(t[1], t[1], norm);
                err = sp_2048_mod_36(t[1], t[1], m);
            }
        }
        else {
            sp_2048_mul_36(t[1], a, norm);
            err = sp_2048_mod_36(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        sp_2048_mont_sqr_36(t[ 2], t[ 1], m, mp);
        sp_2048_mont_mul_36(t[ 3], t[ 2], t[ 1], m, mp);
        sp_2048_mont_sqr_36(t[ 4], t[ 2], m, mp);
        sp_2048_mont_mul_36(t[ 5], t[ 3], t[ 2], m, mp);
        sp_2048_mont_sqr_36(t[ 6], t[ 3], m, mp);
        sp_2048_mont_mul_36(t[ 7], t[ 4], t[ 3], m, mp);
        sp_2048_mont_sqr_36(t[ 8], t[ 4], m, mp);
        sp_2048_mont_mul_36(t[ 9], t[ 5], t[ 4], m, mp);
        sp_2048_mont_sqr_36(t[10], t[ 5], m, mp);
        sp_2048_mont_mul_36(t[11], t[ 6], t[ 5], m, mp);
        sp_2048_mont_sqr_36(t[12], t[ 6], m, mp);
        sp_2048_mont_mul_36(t[13], t[ 7], t[ 6], m, mp);
        sp_2048_mont_sqr_36(t[14], t[ 7], m, mp);
        sp_2048_mont_mul_36(t[15], t[ 8], t[ 7], m, mp);
        sp_2048_mont_sqr_36(t[16], t[ 8], m, mp);
        sp_2048_mont_mul_36(t[17], t[ 9], t[ 8], m, mp);
        sp_2048_mont_sqr_36(t[18], t[ 9], m, mp);
        sp_2048_mont_mul_36(t[19], t[10], t[ 9], m, mp);
        sp_2048_mont_sqr_36(t[20], t[10], m, mp);
        sp_2048_mont_mul_36(t[21], t[11], t[10], m, mp);
        sp_2048_mont_sqr_36(t[22], t[11], m, mp);
        sp_2048_mont_mul_36(t[23], t[12], t[11], m, mp);
        sp_2048_mont_sqr_36(t[24], t[12], m, mp);
        sp_2048_mont_mul_36(t[25], t[13], t[12], m, mp);
        sp_2048_mont_sqr_36(t[26], t[13], m, mp);
        sp_2048_mont_mul_36(t[27], t[14], t[13], m, mp);
        sp_2048_mont_sqr_36(t[28], t[14], m, mp);
        sp_2048_mont_mul_36(t[29], t[15], t[14], m, mp);
        sp_2048_mont_sqr_36(t[30], t[15], m, mp);
        sp_2048_mont_mul_36(t[31], t[16], t[15], m, mp);

        bits = ((bits + 4) / 5) * 5;
        i = ((bits + 56) / 57) - 1;
        c = bits % 57;
        if (c == 0)
            c = 57;
        if (i < 36)
            n = e[i--] << (64 - c);
        else {
            n = 0;
            i--;
        }
        if (c < 5) {
            n |= e[i--] << (7 - c);
            c += 57;
        }
        y = n >> 59;
        n <<= 5;
        c -= 5;
        XMEMCPY(rt, t[y], sizeof(rt));
        for (; i>=0 || c>=5; ) {
            if (c < 5) {
                n |= e[i--] << (7 - c);
                c += 57;
            }
            y = (n >> 59) & 0x1f;
            n <<= 5;
            c -= 5;

            sp_2048_mont_sqr_36(rt, rt, m, mp);
            sp_2048_mont_sqr_36(rt, rt, m, mp);
            sp_2048_mont_sqr_36(rt, rt, m, mp);
            sp_2048_mont_sqr_36(rt, rt, m, mp);
            sp_2048_mont_sqr_36(rt, rt, m, mp);

            sp_2048_mont_mul_36(rt, rt, t[y], m, mp);
        }

        sp_2048_mont_reduce_36(rt, m, mp);
        n = sp_2048_cmp_36(rt, m);
        sp_2048_cond_sub_36(rt, rt, m, (n < 0) - 1);
        XMEMCPY(r, rt, sizeof(rt));
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#endif
}
#endif /* SP_RSA_PRIVATE_EXP_D || WOLFSSL_HAVE_SP_DH */

#if defined(WOLFSSL_HAVE_SP_RSA) && !defined(SP_RSA_PRIVATE_EXP_D)
/* Apply a mask to every digit of a number. (r[i] = a[i] & m)
 *
 * r  Result: 18 digits, each the AND of the matching digit of a with m.
 * a  A single precision integer of 18 digits.
 * m  Mask to AND against each digit.
 */
static void sp_2048_mask_18(sp_digit* r, sp_digit* a, sp_digit m)
{
#ifdef WOLFSSL_SP_SMALL
    int idx = 0;

    /* Compact form: one digit per iteration. */
    while (idx < 18) {
        r[idx] = a[idx] & m;
        idx++;
    }
#else
    int base;

    /* Two groups of eight digits, addressed relative to the group start. */
    for (base = 0; base < 16; base += 8) {
        sp_digit* dst = r + base;
        sp_digit* src = a + base;

        dst[0] = src[0] & m;
        dst[1] = src[1] & m;
        dst[2] = src[2] & m;
        dst[3] = src[3] & m;
        dst[4] = src[4] & m;
        dst[5] = src[5] & m;
        dst[6] = src[6] & m;
        dst[7] = src[7] & m;
    }
    /* The two digits left over after the groups of eight. */
    r[16] = a[16] & m;
    r[17] = a[17] & m;
#endif
}

#endif
#ifdef WOLFSSL_HAVE_SP_RSA
/* RSA public key operation.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base.
 * em      Public exponent.
 * mm      Modulus.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 256 bytes long.
 * outLen  On entry the number of bytes available in out, on successful
 *         return the number of bytes in the result (always 256).
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * the exponent has more than 57 bits, the input is longer than 256 bytes or
 * the modulus is not 2048 bits, MP_EXPTMOD_E when the loaded exponent word is
 * zero and MEMORY_E when dynamic memory allocation fails.
 */
int sp_RsaPublic_2048(const byte* in, word32 inLen, mp_int* em, mp_int* mm,
    byte* out, word32* outLen)
{
#ifdef WOLFSSL_SP_SMALL
    /* Must start as NULL: the cleanup at the end runs on every path,
     * including the ones that fail before any allocation is made. */
    sp_digit* d = NULL;
    sp_digit* a;
    sp_digit* m;
    sp_digit* r;
    sp_digit* norm;
    sp_digit e[1];
    sp_digit mp;
    int i;
    int err = MP_OKAY;

    if (*outLen < 256)
        err = MP_TO_E;
    if (err == MP_OKAY && (mp_count_bits(em) > 57 || inLen > 256 ||
                                                     mp_count_bits(mm) != 2048))
        err = MP_READ_E;

    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 36 * 5, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Carve the single allocation: a (72 digits), r (72), m (36). */
        a = d;
        r = a + 36 * 2;
        m = r + 36 * 2;
        /* norm shares storage with r; r is only written after norm is used. */
        norm = r;

        sp_2048_from_bin(a, 36, in, inLen);
#if DIGIT_BIT >= 57
        e[0] = em->dp[0];
#else
        /* Exponent may span two mp digits - combine them into one word. */
        e[0] = em->dp[0];
        if (em->used > 1)
            e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT;
#endif
        if (e[0] == 0)
            err = MP_EXPTMOD_E;
    }

    if (err == MP_OKAY) {
        sp_2048_from_mp(m, 36, mm);

        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_36(norm, m);
    }
    if (err == MP_OKAY) {
        /* a = (a * norm) mod m */
        sp_2048_mul_36(a, a, norm);
        err = sp_2048_mod_36(a, a, m);
    }
    if (err == MP_OKAY) {
        /* Locate the top set bit of the exponent. */
        for (i=56; i>=0; i--)
            if (e[0] >> i)
                break;

        /* Left-to-right square and multiply over the remaining bits. */
        XMEMCPY(r, a, sizeof(sp_digit) * 36 * 2);
        for (i--; i>=0; i--) {
            sp_2048_mont_sqr_36(r, r, m, mp);

            if (((e[0] >> i) & 1) == 1)
                sp_2048_mont_mul_36(r, r, a, m, mp);
        }
        sp_2048_mont_reduce_36(r, m, mp);
        /* mp is reused to hold the comparison result; subtract m from r
         * unless the comparison is negative. */
        mp = sp_2048_cmp_36(r, m);
        sp_2048_cond_sub_36(r, r, m, (mp < 0) - 1);

        sp_2048_to_bin(r, out);
        *outLen = 256;
    }

    if (d != NULL)
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);

    return err;
#else
#if !defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SMALL_STACK)
    sp_digit ad[72], md[36], rd[72];
#else
    sp_digit* d = NULL;
#endif
    sp_digit* a;
    sp_digit* m;
    sp_digit* r;
    sp_digit e[1];
    int err = MP_OKAY;

    if (*outLen < 256)
        err = MP_TO_E;
    if (err == MP_OKAY && (mp_count_bits(em) > 57 || inLen > 256 ||
                                                     mp_count_bits(mm) != 2048))
        err = MP_READ_E;

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 36 * 5, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Carve the single allocation: a (72 digits), r (72), m (36). */
        a = d;
        r = a + 36 * 2;
        m = r + 36 * 2;
    }
#else
    a = ad;
    m = md;
    r = rd;
#endif

    if (err == MP_OKAY) {
        sp_2048_from_bin(a, 36, in, inLen);
#if DIGIT_BIT >= 57
        e[0] = em->dp[0];
#else
        /* Exponent may span two mp digits - combine them into one word. */
        e[0] = em->dp[0];
        if (em->used > 1)
            e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT;
#endif
        if (e[0] == 0)
            err = MP_EXPTMOD_E;
    }
    if (err == MP_OKAY) {
        sp_2048_from_mp(m, 36, mm);

        if (e[0] == 0x3) {
            /* Exponent 3: r = a^2 mod m, then r = a * r mod m. */
            if (err == MP_OKAY) {
                sp_2048_sqr_36(r, a);
                err = sp_2048_mod_36(r, r, m);
            }
            if (err == MP_OKAY) {
                sp_2048_mul_36(r, a, r);
                err = sp_2048_mod_36(r, r, m);
            }
        }
        else {
            /* norm shares storage with r; r is only written after norm is
             * used. */
            sp_digit* norm = r;
            int i;
            sp_digit mp;

            sp_2048_mont_setup(m, &mp);
            sp_2048_mont_norm_36(norm, m);

            if (err == MP_OKAY) {
                /* a = (a * norm) mod m */
                sp_2048_mul_36(a, a, norm);
                err = sp_2048_mod_36(a, a, m);
            }

            if (err == MP_OKAY) {
                /* Locate the top set bit of the exponent. */
                for (i=56; i>=0; i--)
                    if (e[0] >> i)
                        break;

                /* Left-to-right square and multiply over remaining bits. */
                XMEMCPY(r, a, sizeof(sp_digit) * 72);
                for (i--; i>=0; i--) {
                    sp_2048_mont_sqr_36(r, r, m, mp);

                    if (((e[0] >> i) & 1) == 1)
                        sp_2048_mont_mul_36(r, r, a, m, mp);
                }
                sp_2048_mont_reduce_36(r, m, mp);
                /* mp is reused to hold the comparison result; subtract m
                 * from r unless the comparison is negative. */
                mp = sp_2048_cmp_36(r, m);
                sp_2048_cond_sub_36(r, r, m, (mp < 0) - 1);
            }
        }
    }

    if (err == MP_OKAY) {
        sp_2048_to_bin(r, out);
        *outLen = 256;
    }

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (d != NULL)
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#endif /* WOLFSSL_SP_SMALL */
}

/* RSA private key operation.
 *
 * When SP_RSA_PRIVATE_EXP_D is defined the private exponent dm and modulus mm
 * are used directly and the CRT parameters are ignored; otherwise the CRT
 * parameters (pm, qm, dpm, dqm, qim) are used and dm is ignored.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base.
 * dm      Private exponent.
 * pm      First prime.
 * qm      Second prime.
 * dpm     First prime's CRT exponent.
 * dqm     Second prime's CRT exponent.
 * qim     Inverse of second prime mod p.
 * mm      Modulus.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 256 bytes long.
 * outLen  On entry the number of bytes available in out, on successful
 *         return the number of bytes in the result (always 256).
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long or the modulus is not 2048 bits and MEMORY_E when
 * dynamic memory allocation fails.
 */
int sp_RsaPrivate_2048(const byte* in, word32 inLen, mp_int* dm,
    mp_int* pm, mp_int* qm, mp_int* dpm, mp_int* dqm, mp_int* qim, mp_int* mm,
    byte* out, word32* outLen)
{
#ifdef SP_RSA_PRIVATE_EXP_D
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    sp_digit* a;
    sp_digit* d = NULL;
    sp_digit* m;
    sp_digit* r;
    int err = MP_OKAY;

    (void)pm;
    (void)qm;
    (void)dpm;
    (void)dqm;
    (void)qim;

    if (*outLen < 256)
        err = MP_TO_E;
    if (err == MP_OKAY && (mp_count_bits(dm) > 2048 || inLen > 256 ||
                                                     mp_count_bits(mm) != 2048))
        err = MP_READ_E;

    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 36 * 4, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }
    if (err == MP_OKAY) {
        /* Private exponent occupies the first 36 digits of the buffer. */
        a = d + 36;
        m = a + 36;
        r = a;

        sp_2048_from_bin(a, 36, in, inLen);
        sp_2048_from_mp(d, 36, dm);
        sp_2048_from_mp(m, 36, mm);
        err = sp_2048_mod_exp_36(r, a, d, 2048, m, 0);
    }
    if (err == MP_OKAY) {
        sp_2048_to_bin(r, out);
        *outLen = 256;
    }

    if (d != NULL) {
        /* Wipe the private exponent before releasing the memory. */
        XMEMSET(d, 0, sizeof(sp_digit) * 36);
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }

    return err;
#else
    sp_digit a[72], d[36], m[36];
    sp_digit* r = a;
    int err = MP_OKAY;

    (void)pm;
    (void)qm;
    (void)dpm;
    (void)dqm;
    (void)qim;

    if (*outLen < 256)
        err = MP_TO_E;
    if (err == MP_OKAY && (mp_count_bits(dm) > 2048 || inLen > 256 ||
                                                     mp_count_bits(mm) != 2048))
        err = MP_READ_E;

    if (err == MP_OKAY) {
        sp_2048_from_bin(a, 36, in, inLen);
        sp_2048_from_mp(d, 36, dm);
        sp_2048_from_mp(m, 36, mm);
        err = sp_2048_mod_exp_36(r, a, d, 2048, m, 0);
    }

    if (err == MP_OKAY) {
        sp_2048_to_bin(r, out);
        *outLen = 256;
    }

    /* Wipe the private exponent. */
    XMEMSET(d, 0, sizeof(sp_digit) * 36);

    return err;
#endif /* WOLFSSL_SP_SMALL || defined(WOLFSSL_SMALL_STACK) */
#else
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    sp_digit* t = NULL;
    sp_digit* a;
    sp_digit* p;
    sp_digit* q;
    sp_digit* dp;
    sp_digit* dq;
    sp_digit* qi;
    sp_digit* tmp;
    sp_digit* tmpa;
    sp_digit* tmpb;
    sp_digit* r;
    int err = MP_OKAY;

    (void)dm;
    (void)mm;

    if (*outLen < 256)
        err = MP_TO_E;
    if (err == MP_OKAY && (inLen > 256 || mp_count_bits(mm) != 2048))
        err = MP_READ_E;

    if (err == MP_OKAY) {
        t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 11, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (t == NULL)
            err = MEMORY_E;
    }
    if (err == MP_OKAY) {
        /* Carve up the single allocation. dp, dq and qi share one 18 digit
         * slot as each is loaded only when it is about to be used. */
        a = t;
        p = a + 36 * 2;
        q = p + 18;
        qi = dq = dp = q + 18;
        tmpa = qi + 18;
        tmpb = tmpa + 36;

        /* tmp and r reuse the storage of a; they are first written after
         * both exponentiations have read a. */
        tmp = t;
        r = tmp + 36;

        sp_2048_from_bin(a, 36, in, inLen);
        sp_2048_from_mp(p, 18, pm);
        sp_2048_from_mp(q, 18, qm);
        sp_2048_from_mp(dp, 18, dpm);
        err = sp_2048_mod_exp_18(tmpa, a, dp, 1024, p, 1);
    }
    if (err == MP_OKAY) {
        sp_2048_from_mp(dq, 18, dqm);
        err = sp_2048_mod_exp_18(tmpb, a, dq, 1024, q, 1);
    }
    if (err == MP_OKAY) {
        /* tmpa = tmpa - tmpb, adding p back when the top digit has its sign
         * bit set (the mask is derived from tmpa[17] >> 63). */
        sp_2048_sub_18(tmpa, tmpa, tmpb);
        sp_2048_mask_18(tmp, p, tmpa[17] >> 63);
        sp_2048_add_18(tmpa, tmpa, tmp);

        /* tmpa = (tmpa * qi) mod p */
        sp_2048_from_mp(qi, 18, qim);
        sp_2048_mul_18(tmpa, tmpa, qi);
        err = sp_2048_mod_18(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        /* r = tmpb + q * tmpa */
        sp_2048_mul_18(tmpa, q, tmpa);
        sp_2048_add_36(r, tmpb, tmpa);
        sp_2048_norm_36(r);

        sp_2048_to_bin(r, out);
        *outLen = 256;
    }

    if (t != NULL) {
        /* Wipe all key material and intermediates before freeing. */
        XMEMSET(t, 0, sizeof(sp_digit) * 18 * 11);
        XFREE(t, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }

    return err;
#else
    sp_digit a[36 * 2];
    sp_digit p[18], q[18], dp[18], dq[18], qi[18];
    sp_digit tmp[36], tmpa[36], tmpb[36];
    sp_digit* r = a;
    int err = MP_OKAY;

    (void)dm;
    (void)mm;

    if (*outLen < 256)
        err = MP_TO_E;
    if (err == MP_OKAY && (inLen > 256 || mp_count_bits(mm) != 2048))
        err = MP_READ_E;

    if (err == MP_OKAY) {
        sp_2048_from_bin(a, 36, in, inLen);
        sp_2048_from_mp(p, 18, pm);
        sp_2048_from_mp(q, 18, qm);
        sp_2048_from_mp(dp, 18, dpm);
        sp_2048_from_mp(dq, 18, dqm);
        sp_2048_from_mp(qi, 18, qim);

        err = sp_2048_mod_exp_18(tmpa, a, dp, 1024, p, 1);
    }
    if (err == MP_OKAY)
        err = sp_2048_mod_exp_18(tmpb, a, dq, 1024, q, 1);

    if (err == MP_OKAY) {
        /* tmpa = tmpa - tmpb, adding p back when the top digit has its sign
         * bit set (the mask is derived from tmpa[17] >> 63). */
        sp_2048_sub_18(tmpa, tmpa, tmpb);
        sp_2048_mask_18(tmp, p, tmpa[17] >> 63);
        sp_2048_add_18(tmpa, tmpa, tmp);
        /* tmpa = (tmpa * qi) mod p */
        sp_2048_mul_18(tmpa, tmpa, qi);
        err = sp_2048_mod_18(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        /* r = tmpb + tmpa * q */
        sp_2048_mul_18(tmpa, tmpa, q);
        sp_2048_add_36(r, tmpb, tmpa);
        sp_2048_norm_36(r);

        sp_2048_to_bin(r, out);
        *outLen = 256;
    }

    /* Wipe key material and intermediates. tmp is included because it may
     * hold a masked copy of the prime p. */
    XMEMSET(tmp, 0, sizeof(tmp));
    XMEMSET(tmpa, 0, sizeof(tmpa));
    XMEMSET(tmpb, 0, sizeof(tmpb));
    XMEMSET(p, 0, sizeof(p));
    XMEMSET(q, 0, sizeof(q));
    XMEMSET(dp, 0, sizeof(dp));
    XMEMSET(dq, 0, sizeof(dq));
    XMEMSET(qi, 0, sizeof(qi));

    return err;
#endif /* WOLFSSL_SP_SMALL || defined(WOLFSSL_SMALL_STACK) */
#endif /* SP_RSA_PRIVATE_EXP_D */
}

#endif /* WOLFSSL_HAVE_SP_RSA */
#ifdef WOLFSSL_HAVE_SP_DH
/* Convert an array of sp_digit to an mp_int.
 *
 * The 36 digits of a, each carrying 57 bits, are repacked into digits of
 * DIGIT_BIT bits in r.
 *
 * a  A single precision integer.
 * r  A multi-precision integer.
 * returns MP_OKAY on success, otherwise the error returned by mp_grow.
 */
static int sp_2048_to_mp(sp_digit* a, mp_int* r)
{
    int err;

    err = mp_grow(r, (2048 + DIGIT_BIT - 1) / DIGIT_BIT);
    if (err == MP_OKAY) {
#if DIGIT_BIT == 57
        /* Same digit size - copy straight across. */
        XMEMCPY(r->dp, a, sizeof(sp_digit) * 36);
        r->used = 36;
        mp_clamp(r);
#elif DIGIT_BIT < 57
        /* Each source digit spreads over more than one destination digit. */
        int i, j = 0, s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 36; i++) {
            r->dp[j] |= a[i] << s;
            /* Mask is built from a 64-bit one so the shift is well defined
             * even where long is only 32 bits wide. */
            r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
            r->dp[++j] = a[i] >> s;
            while (s + DIGIT_BIT <= 57) {
                s += DIGIT_BIT;
                r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1;
                r->dp[++j] = a[i] >> s;
            }
            s = 57 - s;
        }
        r->used = (2048 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#else
        /* Destination digits are wider than 57 bits. */
        int i, j = 0, s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 36; i++) {
            r->dp[j] |= ((mp_digit)a[i]) << s;
            if (s + 57 >= DIGIT_BIT) {
    #if DIGIT_BIT < 64
                /* Mask is built from a 64-bit one so the shift is well
                 * defined even where long is only 32 bits wide. */
                r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1;
    #endif
                s = DIGIT_BIT - s;
                r->dp[++j] = a[i] >> s;
                s = 57 - s;
            }
            else
                s += 57;
        }
        r->used = (2048 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#endif
    }

    return err;
}

/* Perform the modular exponentiation for Diffie-Hellman.
 *
 * base  Base. MP integer.
 * exp   Exponent. MP integer.
 * mod   Modulus. MP integer.
 * res   Result. MP integer.
 * returns 0 on success, MP_READ_E if the base or exponent has more than 2048
 * bits or the modulus is not 2048 bits, and MEMORY_E if memory allocation
 * fails.
 */
int sp_ModExp_2048(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res)
{
#ifdef WOLFSSL_SP_SMALL
    int err = MP_OKAY;
    sp_digit* d = NULL;
    sp_digit* b;
    sp_digit* e;
    sp_digit* m;
    sp_digit* r;
    int expBits = mp_count_bits(exp);

    if (mp_count_bits(base) > 2048 || expBits > 2048 ||
                                                   mp_count_bits(mod) != 2048) {
        err = MP_READ_E;
    }

    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(*d) * 36 * 4, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Carve the single allocation: b/r (72 digits), e (36), m (36). */
        b = d;
        e = b + 36 * 2;
        m = e + 36;
        r = b;

        sp_2048_from_mp(b, 36, base);
        sp_2048_from_mp(e, 36, exp);
        sp_2048_from_mp(m, 36, mod);

        err = sp_2048_mod_exp_36(r, b, e, expBits, m, 0);
    }

    if (err == MP_OKAY) {
        err = sp_2048_to_mp(r, res);
    }

    if (d != NULL) {
        /* e is valid whenever d was allocated - wipe the exponent. */
        XMEMSET(e, 0, sizeof(sp_digit) * 36);
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }
    return err;
#else
#ifndef WOLFSSL_SMALL_STACK
    sp_digit bd[72], ed[36], md[36];
#else
    sp_digit* d = NULL;
#endif
    sp_digit* b;
    sp_digit* e;
    sp_digit* m;
    sp_digit* r;
    int err = MP_OKAY;
    int expBits = mp_count_bits(exp);

    if (mp_count_bits(base) > 2048 || expBits > 2048 ||
                                                   mp_count_bits(mod) != 2048) {
        err = MP_READ_E;
    }

#ifdef WOLFSSL_SMALL_STACK
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(*d) * 36 * 4, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Carve the single allocation: b/r (72 digits), e (36), m (36). */
        b = d;
        e = b + 36 * 2;
        m = e + 36;
        r = b;
    }
#else
    r = b = bd;
    e = ed;
    m = md;
#endif

    if (err == MP_OKAY) {
        sp_2048_from_mp(b, 36, base);
        sp_2048_from_mp(e, 36, exp);
        sp_2048_from_mp(m, 36, mod);

        err = sp_2048_mod_exp_36(r, b, e, expBits, m, 0);
    }

    if (err == MP_OKAY) {
        err = sp_2048_to_mp(r, res);
    }

#ifdef WOLFSSL_SMALL_STACK
    /* e only points into the buffer once the allocation has succeeded, so
     * the wipe must not run on the validation/allocation failure paths. */
    if (d != NULL) {
        XMEMSET(e, 0, sizeof(sp_digit) * 36);
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }
#else
    /* Wipe the exponent held on the stack. */
    XMEMSET(e, 0, sizeof(sp_digit) * 36);
#endif

    return err;
#endif
}

/* Perform the modular exponentiation for Diffie-Hellman.
 *
 * base     Base.
 * exp      Array of bytes that is the exponent.
 * expLen   Length of data, in bytes, in exponent.
 * mod      Modulus.
 * out      Buffer to hold big-endian bytes of exponentiation result.
 *          Must be at least 256 bytes long.
 * outLen   Length, in bytes, of exponentiation result. Leading zero bytes
 *          are stripped, so this may be less than 256.
 * returns 0 on success, MP_READ_E if the base has more than 2048 bits, the
 * exponent is longer than 256 bytes or the modulus is not 2048 bits, and
 * MEMORY_E if memory allocation fails.
 */
int sp_DhExp_2048(mp_int* base, const byte* exp, word32 expLen,
    mp_int* mod, byte* out, word32* outLen)
{
#ifdef WOLFSSL_SP_SMALL
    int err = MP_OKAY;
    sp_digit* d = NULL;
    sp_digit* b;
    sp_digit* e;
    sp_digit* m;
    sp_digit* r;
    word32 i;

    if (mp_count_bits(base) > 2048 || expLen > 256 ||
                                                   mp_count_bits(mod) != 2048) {
        err = MP_READ_E;
    }

    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(*d) * 36 * 4, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Carve the single allocation: b/r (72 digits), e (36), m (36). */
        b = d;
        e = b + 36 * 2;
        m = e + 36;
        r = b;

        sp_2048_from_mp(b, 36, base);
        sp_2048_from_bin(e, 36, exp, expLen);
        sp_2048_from_mp(m, 36, mod);

        err = sp_2048_mod_exp_36(r, b, e, expLen * 8, m, 0);
    }

    if (err == MP_OKAY) {
        sp_2048_to_bin(r, out);
        *outLen = 256;
        /* Count the leading zero bytes and shift the result down over them. */
        for (i=0; i<256 && out[i] == 0; i++) {
        }
        *outLen -= i;
        XMEMMOVE(out, out + i, *outLen);
    }

    if (d != NULL) {
        /* e is valid whenever d was allocated - wipe the exponent. */
        XMEMSET(e, 0, sizeof(sp_digit) * 36);
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }
    return err;
#else
#ifndef WOLFSSL_SMALL_STACK
    sp_digit bd[72], ed[36], md[36];
#else
    sp_digit* d = NULL;
#endif
    sp_digit* b;
    sp_digit* e;
    sp_digit* m;
    sp_digit* r;
    word32 i;
    int err = MP_OKAY;

    if (mp_count_bits(base) > 2048 || expLen > 256 ||
                                                   mp_count_bits(mod) != 2048) {
        err = MP_READ_E;
    }

#ifdef WOLFSSL_SMALL_STACK
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(*d) * 36 * 4, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Carve the single allocation: b/r (72 digits), e (36), m (36). */
        b = d;
        e = b + 36 * 2;
        m = e + 36;
        r = b;
    }
#else
    r = b = bd;
    e = ed;
    m = md;
#endif

    if (err == MP_OKAY) {
        sp_2048_from_mp(b, 36, base);
        sp_2048_from_bin(e, 36, exp, expLen);
        sp_2048_from_mp(m, 36, mod);

        err = sp_2048_mod_exp_36(r, b, e, expLen * 8, m, 0);
    }

    if (err == MP_OKAY) {
        sp_2048_to_bin(r, out);
        *outLen = 256;
        /* Count the leading zero bytes and shift the result down over them. */
        for (i=0; i<256 && out[i] == 0; i++) {
        }
        *outLen -= i;
        XMEMMOVE(out, out + i, *outLen);
    }

#ifdef WOLFSSL_SMALL_STACK
    /* e only points into the buffer once the allocation has succeeded, so
     * the wipe must not run on the validation/allocation failure paths. */
    if (d != NULL) {
        XMEMSET(e, 0, sizeof(sp_digit) * 36);
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }
#else
    /* Wipe the exponent held on the stack. */
    XMEMSET(e, 0, sizeof(sp_digit) * 36);
#endif

    return err;
#endif
}
#endif /* WOLFSSL_HAVE_SP_DH */

#endif /* WOLFSSL_SP_NO_2048 */
#endif /* SP_WORD_SIZE == 64 */

#endif
#if !defined(WOLFSSL_X86_64_BUILD) || !defined(USE_INTEL_SPEEDUP)
#if SP_WORD_SIZE == 64
#ifndef WOLFSSL_SP_NO_3072
/* Read big endian unsigned byte array into r.
 *
 * Bytes are consumed from the end of the array (least significant first) and
 * packed into 57-bit digits. Digits of r not reached are set to zero.
 *
 * r    A single precision integer.
 * max  Number of digits available in r.
 * a    Byte array.
 * n    Number of bytes in array to read.
 */
static void sp_3072_from_bin(sp_digit* r, int max, const byte* a, int n)
{
    int pos = n - 1;    /* Index of the next byte to consume. */
    int word = 0;       /* Digit of r currently being filled. */
    int shift = 0;      /* Bit offset within the current digit. */

    r[0] = 0;
    while (pos >= 0) {
        const byte cur = a[pos];

        r[word] |= ((sp_digit)cur) << shift;
        if (shift < 49) {
            /* Whole byte fitted in the current digit. */
            shift += 8;
        }
        else {
            /* Digit is full: trim it to 57 bits and carry the remaining
             * high bits of this byte into the next digit. */
            r[word] &= 0x1ffffffffffffffl;
            shift = 57 - shift;
            if (word + 1 >= max)
                break;
            word++;
            r[word] = cur >> shift;
            shift = 8 - shift;
        }
        pos--;
    }

    /* Clear every digit above the last one written. */
    word++;
    while (word < max) {
        r[word] = 0;
        word++;
    }
}

/* Convert an mp_int to an array of sp_digit.
 *
 * The digits of a (DIGIT_BIT bits each) are repacked into 57-bit digits.
 * At most max digits of r are written; digits not reached are set to zero.
 *
 * r    A single precision integer.
 * max  Number of digits available in r.
 * a    A multi-precision integer.
 */
static void sp_3072_from_mp(sp_digit* r, int max, mp_int* a)
{
#if DIGIT_BIT == 57
    int j;
    int used = a->used;

    /* Same digit size - copy straight across, but never more digits than r
     * can hold (the other configurations are bounded by max as well). */
    if (used > max)
        used = max;
    XMEMCPY(r, a->dp, sizeof(sp_digit) * used);

    for (j = used; j < max; j++)
        r[j] = 0;
#elif DIGIT_BIT > 57
    /* Each source digit spreads over more than one destination digit. */
    int i, j = 0, s = 0;

    r[0] = 0;
    for (i = 0; i < a->used && j < max; i++) {
        r[j] |= a->dp[i] << s;
        r[j] &= 0x1ffffffffffffffl;
        s = 57 - s;
        if (j + 1 >= max)
            break;
        r[++j] = a->dp[i] >> s;
        while (s + 57 <= DIGIT_BIT) {
            s += 57;
            r[j] &= 0x1ffffffffffffffl;
            if (j + 1 >= max)
                break;
            if (s < DIGIT_BIT)
                r[++j] = a->dp[i] >> s;
            else
                r[++j] = 0;
        }
        s = DIGIT_BIT - s;
    }

    /* Clear every digit above the last one written. */
    for (j++; j < max; j++)
        r[j] = 0;
#else
    /* Source digits are narrower than 57 bits - accumulate several into
     * each destination digit. */
    int i, j = 0, s = 0;

    r[0] = 0;
    for (i = 0; i < a->used && j < max; i++) {
        r[j] |= ((sp_digit)a->dp[i]) << s;
        if (s + DIGIT_BIT >= 57) {
            r[j] &= 0x1ffffffffffffffl;
            if (j + 1 >= max)
                break;
            s = 57 - s;
            r[++j] = a->dp[i] >> s;
            s = DIGIT_BIT - s;
        }
        else
            s += DIGIT_BIT;
    }

    /* Clear every digit above the last one written. */
    for (j++; j < max; j++)
        r[j] = 0;
#endif
}

/* Write r as big endian to byte array.
 * Fixed length number of bytes written: 384
 *
 * The digits of r are carry-normalised in place before being serialised,
 * so r is modified by this call.
 *
 * r  A single precision integer.
 * a  Byte array.
 */
static void sp_3072_to_bin(sp_digit* r, byte* a)
{
    int word;           /* Digit of r being processed. */
    int pos;            /* Index of the output byte being written. */
    int shift = 0;      /* Bits already occupied in the current output byte. */
    int used;           /* Bits of the current digit already emitted. */

    /* Push any overflow above 57 bits up into the next digit. */
    for (word = 0; word < 53; word++) {
        r[word + 1] += r[word] >> 57;
        r[word] &= 0x1ffffffffffffffl;
    }

    /* Fill the output from its last byte (least significant) backwards. */
    pos = 3072 / 8 - 1;
    a[pos] = 0;
    for (word = 0; word < 54 && pos >= 0; word++) {
        /* Complete the partially filled byte with the low bits. */
        a[pos] |= r[word] << shift;
        pos--;
        used = 8 - shift;
        if (pos < 0)
            break;

        /* Emit the rest of the digit a byte at a time. */
        while (used < 57) {
            a[pos] = r[word] >> used;
            pos--;
            used += 8;
            if (pos < 0)
                break;
        }
        if (pos < 0)
            break;

        /* Work out how much of the last byte written the digit filled and
         * step back onto it when it is only partially filled. */
        shift = 8 - (used - 57);
        a[pos] = 0;
        if (shift != 0)
            pos++;
    }
}

#ifndef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * Fully unrolled multiplication of two 9 digit numbers. Column k of the
 * product (tk) is the sum of every a[i] * b[j] with i + j == k, accumulated
 * in a 128-bit integer. Each column is then cut down to 57 bits with the
 * excess carried into the next column. All columns are computed before any
 * digit of r is stored.
 *
 * r  A single precision integer. Receives 18 digits (r[0]..r[17]).
 * a  A single precision integer. 9 digits are read.
 * b  A single precision integer. 9 digits are read.
 */
static void sp_3072_mul_9(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    /* Column sums: t0 is the least significant, t16 the most significant. */
    int128_t t0   = ((int128_t)a[ 0]) * b[ 0];
    int128_t t1   = ((int128_t)a[ 0]) * b[ 1]
                 + ((int128_t)a[ 1]) * b[ 0];
    int128_t t2   = ((int128_t)a[ 0]) * b[ 2]
                 + ((int128_t)a[ 1]) * b[ 1]
                 + ((int128_t)a[ 2]) * b[ 0];
    int128_t t3   = ((int128_t)a[ 0]) * b[ 3]
                 + ((int128_t)a[ 1]) * b[ 2]
                 + ((int128_t)a[ 2]) * b[ 1]
                 + ((int128_t)a[ 3]) * b[ 0];
    int128_t t4   = ((int128_t)a[ 0]) * b[ 4]
                 + ((int128_t)a[ 1]) * b[ 3]
                 + ((int128_t)a[ 2]) * b[ 2]
                 + ((int128_t)a[ 3]) * b[ 1]
                 + ((int128_t)a[ 4]) * b[ 0];
    int128_t t5   = ((int128_t)a[ 0]) * b[ 5]
                 + ((int128_t)a[ 1]) * b[ 4]
                 + ((int128_t)a[ 2]) * b[ 3]
                 + ((int128_t)a[ 3]) * b[ 2]
                 + ((int128_t)a[ 4]) * b[ 1]
                 + ((int128_t)a[ 5]) * b[ 0];
    int128_t t6   = ((int128_t)a[ 0]) * b[ 6]
                 + ((int128_t)a[ 1]) * b[ 5]
                 + ((int128_t)a[ 2]) * b[ 4]
                 + ((int128_t)a[ 3]) * b[ 3]
                 + ((int128_t)a[ 4]) * b[ 2]
                 + ((int128_t)a[ 5]) * b[ 1]
                 + ((int128_t)a[ 6]) * b[ 0];
    int128_t t7   = ((int128_t)a[ 0]) * b[ 7]
                 + ((int128_t)a[ 1]) * b[ 6]
                 + ((int128_t)a[ 2]) * b[ 5]
                 + ((int128_t)a[ 3]) * b[ 4]
                 + ((int128_t)a[ 4]) * b[ 3]
                 + ((int128_t)a[ 5]) * b[ 2]
                 + ((int128_t)a[ 6]) * b[ 1]
                 + ((int128_t)a[ 7]) * b[ 0];
    int128_t t8   = ((int128_t)a[ 0]) * b[ 8]
                 + ((int128_t)a[ 1]) * b[ 7]
                 + ((int128_t)a[ 2]) * b[ 6]
                 + ((int128_t)a[ 3]) * b[ 5]
                 + ((int128_t)a[ 4]) * b[ 4]
                 + ((int128_t)a[ 5]) * b[ 3]
                 + ((int128_t)a[ 6]) * b[ 2]
                 + ((int128_t)a[ 7]) * b[ 1]
                 + ((int128_t)a[ 8]) * b[ 0];
    int128_t t9   = ((int128_t)a[ 1]) * b[ 8]
                 + ((int128_t)a[ 2]) * b[ 7]
                 + ((int128_t)a[ 3]) * b[ 6]
                 + ((int128_t)a[ 4]) * b[ 5]
                 + ((int128_t)a[ 5]) * b[ 4]
                 + ((int128_t)a[ 6]) * b[ 3]
                 + ((int128_t)a[ 7]) * b[ 2]
                 + ((int128_t)a[ 8]) * b[ 1];
    int128_t t10  = ((int128_t)a[ 2]) * b[ 8]
                 + ((int128_t)a[ 3]) * b[ 7]
                 + ((int128_t)a[ 4]) * b[ 6]
                 + ((int128_t)a[ 5]) * b[ 5]
                 + ((int128_t)a[ 6]) * b[ 4]
                 + ((int128_t)a[ 7]) * b[ 3]
                 + ((int128_t)a[ 8]) * b[ 2];
    int128_t t11  = ((int128_t)a[ 3]) * b[ 8]
                 + ((int128_t)a[ 4]) * b[ 7]
                 + ((int128_t)a[ 5]) * b[ 6]
                 + ((int128_t)a[ 6]) * b[ 5]
                 + ((int128_t)a[ 7]) * b[ 4]
                 + ((int128_t)a[ 8]) * b[ 3];
    int128_t t12  = ((int128_t)a[ 4]) * b[ 8]
                 + ((int128_t)a[ 5]) * b[ 7]
                 + ((int128_t)a[ 6]) * b[ 6]
                 + ((int128_t)a[ 7]) * b[ 5]
                 + ((int128_t)a[ 8]) * b[ 4];
    int128_t t13  = ((int128_t)a[ 5]) * b[ 8]
                 + ((int128_t)a[ 6]) * b[ 7]
                 + ((int128_t)a[ 7]) * b[ 6]
                 + ((int128_t)a[ 8]) * b[ 5];
    int128_t t14  = ((int128_t)a[ 6]) * b[ 8]
                 + ((int128_t)a[ 7]) * b[ 7]
                 + ((int128_t)a[ 8]) * b[ 6];
    int128_t t15  = ((int128_t)a[ 7]) * b[ 8]
                 + ((int128_t)a[ 8]) * b[ 7];
    int128_t t16  = ((int128_t)a[ 8]) * b[ 8];

    /* Carry propagation: add the bits above 57 of each column into the next
     * column, then store the low 57 bits as the result digit. */
    t1   += t0  >> 57; r[ 0] = t0  & 0x1ffffffffffffffl;
    t2   += t1  >> 57; r[ 1] = t1  & 0x1ffffffffffffffl;
    t3   += t2  >> 57; r[ 2] = t2  & 0x1ffffffffffffffl;
    t4   += t3  >> 57; r[ 3] = t3  & 0x1ffffffffffffffl;
    t5   += t4  >> 57; r[ 4] = t4  & 0x1ffffffffffffffl;
    t6   += t5  >> 57; r[ 5] = t5  & 0x1ffffffffffffffl;
    t7   += t6  >> 57; r[ 6] = t6  & 0x1ffffffffffffffl;
    t8   += t7  >> 57; r[ 7] = t7  & 0x1ffffffffffffffl;
    t9   += t8  >> 57; r[ 8] = t8  & 0x1ffffffffffffffl;
    t10  += t9  >> 57; r[ 9] = t9  & 0x1ffffffffffffffl;
    t11  += t10 >> 57; r[10] = t10 & 0x1ffffffffffffffl;
    t12  += t11 >> 57; r[11] = t11 & 0x1ffffffffffffffl;
    t13  += t12 >> 57; r[12] = t12 & 0x1ffffffffffffffl;
    t14  += t13 >> 57; r[13] = t13 & 0x1ffffffffffffffl;
    t15  += t14 >> 57; r[14] = t14 & 0x1ffffffffffffffl;
    t16  += t15 >> 57; r[15] = t15 & 0x1ffffffffffffffl;
    /* The top column has no successor: its carry becomes the final digit. */
    r[17] = (sp_digit)(t16 >> 57);
                       r[16] = t16 & 0x1ffffffffffffffl;
}

/* Square a and put result in r. (r = a * a)
 *
 * Schoolbook squaring of a 9-digit number. Column sum t<k> collects every
 * product a[i] * a[j] with i + j == k; products with i != j appear twice in
 * the square, so they are added once and doubled. The column sums are then
 * split into 57-bit digits with the excess carried into the next column.
 *
 * r  A single precision integer; digits r[0]..r[17] are written.
 * a  A single precision integer; digits a[0]..a[8] are read.
 */
static void sp_3072_sqr_9(sp_digit* r, const sp_digit* a)
{
    int128_t t0   =  ((int128_t)a[ 0]) * a[ 0];
    int128_t t1   = (((int128_t)a[ 0]) * a[ 1]) * 2;
    int128_t t2   = (((int128_t)a[ 0]) * a[ 2]) * 2
                 +  ((int128_t)a[ 1]) * a[ 1];
    int128_t t3   = (((int128_t)a[ 0]) * a[ 3]
                 +  ((int128_t)a[ 1]) * a[ 2]) * 2;
    int128_t t4   = (((int128_t)a[ 0]) * a[ 4]
                 +  ((int128_t)a[ 1]) * a[ 3]) * 2
                 +  ((int128_t)a[ 2]) * a[ 2];
    int128_t t5   = (((int128_t)a[ 0]) * a[ 5]
                 +  ((int128_t)a[ 1]) * a[ 4]
                 +  ((int128_t)a[ 2]) * a[ 3]) * 2;
    int128_t t6   = (((int128_t)a[ 0]) * a[ 6]
                 +  ((int128_t)a[ 1]) * a[ 5]
                 +  ((int128_t)a[ 2]) * a[ 4]) * 2
                 +  ((int128_t)a[ 3]) * a[ 3];
    int128_t t7   = (((int128_t)a[ 0]) * a[ 7]
                 +  ((int128_t)a[ 1]) * a[ 6]
                 +  ((int128_t)a[ 2]) * a[ 5]
                 +  ((int128_t)a[ 3]) * a[ 4]) * 2;
    int128_t t8   = (((int128_t)a[ 0]) * a[ 8]
                 +  ((int128_t)a[ 1]) * a[ 7]
                 +  ((int128_t)a[ 2]) * a[ 6]
                 +  ((int128_t)a[ 3]) * a[ 5]) * 2
                 +  ((int128_t)a[ 4]) * a[ 4];
    int128_t t9   = (((int128_t)a[ 1]) * a[ 8]
                 +  ((int128_t)a[ 2]) * a[ 7]
                 +  ((int128_t)a[ 3]) * a[ 6]
                 +  ((int128_t)a[ 4]) * a[ 5]) * 2;
    int128_t t10  = (((int128_t)a[ 2]) * a[ 8]
                 +  ((int128_t)a[ 3]) * a[ 7]
                 +  ((int128_t)a[ 4]) * a[ 6]) * 2
                 +  ((int128_t)a[ 5]) * a[ 5];
    int128_t t11  = (((int128_t)a[ 3]) * a[ 8]
                 +  ((int128_t)a[ 4]) * a[ 7]
                 +  ((int128_t)a[ 5]) * a[ 6]) * 2;
    int128_t t12  = (((int128_t)a[ 4]) * a[ 8]
                 +  ((int128_t)a[ 5]) * a[ 7]) * 2
                 +  ((int128_t)a[ 6]) * a[ 6];
    int128_t t13  = (((int128_t)a[ 5]) * a[ 8]
                 +  ((int128_t)a[ 6]) * a[ 7]) * 2;
    int128_t t14  = (((int128_t)a[ 6]) * a[ 8]) * 2
                 +  ((int128_t)a[ 7]) * a[ 7];
    int128_t t15  = (((int128_t)a[ 7]) * a[ 8]) * 2;
    int128_t t16  =  ((int128_t)a[ 8]) * a[ 8];

    /* All column sums are formed from a before anything is stored in r.
     * Each step pushes the bits above 57 of one column into the next column
     * and keeps the low 57 bits as the output digit. */
    t1   += t0  >> 57; r[ 0] = t0  & 0x1ffffffffffffffl;
    t2   += t1  >> 57; r[ 1] = t1  & 0x1ffffffffffffffl;
    t3   += t2  >> 57; r[ 2] = t2  & 0x1ffffffffffffffl;
    t4   += t3  >> 57; r[ 3] = t3  & 0x1ffffffffffffffl;
    t5   += t4  >> 57; r[ 4] = t4  & 0x1ffffffffffffffl;
    t6   += t5  >> 57; r[ 5] = t5  & 0x1ffffffffffffffl;
    t7   += t6  >> 57; r[ 6] = t6  & 0x1ffffffffffffffl;
    t8   += t7  >> 57; r[ 7] = t7  & 0x1ffffffffffffffl;
    t9   += t8  >> 57; r[ 8] = t8  & 0x1ffffffffffffffl;
    t10  += t9  >> 57; r[ 9] = t9  & 0x1ffffffffffffffl;
    t11  += t10 >> 57; r[10] = t10 & 0x1ffffffffffffffl;
    t12  += t11 >> 57; r[11] = t11 & 0x1ffffffffffffffl;
    t13  += t12 >> 57; r[12] = t12 & 0x1ffffffffffffffl;
    t14  += t13 >> 57; r[13] = t13 & 0x1ffffffffffffffl;
    t15  += t14 >> 57; r[14] = t14 & 0x1ffffffffffffffl;
    t16  += t15 >> 57; r[15] = t15 & 0x1ffffffffffffffl;
    /* The last column supplies both of the top two digits. */
    r[17] = (sp_digit)(t16 >> 57);
                       r[16] = t16 & 0x1ffffffffffffffl;
}

/* Digit-wise addition of two 9-digit numbers. (r = a + b)
 *
 * Every output digit is just the sum of the two matching input digits; no
 * carry is moved from one digit to the next.
 *
 * r  A single precision integer that receives the 9 sums.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_3072_add_9(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int k;

    for (k = 0; k < 9; k++) {
        r[k] = a[k] + b[k];
    }

    return 0;
}

/* Digit-wise addition of two 18-digit numbers. (r = a + b)
 *
 * Every output digit is just the sum of the two matching input digits; no
 * carry is moved from one digit to the next.
 *
 * r  A single precision integer that receives the 18 sums.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_3072_add_18(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int base = 0;
    int k;

    /* Digits 0..15, eight per pass. */
    while (base < 16) {
        r[base + 0] = a[base + 0] + b[base + 0];
        r[base + 1] = a[base + 1] + b[base + 1];
        r[base + 2] = a[base + 2] + b[base + 2];
        r[base + 3] = a[base + 3] + b[base + 3];
        r[base + 4] = a[base + 4] + b[base + 4];
        r[base + 5] = a[base + 5] + b[base + 5];
        r[base + 6] = a[base + 6] + b[base + 6];
        r[base + 7] = a[base + 7] + b[base + 7];
        base += 8;
    }
    /* Digits 16 and 17. */
    for (k = 16; k < 18; k++) {
        r[k] = a[k] + b[k];
    }

    return 0;
}

/* Digit-wise subtraction of two 18-digit numbers. (r = a - b)
 *
 * Every output digit is the difference of the two matching input digits; no
 * borrow is moved from one digit to the next.
 *
 * r  A single precision integer that receives the 18 differences.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_3072_sub_18(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int base = 0;
    int k;

    /* Digits 0..15, eight per pass. */
    while (base < 16) {
        r[base + 0] = a[base + 0] - b[base + 0];
        r[base + 1] = a[base + 1] - b[base + 1];
        r[base + 2] = a[base + 2] - b[base + 2];
        r[base + 3] = a[base + 3] - b[base + 3];
        r[base + 4] = a[base + 4] - b[base + 4];
        r[base + 5] = a[base + 5] - b[base + 5];
        r[base + 6] = a[base + 6] - b[base + 6];
        r[base + 7] = a[base + 7] - b[base + 7];
        base += 8;
    }
    /* Digits 16 and 17. */
    for (k = 16; k < 18; k++) {
        r[k] = a[k] - b[k];
    }

    return 0;
}

/* Multiply two 18-digit numbers. (r = a * b)
 *
 * Each operand is split into a low and a high 9-digit half. Three 9-digit
 * multiplications are used: low*low, high*high and (low+high)*(low+high);
 * the cross term is obtained by subtracting the first two from the third.
 *
 * r  A single precision integer; 36 digits are written.
 * a  A single precision integer of 18 digits.
 * b  A single precision integer of 18 digits.
 */
static void sp_3072_mul_18(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    sp_digit mid[18];          /* product of the half sums, then cross term */
    sp_digit bsum[9];          /* b low half + b high half */
    sp_digit* asum = mid;      /* a low half + a high half, stored in mid */
    sp_digit* lo = r;          /* low*low lands in r[0..17] */
    sp_digit* hi = r + 18;     /* high*high lands in r[18..35] */

    sp_3072_add_9(asum, a, a + 9);
    sp_3072_add_9(bsum, b, b + 9);
    sp_3072_mul_9(hi, a + 9, b + 9);
    sp_3072_mul_9(lo, a, b);
    sp_3072_mul_9(mid, asum, bsum);
    /* mid = (sum product) - high product - low product */
    sp_3072_sub_18(mid, mid, hi);
    sp_3072_sub_18(mid, mid, lo);
    /* Fold the cross term in, nine digits up. */
    sp_3072_add_18(r + 9, r + 9, mid);
}

/* Square an 18-digit number. (r = a * a)
 *
 * The operand is split into a low and a high 9-digit half. Three 9-digit
 * squarings are used: low, high and (low+high); the cross term is obtained
 * by subtracting the first two squares from the third.
 *
 * r  A single precision integer; 36 digits are written.
 * a  A single precision integer of 18 digits.
 */
static void sp_3072_sqr_18(sp_digit* r, const sp_digit* a)
{
    sp_digit mid[18];          /* square of the half sum, then cross term */
    sp_digit* asum = mid;      /* low half + high half, stored in mid */
    sp_digit* lo = r;          /* low square lands in r[0..17] */
    sp_digit* hi = r + 18;     /* high square lands in r[18..35] */

    sp_3072_add_9(asum, a, a + 9);
    sp_3072_sqr_9(hi, a + 9);
    sp_3072_sqr_9(lo, a);
    sp_3072_sqr_9(mid, asum);
    /* mid = (sum square) - high square - low square */
    sp_3072_sub_18(mid, mid, hi);
    sp_3072_sub_18(mid, mid, lo);
    /* Fold the cross term in, nine digits up. */
    sp_3072_add_18(r + 9, r + 9, mid);
}

/* Digit-wise subtraction of two 36-digit numbers. (r = a - b)
 *
 * Every output digit is the difference of the two matching input digits; no
 * borrow is moved from one digit to the next.
 *
 * r  A single precision integer that receives the 36 differences.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_3072_sub_36(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int base = 0;
    int k;

    /* Digits 0..31, eight per pass. */
    while (base < 32) {
        r[base + 0] = a[base + 0] - b[base + 0];
        r[base + 1] = a[base + 1] - b[base + 1];
        r[base + 2] = a[base + 2] - b[base + 2];
        r[base + 3] = a[base + 3] - b[base + 3];
        r[base + 4] = a[base + 4] - b[base + 4];
        r[base + 5] = a[base + 5] - b[base + 5];
        r[base + 6] = a[base + 6] - b[base + 6];
        r[base + 7] = a[base + 7] - b[base + 7];
        base += 8;
    }
    /* Digits 32..35. */
    for (k = 32; k < 36; k++) {
        r[k] = a[k] - b[k];
    }

    return 0;
}

/* Digit-wise addition of two 36-digit numbers. (r = a + b)
 *
 * Every output digit is just the sum of the two matching input digits; no
 * carry is moved from one digit to the next.
 *
 * r  A single precision integer that receives the 36 sums.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_3072_add_36(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int base = 0;
    int k;

    /* Digits 0..31, eight per pass. */
    while (base < 32) {
        r[base + 0] = a[base + 0] + b[base + 0];
        r[base + 1] = a[base + 1] + b[base + 1];
        r[base + 2] = a[base + 2] + b[base + 2];
        r[base + 3] = a[base + 3] + b[base + 3];
        r[base + 4] = a[base + 4] + b[base + 4];
        r[base + 5] = a[base + 5] + b[base + 5];
        r[base + 6] = a[base + 6] + b[base + 6];
        r[base + 7] = a[base + 7] + b[base + 7];
        base += 8;
    }
    /* Digits 32..35. */
    for (k = 32; k < 36; k++) {
        r[k] = a[k] + b[k];
    }

    return 0;
}

/* Multiply a and b into r. (r = a * b)
 *
 * Each 54-digit operand is treated as three 18-digit parts,
 *   a = A0 + A1*W + A2*W^2 and b = B0 + B1*W + B2*W^2 (W = 18 digits).
 * Six 18-digit multiplications of parts and sums of parts are combined with
 * digit-wise additions and subtractions into the five coefficients of the
 * product, which are then added into r at digit offsets 0, 18, 36, 54, 72.
 *
 * r  A single precision integer; all 108 digits are written.
 * a  A single precision integer of 54 digits.
 * b  A single precision integer of 54 digits.
 */
static void sp_3072_mul_54(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    sp_digit p0[36];
    sp_digit p1[36];
    sp_digit p2[36];
    sp_digit p3[36];
    sp_digit p4[36];
    sp_digit p5[36];
    sp_digit t0[36];
    sp_digit t1[36];
    sp_digit t2[36];
    sp_digit a0[18];
    sp_digit a1[18];
    sp_digit a2[18];
    sp_digit b0[18];
    sp_digit b1[18];
    sp_digit b2[18];
    /* a0 = A0 + A1, a1 = A1 + A2, a2 = A0 + A1 + A2 (same for b). */
    sp_3072_add_18(a0, a, &a[18]);
    sp_3072_add_18(b0, b, &b[18]);
    sp_3072_add_18(a1, &a[18], &a[36]);
    sp_3072_add_18(b1, &b[18], &b[36]);
    sp_3072_add_18(a2, a0, &a[36]);
    sp_3072_add_18(b2, b0, &b[36]);
    /* p0 = A0*B0, p2 = A1*B1, p4 = A2*B2. */
    sp_3072_mul_18(p0, a, b);
    sp_3072_mul_18(p2, &a[18], &b[18]);
    sp_3072_mul_18(p4, &a[36], &b[36]);
    /* p1, p3, p5 = products of the matching part sums. */
    sp_3072_mul_18(p1, a0, b0);
    sp_3072_mul_18(p3, a1, b1);
    sp_3072_mul_18(p5, a2, b2);
    /* a and b are no longer read past this point; r is cleared so the
     * overlapping 36-digit pieces can be accumulated into it. */
    XMEMSET(r, 0, sizeof(*r)*2*54);
    sp_3072_sub_36(t0, p3, p2);
    sp_3072_sub_36(t1, p1, p2);
    /* t2 = p5 - p3 - p1 + 2*p2 = A0*B2 + A1*B1 + A2*B0 */
    sp_3072_sub_36(t2, p5, t0);
    sp_3072_sub_36(t2, t2, t1);
    /* t0 = p3 - p2 - p4 = A1*B2 + A2*B1 */
    sp_3072_sub_36(t0, t0, p4);
    /* t1 = p1 - p2 - p0 = A0*B1 + A1*B0 */
    sp_3072_sub_36(t1, t1, p0);
    /* r = p0 + t1*W + t2*W^2 + t0*W^3 + p4*W^4 */
    sp_3072_add_36(r, r, p0);
    sp_3072_add_36(&r[18], &r[18], t1);
    sp_3072_add_36(&r[36], &r[36], t2);
    sp_3072_add_36(&r[54], &r[54], t0);
    sp_3072_add_36(&r[72], &r[72], p4);
}

/* Square a into r. (r = a * a)
 *
 * The 54-digit operand is treated as three 18-digit parts,
 *   a = A0 + A1*W + A2*W^2 (W = 18 digits).
 * Six 18-digit squarings of parts and sums of parts are combined with
 * digit-wise additions and subtractions into the five coefficients of the
 * square, which are then added into r at digit offsets 0, 18, 36, 54, 72.
 *
 * r  A single precision integer; all 108 digits are written.
 * a  A single precision integer of 54 digits.
 */
static void sp_3072_sqr_54(sp_digit* r, const sp_digit* a)
{
    sp_digit p0[36];
    sp_digit p1[36];
    sp_digit p2[36];
    sp_digit p3[36];
    sp_digit p4[36];
    sp_digit p5[36];
    sp_digit t0[36];
    sp_digit t1[36];
    sp_digit t2[36];
    sp_digit a0[18];
    sp_digit a1[18];
    sp_digit a2[18];
    /* a0 = A0 + A1, a1 = A1 + A2, a2 = A0 + A1 + A2. */
    sp_3072_add_18(a0, a, &a[18]);
    sp_3072_add_18(a1, &a[18], &a[36]);
    sp_3072_add_18(a2, a0, &a[36]);
    /* p0 = A0^2, p2 = A1^2, p4 = A2^2. */
    sp_3072_sqr_18(p0, a);
    sp_3072_sqr_18(p2, &a[18]);
    sp_3072_sqr_18(p4, &a[36]);
    /* p1, p3, p5 = squares of the part sums. */
    sp_3072_sqr_18(p1, a0);
    sp_3072_sqr_18(p3, a1);
    sp_3072_sqr_18(p5, a2);
    /* a is no longer read past this point; r is cleared so the overlapping
     * 36-digit pieces can be accumulated into it. */
    XMEMSET(r, 0, sizeof(*r)*2*54);
    sp_3072_sub_36(t0, p3, p2);
    sp_3072_sub_36(t1, p1, p2);
    /* t2 = p5 - p3 - p1 + 2*p2 = 2*A0*A2 + A1^2 */
    sp_3072_sub_36(t2, p5, t0);
    sp_3072_sub_36(t2, t2, t1);
    /* t0 = p3 - p2 - p4 = 2*A1*A2 */
    sp_3072_sub_36(t0, t0, p4);
    /* t1 = p1 - p2 - p0 = 2*A0*A1 */
    sp_3072_sub_36(t1, t1, p0);
    /* r = p0 + t1*W + t2*W^2 + t0*W^3 + p4*W^4 */
    sp_3072_add_36(r, r, p0);
    sp_3072_add_36(&r[18], &r[18], t1);
    sp_3072_add_36(&r[36], &r[36], t2);
    sp_3072_add_36(&r[54], &r[54], t0);
    sp_3072_add_36(&r[72], &r[72], p4);
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Digit-wise addition of two 54-digit numbers. (r = a + b)
 *
 * Compact form: one digit per loop pass, no carry between digits.
 *
 * r  A single precision integer that receives the 54 sums.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_3072_add_54(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int k = 0;

    while (k < 54) {
        r[k] = a[k] + b[k];
        k++;
    }

    return 0;
}
#else
/* Digit-wise addition of two 54-digit numbers. (r = a + b)
 *
 * Every output digit is just the sum of the two matching input digits; no
 * carry is moved from one digit to the next.
 *
 * r  A single precision integer that receives the 54 sums.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_3072_add_54(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int base = 0;
    int k;

    /* Digits 0..47, eight per pass. */
    while (base < 48) {
        r[base + 0] = a[base + 0] + b[base + 0];
        r[base + 1] = a[base + 1] + b[base + 1];
        r[base + 2] = a[base + 2] + b[base + 2];
        r[base + 3] = a[base + 3] + b[base + 3];
        r[base + 4] = a[base + 4] + b[base + 4];
        r[base + 5] = a[base + 5] + b[base + 5];
        r[base + 6] = a[base + 6] + b[base + 6];
        r[base + 7] = a[base + 7] + b[base + 7];
        base += 8;
    }
    /* Digits 48..53. */
    for (k = 48; k < 54; k++) {
        r[k] = a[k] + b[k];
    }

    return 0;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Digit-wise subtraction of two 54-digit numbers. (r = a - b)
 *
 * Compact form: one digit per loop pass, no borrow between digits.
 *
 * r  A single precision integer that receives the 54 differences.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_3072_sub_54(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int k = 0;

    while (k < 54) {
        r[k] = a[k] - b[k];
        k++;
    }

    return 0;
}

#else
/* Digit-wise subtraction of two 54-digit numbers. (r = a - b)
 *
 * Every output digit is the difference of the two matching input digits; no
 * borrow is moved from one digit to the next.
 *
 * r  A single precision integer that receives the 54 differences.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_3072_sub_54(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int base = 0;
    int k;

    /* Digits 0..47, eight per pass. */
    while (base < 48) {
        r[base + 0] = a[base + 0] - b[base + 0];
        r[base + 1] = a[base + 1] - b[base + 1];
        r[base + 2] = a[base + 2] - b[base + 2];
        r[base + 3] = a[base + 3] - b[base + 3];
        r[base + 4] = a[base + 4] - b[base + 4];
        r[base + 5] = a[base + 5] - b[base + 5];
        r[base + 6] = a[base + 6] - b[base + 6];
        r[base + 7] = a[base + 7] - b[base + 7];
        base += 8;
    }
    /* Digits 48..53. */
    for (k = 48; k < 54; k++) {
        r[k] = a[k] - b[k];
    }

    return 0;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Multiply two 54-digit numbers. (r = a * b)
 *
 * Compact column-by-column product, working from the most significant column
 * down. The accumulator keeps the low 57 bits of the previous (higher) column
 * shifted up by 57 so that the current column's sum can be added beneath it.
 *
 * r  A single precision integer; 108 digits are written.
 * a  A single precision integer of 54 digits.
 * b  A single precision integer of 54 digits.
 */
static void sp_3072_mul_54(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    int col;
    int idx;
    int first;
    int last;
    int128_t acc;

    /* Column 106 holds the single product a[53] * b[53]. */
    acc = ((int128_t)a[53]) * b[53];
    r[107] = (sp_digit)(acc >> 57);
    acc = (acc & 0x1ffffffffffffffl) << 57;

    for (col = 105; col >= 0; col--) {
        /* Range of indices into a whose partner b[col - idx] exists. */
        first = (col < 53) ? col : 53;
        last = (col > 53) ? (col - 53) : 0;
        for (idx = first; idx >= last; idx--) {
            acc += ((int128_t)a[idx]) * b[col - idx];
        }
        /* Bits 114 and up spill into the digit two places above. */
        r[col + 2] += acc >> 114;
        r[col + 1] = (acc >> 57) & 0x1ffffffffffffffl;
        acc = (acc & 0x1ffffffffffffffl) << 57;
    }
    r[0] = (sp_digit)(acc >> 57);
}

/* Square a 54-digit number. (r = a * a)
 *
 * Compact column-by-column square, working from the most significant column
 * down. In each column the products a[i] * a[j] with i > j are added doubled
 * and, for even columns, the middle term a[col/2]^2 is added once.
 *
 * r  A single precision integer; 108 digits are written.
 * a  A single precision integer of 54 digits.
 */
static void sp_3072_sqr_54(sp_digit* r, const sp_digit* a)
{
    int col;
    int idx;
    int first;
    int half;
    int128_t acc;

    /* Column 106 holds the single product a[53] * a[53]. */
    acc = ((int128_t)a[53]) * a[53];
    r[107] = (sp_digit)(acc >> 57);
    acc = (acc & 0x1ffffffffffffffl) << 57;

    for (col = 105; col >= 0; col--) {
        half = col / 2;
        first = (col < 53) ? col : 53;
        /* Off-diagonal pairs: idx strictly above its partner col - idx. */
        for (idx = first; idx > half; idx--) {
            acc += ((int128_t)a[idx]) * a[col - idx] * 2;
        }
        /* Even columns have a diagonal term. */
        if ((col & 1) == 0) {
            acc += ((int128_t)a[half]) * a[half];
        }

        /* Bits 114 and up spill into the digit two places above. */
        r[col + 2] += acc >> 114;
        r[col + 1] = (acc >> 57) & 0x1ffffffffffffffl;
        acc = (acc & 0x1ffffffffffffffl) << 57;
    }
    r[0] = (sp_digit)(acc >> 57);
}

#endif /* WOLFSSL_SP_SMALL */
#if !defined(SP_RSA_PRIVATE_EXP_D) && defined(WOLFSSL_HAVE_SP_RSA)
#ifdef WOLFSSL_SP_SMALL
/* Digit-wise addition of two 27-digit numbers. (r = a + b)
 *
 * Compact form: one digit per loop pass, no carry between digits.
 *
 * r  A single precision integer that receives the 27 sums.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_3072_add_27(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int k = 0;

    while (k < 27) {
        r[k] = a[k] + b[k];
        k++;
    }

    return 0;
}
#else
/* Digit-wise addition of two 27-digit numbers. (r = a + b)
 *
 * Every output digit is just the sum of the two matching input digits; no
 * carry is moved from one digit to the next.
 *
 * r  A single precision integer that receives the 27 sums.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_3072_add_27(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int base = 0;
    int k;

    /* Digits 0..23, eight per pass. */
    while (base < 24) {
        r[base + 0] = a[base + 0] + b[base + 0];
        r[base + 1] = a[base + 1] + b[base + 1];
        r[base + 2] = a[base + 2] + b[base + 2];
        r[base + 3] = a[base + 3] + b[base + 3];
        r[base + 4] = a[base + 4] + b[base + 4];
        r[base + 5] = a[base + 5] + b[base + 5];
        r[base + 6] = a[base + 6] + b[base + 6];
        r[base + 7] = a[base + 7] + b[base + 7];
        base += 8;
    }
    /* Digits 24..26. */
    for (k = 24; k < 27; k++) {
        r[k] = a[k] + b[k];
    }

    return 0;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Digit-wise subtraction of two 27-digit numbers. (r = a - b)
 *
 * Compact form: one digit per loop pass, no borrow between digits.
 *
 * r  A single precision integer that receives the 27 differences.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_3072_sub_27(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int k = 0;

    while (k < 27) {
        r[k] = a[k] - b[k];
        k++;
    }

    return 0;
}

#else
/* Digit-wise subtraction of two 27-digit numbers. (r = a - b)
 *
 * Every output digit is the difference of the two matching input digits; no
 * borrow is moved from one digit to the next.
 *
 * r  A single precision integer that receives the 27 differences.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
static int sp_3072_sub_27(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int base = 0;
    int k;

    /* Digits 0..23, eight per pass. */
    while (base < 24) {
        r[base + 0] = a[base + 0] - b[base + 0];
        r[base + 1] = a[base + 1] - b[base + 1];
        r[base + 2] = a[base + 2] - b[base + 2];
        r[base + 3] = a[base + 3] - b[base + 3];
        r[base + 4] = a[base + 4] - b[base + 4];
        r[base + 5] = a[base + 5] - b[base + 5];
        r[base + 6] = a[base + 6] - b[base + 6];
        r[base + 7] = a[base + 7] - b[base + 7];
        base += 8;
    }
    /* Digits 24..26. */
    for (k = 24; k < 27; k++) {
        r[k] = a[k] - b[k];
    }

    return 0;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Multiply two 27-digit numbers. (r = a * b)
 *
 * Compact column-by-column product, working from the most significant column
 * down. The accumulator keeps the low 57 bits of the previous (higher) column
 * shifted up by 57 so that the current column's sum can be added beneath it.
 *
 * r  A single precision integer; 54 digits are written.
 * a  A single precision integer of 27 digits.
 * b  A single precision integer of 27 digits.
 */
static void sp_3072_mul_27(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    int col;
    int idx;
    int first;
    int last;
    int128_t acc;

    /* Column 52 holds the single product a[26] * b[26]. */
    acc = ((int128_t)a[26]) * b[26];
    r[53] = (sp_digit)(acc >> 57);
    acc = (acc & 0x1ffffffffffffffl) << 57;

    for (col = 51; col >= 0; col--) {
        /* Range of indices into a whose partner b[col - idx] exists. */
        first = (col < 26) ? col : 26;
        last = (col > 26) ? (col - 26) : 0;
        for (idx = first; idx >= last; idx--) {
            acc += ((int128_t)a[idx]) * b[col - idx];
        }
        /* Bits 114 and up spill into the digit two places above. */
        r[col + 2] += acc >> 114;
        r[col + 1] = (acc >> 57) & 0x1ffffffffffffffl;
        acc = (acc & 0x1ffffffffffffffl) << 57;
    }
    r[0] = (sp_digit)(acc >> 57);
}

#else
/* Multiply two 27-digit numbers. (r = a * b)
 *
 * All partial products are first summed per column in 128-bit temporaries;
 * the columns are then cut into 57-bit digits, carrying upwards.
 *
 * r  A single precision integer; 54 digits are written.
 * a  A single precision integer of 27 digits.
 * b  A single precision integer of 27 digits.
 */
static void sp_3072_mul_27(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    int128_t col[54];
    int128_t ax;
    int x;
    int y;

    XMEMSET(col, 0, sizeof(col));

    /* Accumulate a[x] * b[y] into column x + y. */
    for (x = 0; x < 27; x++) {
        ax = (int128_t)a[x];
        for (y = 0; y < 27; y++) {
            col[x + y] += ax * b[y];
        }
    }

    /* Keep 57 bits per digit and hand the rest to the next column. */
    for (x = 0; x < 53; x++) {
        r[x] = col[x] & 0x1ffffffffffffffl;
        col[x + 1] += col[x] >> 57;
    }
    r[53] = (sp_digit)col[53];
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Square a 27-digit number. (r = a * a)
 *
 * Compact column-by-column square, working from the most significant column
 * down. In each column the products a[i] * a[j] with i > j are added doubled
 * and, for even columns, the middle term a[col/2]^2 is added once.
 *
 * r  A single precision integer; 54 digits are written.
 * a  A single precision integer of 27 digits.
 */
static void sp_3072_sqr_27(sp_digit* r, const sp_digit* a)
{
    int col;
    int idx;
    int first;
    int half;
    int128_t acc;

    /* Column 52 holds the single product a[26] * a[26]. */
    acc = ((int128_t)a[26]) * a[26];
    r[53] = (sp_digit)(acc >> 57);
    acc = (acc & 0x1ffffffffffffffl) << 57;

    for (col = 51; col >= 0; col--) {
        half = col / 2;
        first = (col < 26) ? col : 26;
        /* Off-diagonal pairs: idx strictly above its partner col - idx. */
        for (idx = first; idx > half; idx--) {
            acc += ((int128_t)a[idx]) * a[col - idx] * 2;
        }
        /* Even columns have a diagonal term. */
        if ((col & 1) == 0) {
            acc += ((int128_t)a[half]) * a[half];
        }

        /* Bits 114 and up spill into the digit two places above. */
        r[col + 2] += acc >> 114;
        r[col + 1] = (acc >> 57) & 0x1ffffffffffffffl;
        acc = (acc & 0x1ffffffffffffffl) << 57;
    }
    r[0] = (sp_digit)(acc >> 57);
}

#else
/* Square a 27-digit number. (r = a * a)
 *
 * Partial products are summed per column in 128-bit temporaries, with each
 * off-diagonal product counted twice; the columns are then cut into 57-bit
 * digits, carrying upwards.
 *
 * r  A single precision integer; 54 digits are written.
 * a  A single precision integer of 27 digits.
 */
static void sp_3072_sqr_27(sp_digit* r, const sp_digit* a)
{
    int128_t col[54];
    int128_t ax;
    int x;
    int y;

    XMEMSET(col, 0, sizeof(col));

    for (x = 0; x < 27; x++) {
        ax = (int128_t)a[x];
        /* Pairs below the diagonal, doubled. */
        for (y = 0; y < x; y++) {
            col[x + y] += (ax * a[y]) * 2;
        }
        /* Diagonal term, once. */
        col[x + x] += ax * a[x];
    }

    /* Keep 57 bits per digit and hand the rest to the next column. */
    for (x = 0; x < 53; x++) {
        r[x] = col[x] & 0x1ffffffffffffffl;
        col[x + 1] += col[x] >> 57;
    }
    r[53] = (sp_digit)col[53];
}

#endif /* WOLFSSL_SP_SMALL */
#endif /* !SP_RSA_PRIVATE_EXP_D && WOLFSSL_HAVE_SP_RSA */

/* Calculate the bottom digit of -1/a mod 2^n.
 *
 * Only a[0] is used. An inverse of a[0] is refined by repeated Newton steps
 * (each step doubles the number of correct low bits), cut down to the 57-bit
 * digit size and negated modulo 2^57.
 *
 * The arithmetic is done in an unsigned type: the intermediate products wrap
 * by design, which is undefined for signed integers, and the constant 2^57
 * is formed from a 64-bit-wide literal so the shift is valid even where long
 * is only 32 bits.
 *
 * a    A single precision number.
 * rho  Bottom word of inverse.
 */
static void sp_3072_mont_setup(sp_digit* a, sp_digit* rho)
{
    unsigned long long x;
    unsigned long long b;

    b = (unsigned long long)a[0];
    x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
    x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
    x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
    x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
    x *= 2 - b * x;               /* here x*a==1 mod 2**64 */
    x &= 0x1ffffffffffffffULL;

    /* rho = -1/m mod b */
    *rho = (sp_digit)((1ULL << 57) - x);
}

#if !defined(SP_RSA_PRIVATE_EXP_D) && defined(WOLFSSL_HAVE_SP_RSA)
/* r = 2^n mod m where n is the number of bits to reduce by.
 *
 * r is first filled with 26 digits of 57 one-bits and a top digit of 54
 * one-bits, i.e. 2^1536 - 1. m is then subtracted once and 1 is added.
 * A single subtraction is only a full reduction when m uses all 1536 bits -
 * presumably guaranteed by the callers.
 *
 * r  A single precision number; 27 digits are written.
 * m  A single precision number of 27 digits.
 */
static void sp_3072_mont_norm_27(sp_digit* r, sp_digit* m)
{
    int k;

    /* Set r = 2^n - 1. */
#ifdef WOLFSSL_SP_SMALL
    k = 0;
    while (k < 26) {
        r[k] = 0x1ffffffffffffffl;
        k++;
    }
#else
    k = 0;
    while (k < 24) {
        r[k + 0] = 0x1ffffffffffffffl;
        r[k + 1] = 0x1ffffffffffffffl;
        r[k + 2] = 0x1ffffffffffffffl;
        r[k + 3] = 0x1ffffffffffffffl;
        r[k + 4] = 0x1ffffffffffffffl;
        r[k + 5] = 0x1ffffffffffffffl;
        r[k + 6] = 0x1ffffffffffffffl;
        r[k + 7] = 0x1ffffffffffffffl;
        k += 8;
    }
    for (k = 24; k < 26; k++) {
        r[k] = 0x1ffffffffffffffl;
    }
#endif
    /* The top digit only holds 54 bits. */
    r[26] = 0x3fffffffffffffl;

    /* r = (2^n - 1) mod n */
    sp_3072_sub_27(r, r, m);

    /* Add one so r = 2^n mod m */
    r[0] += 1;
}

/* Compare a with b in constant time.
 *
 * Digits are visited from most to least significant. The result latches the
 * first non-zero digit difference: once it is non-zero the mask (0 - !diff)
 * becomes zero and later differences are discarded, without branching.
 *
 * a  A single precision integer of 27 digits.
 * b  A single precision integer of 27 digits.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static sp_digit sp_3072_cmp_27(const sp_digit* a, const sp_digit* b)
{
    sp_digit diff = 0;
    int k;

#ifdef WOLFSSL_SP_SMALL
    k = 26;
    while (k >= 0) {
        diff |= (a[k] - b[k]) & (0 - !diff);
        k--;
    }
#else
    /* Top three digits first. */
    for (k = 26; k >= 24; k--) {
        diff |= (a[k] - b[k]) & (0 - !diff);
    }
    /* Then digits 23..0, eight per pass. */
    k = 16;
    while (k >= 0) {
        diff |= (a[k + 7] - b[k + 7]) & (0 - !diff);
        diff |= (a[k + 6] - b[k + 6]) & (0 - !diff);
        diff |= (a[k + 5] - b[k + 5]) & (0 - !diff);
        diff |= (a[k + 4] - b[k + 4]) & (0 - !diff);
        diff |= (a[k + 3] - b[k + 3]) & (0 - !diff);
        diff |= (a[k + 2] - b[k + 2]) & (0 - !diff);
        diff |= (a[k + 1] - b[k + 1]) & (0 - !diff);
        diff |= (a[k + 0] - b[k + 0]) & (0 - !diff);
        k -= 8;
    }
#endif /* WOLFSSL_SP_SMALL */

    return diff;
}

/* Conditionally subtract b from a using the mask m.
 * m is -1 to subtract and 0 when not.
 *
 * Each digit of b is ANDed with the mask before being subtracted, so the same
 * instructions run whether or not the subtraction takes effect. No borrow is
 * moved between digits.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 */
static void sp_3072_cond_sub_27(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int k;

#ifdef WOLFSSL_SP_SMALL
    k = 0;
    while (k < 27) {
        r[k] = a[k] - (b[k] & m);
        k++;
    }
#else
    /* Digits 0..23, eight per pass. */
    k = 0;
    while (k < 24) {
        r[k + 0] = a[k + 0] - (b[k + 0] & m);
        r[k + 1] = a[k + 1] - (b[k + 1] & m);
        r[k + 2] = a[k + 2] - (b[k + 2] & m);
        r[k + 3] = a[k + 3] - (b[k + 3] & m);
        r[k + 4] = a[k + 4] - (b[k + 4] & m);
        r[k + 5] = a[k + 5] - (b[k + 5] & m);
        r[k + 6] = a[k + 6] - (b[k + 6] & m);
        r[k + 7] = a[k + 7] - (b[k + 7] & m);
        k += 8;
    }
    /* Digits 24..26. */
    for (k = 24; k < 27; k++) {
        r[k] = a[k] - (b[k] & m);
    }
#endif /* WOLFSSL_SP_SMALL */
}

/* Mul a by scalar b and add into r. (r += a * b)
 *
 * Digits r[0]..r[27] are updated; r[27] receives the final carry.
 *
 * In the small build a single 128-bit accumulator carries between digits and
 * r[0]..r[26] are stored masked to 57 bits. In the other build each digit of
 * r has the low 57 bits of its own product and the bits above 57 of the
 * previous product added to it, so the digits of r are not reduced to 57 bits
 * here.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A scalar.
 */
static void sp_3072_mul_add_27(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
#ifdef WOLFSSL_SP_SMALL
    int128_t tb = b;
    int128_t t = 0;
    int i;

    for (i = 0; i < 27; i++) {
        /* t holds the carry from the previous digit on entry. */
        t += (tb * a[i]) + r[i];
        r[i] = t & 0x1ffffffffffffffl;
        t >>= 57;
    }
    r[27] += t;
#else
    int128_t tb = b;
    int128_t t[8];
    int i;

    /* t[] is scratch for the products of one pass; t[0] always holds the
     * product for the first digit of the pass and is refilled at the end of
     * the pass with the product for the first digit of the next one. */
    t[0] = tb * a[0]; r[0] += t[0] & 0x1ffffffffffffffl;
    for (i = 0; i < 24; i += 8) {
        t[1] = tb * a[i+1];
        r[i+1] += (t[0] >> 57) + (t[1] & 0x1ffffffffffffffl);
        t[2] = tb * a[i+2];
        r[i+2] += (t[1] >> 57) + (t[2] & 0x1ffffffffffffffl);
        t[3] = tb * a[i+3];
        r[i+3] += (t[2] >> 57) + (t[3] & 0x1ffffffffffffffl);
        t[4] = tb * a[i+4];
        r[i+4] += (t[3] >> 57) + (t[4] & 0x1ffffffffffffffl);
        t[5] = tb * a[i+5];
        r[i+5] += (t[4] >> 57) + (t[5] & 0x1ffffffffffffffl);
        t[6] = tb * a[i+6];
        r[i+6] += (t[5] >> 57) + (t[6] & 0x1ffffffffffffffl);
        t[7] = tb * a[i+7];
        r[i+7] += (t[6] >> 57) + (t[7] & 0x1ffffffffffffffl);
        t[0] = tb * a[i+8];
        r[i+8] += (t[7] >> 57) + (t[0] & 0x1ffffffffffffffl);
    }
    /* After the loop t[0] is the product for digit 24; finish digits 25, 26
     * and the carry digit. */
    t[1] = tb * a[25]; r[25] += (t[0] >> 57) + (t[1] & 0x1ffffffffffffffl);
    t[2] = tb * a[26]; r[26] += (t[1] >> 57) + (t[2] & 0x1ffffffffffffffl);
    r[27] +=  t[2] >> 57;
#endif /* WOLFSSL_SP_SMALL */
}

/* Normalize the values in each word to 57.
 *
 * Walks digits 0..25 from the bottom up: whatever a digit holds above its low
 * 57 bits is added to the next digit and then cleared. The top digit a[26]
 * only receives carry and is not masked.
 *
 * a  Array of sp_digit to normalize.
 */
static void sp_3072_norm_27(sp_digit* a)
{
#ifdef WOLFSSL_SP_SMALL
    int i;
    for (i = 0; i < 26; i++) {
        a[i+1] += a[i] >> 57;
        a[i] &= 0x1ffffffffffffffl;
    }
#else
    int i;
    /* Digits 0..23, eight per pass. Each pass ends by carrying into the first
     * digit of the next pass, so no digit needs to be handled twice. */
    for (i = 0; i < 24; i += 8) {
        a[i+1] += a[i+0] >> 57; a[i+0] &= 0x1ffffffffffffffl;
        a[i+2] += a[i+1] >> 57; a[i+1] &= 0x1ffffffffffffffl;
        a[i+3] += a[i+2] >> 57; a[i+2] &= 0x1ffffffffffffffl;
        a[i+4] += a[i+3] >> 57; a[i+3] &= 0x1ffffffffffffffl;
        a[i+5] += a[i+4] >> 57; a[i+4] &= 0x1ffffffffffffffl;
        a[i+6] += a[i+5] >> 57; a[i+5] &= 0x1ffffffffffffffl;
        a[i+7] += a[i+6] >> 57; a[i+6] &= 0x1ffffffffffffffl;
        a[i+8] += a[i+7] >> 57; a[i+7] &= 0x1ffffffffffffffl;
    }
    /* Digits 24 and 25. */
    a[24+1] += a[24] >> 57;
    a[24] &= 0x1ffffffffffffffl;
    a[25+1] += a[25] >> 57;
    a[25] &= 0x1ffffffffffffffl;
#endif
}

/* Shift the result in the high 1536 bits down to the bottom.
 *
 * Computes r = a >> 1536 over 54 input digits: 1536 = 26 * 57 + 54, so the
 * result starts with the top 3 bits of a[26] and each following input digit
 * contributes its 57 bits shifted up by 3. Input digits above a[26] may hold
 * more than 57 bits; the excess is carried into the next input digit through
 * s. The top output digit r[26] is stored unmasked, and r[27]..r[53] are
 * cleared. Every input digit is read before the output digit at the same
 * index or higher is written, and the function is called with r == a in
 * this file.
 *
 * r  A single precision number.
 * a  A single precision number.
 */
static void sp_3072_mont_shift_27(sp_digit* r, const sp_digit* a)
{
#ifdef WOLFSSL_SP_SMALL
    int i;
    sp_digit n, s;

    /* s: current source digit plus carry; n: bits waiting to be output. */
    s = a[27];
    n = a[26] >> 54;
    for (i = 0; i < 26; i++) {
        n += (s & 0x1ffffffffffffffl) << 3;
        r[i] = n & 0x1ffffffffffffffl;
        n >>= 57;
        s = a[28 + i] + (s >> 57);
    }
    /* Last source digit is not masked, so r[26] may exceed 57 bits. */
    n += s << 3;
    r[26] = n;
#else
    sp_digit n, s;
    int i;

    /* s: current source digit plus carry; n: bits waiting to be output. */
    s = a[27]; n = a[26] >> 54;
    for (i = 0; i < 24; i += 8) {
        n += (s & 0x1ffffffffffffffl) << 3; r[i+0] = n & 0x1ffffffffffffffl;
        n >>= 57; s = a[i+28] + (s >> 57);
        n += (s & 0x1ffffffffffffffl) << 3; r[i+1] = n & 0x1ffffffffffffffl;
        n >>= 57; s = a[i+29] + (s >> 57);
        n += (s & 0x1ffffffffffffffl) << 3; r[i+2] = n & 0x1ffffffffffffffl;
        n >>= 57; s = a[i+30] + (s >> 57);
        n += (s & 0x1ffffffffffffffl) << 3; r[i+3] = n & 0x1ffffffffffffffl;
        n >>= 57; s = a[i+31] + (s >> 57);
        n += (s & 0x1ffffffffffffffl) << 3; r[i+4] = n & 0x1ffffffffffffffl;
        n >>= 57; s = a[i+32] + (s >> 57);
        n += (s & 0x1ffffffffffffffl) << 3; r[i+5] = n & 0x1ffffffffffffffl;
        n >>= 57; s = a[i+33] + (s >> 57);
        n += (s & 0x1ffffffffffffffl) << 3; r[i+6] = n & 0x1ffffffffffffffl;
        n >>= 57; s = a[i+34] + (s >> 57);
        n += (s & 0x1ffffffffffffffl) << 3; r[i+7] = n & 0x1ffffffffffffffl;
        n >>= 57; s = a[i+35] + (s >> 57);
    }
    /* Output digits 24 and 25, then the unmasked top digit. */
    n += (s & 0x1ffffffffffffffl) << 3; r[24] = n & 0x1ffffffffffffffl;
    n >>= 57; s = a[52] + (s >> 57);
    n += (s & 0x1ffffffffffffffl) << 3; r[25] = n & 0x1ffffffffffffffl;
    n >>= 57; s = a[53] + (s >> 57);
    n += s << 3;              r[26] = n;
#endif /* WOLFSSL_SP_SMALL */
    /* Clear the upper half that the shift vacated. */
    XMEMSET(&r[27], 0, sizeof(*r) * 27);
}

/* Reduce the number back to 1536 bits (27 digits) using Montgomery reduction.
 *
 * For each of the low 27 digits a multiple mu of m is added so that the digit
 * becomes zero in its low bits; the bits above 57 are then carried into the
 * next digit. The top digit of m only has 54 bits, so the last step uses a
 * 54-bit mask. The upper part is shifted down, m is subtracted once if the
 * result still has bits at or above bit 54 of its top digit, and the digits
 * are normalized.
 *
 * The product a[i] * mp is formed in an unsigned type: only its low bits are
 * kept and the full product does not fit in a signed digit.
 *
 * a   A single precision number to reduce in place (54 digits are used).
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
static void sp_3072_mont_reduce_27(sp_digit* a, sp_digit* m, sp_digit mp)
{
    int i;
    sp_digit mu;

    for (i=0; i<26; i++) {
        mu = (sp_digit)(((unsigned long long)a[i] * (unsigned long long)mp) &
                        0x1ffffffffffffffULL);
        sp_3072_mul_add_27(a+i, m, mu);
        a[i+1] += a[i] >> 57;
    }
    /* Top digit: 54 significant bits only. */
    mu = (sp_digit)(((unsigned long long)a[i] * (unsigned long long)mp) &
                    0x3fffffffffffffULL);
    sp_3072_mul_add_27(a+i, m, mu);
    a[i+1] += a[i] >> 57;
    a[i] &= 0x1ffffffffffffffl;

    sp_3072_mont_shift_27(a, a);
    /* Subtract m when the shifted value overflows 1536 bits. */
    sp_3072_cond_sub_27(a, a, m, 0 - ((a[26] >> 54) > 0));
    sp_3072_norm_27(a);
}

/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * The double-width product is written to r by sp_3072_mul_27 and then reduced
 * in place by sp_3072_mont_reduce_27, so r needs room for 54 digits.
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_3072_mont_mul_27(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit* m, sp_digit mp)
{
    sp_3072_mul_27(r, a, b);
    sp_3072_mont_reduce_27(r, m, mp);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * The double-width square is written to r by sp_3072_sqr_27 and then reduced
 * in place by sp_3072_mont_reduce_27, so r needs room for 54 digits.
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_3072_mont_sqr_27(sp_digit* r, sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    sp_3072_sqr_27(r, a);
    sp_3072_mont_reduce_27(r, m, mp);
}

/* Multiply a by scalar b into r. (r = a * b)
 *
 * Digits r[0]..r[27] are written; r[27] receives the bits above 57 of the
 * last product.
 *
 * In the small build a single 128-bit accumulator carries between digits and
 * r[0]..r[26] are stored masked to 57 bits. In the other build each digit of
 * r is the low 57 bits of its own product plus the bits above 57 of the
 * previous product, so a digit may end up slightly larger than 57 bits.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A scalar.
 */
static void sp_3072_mul_d_27(sp_digit* r, const sp_digit* a, const sp_digit b)
{
#ifdef WOLFSSL_SP_SMALL
    int128_t tb = b;
    int128_t t = 0;
    int i;

    for (i = 0; i < 27; i++) {
        /* t holds the carry from the previous digit on entry. */
        t += tb * a[i];
        r[i] = t & 0x1ffffffffffffffl;
        t >>= 57;
    }
    r[27] = (sp_digit)t;
#else
    int128_t tb = b;
    int128_t t[8];
    int i;

    /* t[] is scratch for the products of one pass; t[0] always holds the
     * product for the first digit of the pass and is refilled at the end of
     * the pass with the product for the first digit of the next one. */
    t[0] = tb * a[0]; r[0] = t[0] & 0x1ffffffffffffffl;
    for (i = 0; i < 24; i += 8) {
        t[1] = tb * a[i+1];
        r[i+1] = (sp_digit)(t[0] >> 57) + (t[1] & 0x1ffffffffffffffl);
        t[2] = tb * a[i+2];
        r[i+2] = (sp_digit)(t[1] >> 57) + (t[2] & 0x1ffffffffffffffl);
        t[3] = tb * a[i+3];
        r[i+3] = (sp_digit)(t[2] >> 57) + (t[3] & 0x1ffffffffffffffl);
        t[4] = tb * a[i+4];
        r[i+4] = (sp_digit)(t[3] >> 57) + (t[4] & 0x1ffffffffffffffl);
        t[5] = tb * a[i+5];
        r[i+5] = (sp_digit)(t[4] >> 57) + (t[5] & 0x1ffffffffffffffl);
        t[6] = tb * a[i+6];
        r[i+6] = (sp_digit)(t[5] >> 57) + (t[6] & 0x1ffffffffffffffl);
        t[7] = tb * a[i+7];
        r[i+7] = (sp_digit)(t[6] >> 57) + (t[7] & 0x1ffffffffffffffl);
        t[0] = tb * a[i+8];
        r[i+8] = (sp_digit)(t[7] >> 57) + (t[0] & 0x1ffffffffffffffl);
    }
    /* After the loop t[0] is the product for digit 24; finish digits 25, 26
     * and the carry digit. */
    t[1] = tb * a[25];
    r[25] = (sp_digit)(t[0] >> 57) + (t[1] & 0x1ffffffffffffffl);
    t[2] = tb * a[26];
    r[26] = (sp_digit)(t[1] >> 57) + (t[2] & 0x1ffffffffffffffl);
    r[27] =  (sp_digit)(t[2] >> 57);
#endif /* WOLFSSL_SP_SMALL */
}

/* Conditionally add a and b using the mask m.
 * m is -1 to add and 0 when not.
 *
 * Each digit of b is ANDed with the mask before being added, so the same
 * instructions run whether or not the addition takes effect. No carry is
 * moved between digits.
 *
 * r  A single precision number representing conditional add result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply.
 */
static void sp_3072_cond_add_27(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int k;

#ifdef WOLFSSL_SP_SMALL
    k = 0;
    while (k < 27) {
        r[k] = a[k] + (b[k] & m);
        k++;
    }
#else
    /* Digits 0..23, eight per pass. */
    k = 0;
    while (k < 24) {
        r[k + 0] = a[k + 0] + (b[k + 0] & m);
        r[k + 1] = a[k + 1] + (b[k + 1] & m);
        r[k + 2] = a[k + 2] + (b[k + 2] & m);
        r[k + 3] = a[k + 3] + (b[k + 3] & m);
        r[k + 4] = a[k + 4] + (b[k + 4] & m);
        r[k + 5] = a[k + 5] + (b[k + 5] & m);
        r[k + 6] = a[k + 6] + (b[k + 6] & m);
        r[k + 7] = a[k + 7] + (b[k + 7] & m);
        k += 8;
    }
    /* Digits 24..26. */
    for (k = 24; k < 27; k++) {
        r[k] = a[k] + (b[k] & m);
    }
#endif /* WOLFSSL_SP_SMALL */
}

/* Divide d in a and put remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * a  Number to be divided.
 * d  Number to divide with.
 * m  Multiplier result.
 * r  Remainder from the division.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_3072_div_27(sp_digit* a, sp_digit* d, sp_digit* m,
        sp_digit* r)
{
    int i;
    int128_t d1;
    sp_digit div, r1;
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    sp_digit* td;
#else
    sp_digit t1d[54], t2d[54];
#endif
    sp_digit* t1;
    sp_digit* t2;
    int err = MP_OKAY;

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    td = XMALLOC(sizeof(sp_digit) * 4 * 27, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    if (td != NULL) {
        t1 = td;
        t2 = td + 2 * 27;
    }
    else
        err = MEMORY_E;
#else
    t1 = t1d;
    t2 = t2d;
#endif

    (void)m;

    if (err == MP_OKAY) {
        div = d[26];
        XMEMCPY(t1, a, sizeof(*t1) * 2 * 27);
        for (i=26; i>=0; i--) {
            t1[27 + i] += t1[27 + i - 1] >> 57;
            t1[27 + i - 1] &= 0x1ffffffffffffffl;
            d1 = t1[27 + i];
            d1 <<= 57;
            d1 += t1[27 + i - 1];
            r1 = (sp_digit)(d1 / div);

            sp_3072_mul_d_27(t2, d, r1);
            sp_3072_sub_27(&t1[i], &t1[i], t2);
            t1[27 + i] -= t2[27];
            t1[27 + i] += t1[27 + i - 1] >> 57;
            t1[27 + i - 1] &= 0x1ffffffffffffffl;
            r1 = (((-t1[27 + i]) << 57) - t1[27 + i - 1]) / div;
            r1++;
            sp_3072_mul_d_27(t2, d, r1);
            sp_3072_add_27(&t1[i], &t1[i], t2);
            t1[27 + i] += t1[27 + i - 1] >> 57;
            t1[27 + i - 1] &= 0x1ffffffffffffffl;
        }
        t1[27 - 1] += t1[27 - 2] >> 57;
        t1[27 - 2] &= 0x1ffffffffffffffl;
        d1 = t1[27 - 1];
        r1 = (sp_digit)(d1 / div);

        sp_3072_mul_d_27(t2, d, r1);
        sp_3072_sub_27(t1, t1, t2);
        XMEMCPY(r, t1, sizeof(*r) * 2 * 27);
        for (i=0; i<25; i++) {
            r[i+1] += r[i] >> 57;
            r[i] &= 0x1ffffffffffffffl;
        }
        sp_3072_cond_add_27(r, r, d, 0 - (r[26] < 0));
    }

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

/* Compute r = a mod m by delegating to the 27-digit division routine.
 *
 * r  Receives the reduced value.
 * a  Value to reduce.
 * m  Modulus to reduce by.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_3072_mod_27(sp_digit* r, sp_digit* a, sp_digit* m)
{
    /* Only the remainder is wanted, so no quotient buffer is supplied. */
    sp_digit* const quotient = NULL;

    return sp_3072_div_27(a, m, quotient, r);
}

/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * One of three implementations is compiled in:
 *  - WOLFSSL_SP_SMALL: one exponent bit per iteration, with three
 *    heap-allocated temporaries.
 *  - WOLFSSL_SP_CACHE_RESISTANT: the same bit-at-a-time loop, with the
 *    temporaries on the stack unless WOLFSSL_SMALL_STACK is defined.
 *  - otherwise: a fixed 5-bit window over a table of 32 precomputed powers.
 *
 * In the bit-at-a-time variants every iteration performs one multiply and
 * one square; the buffer that is squared is selected through addr_mask
 * rather than by branching on the exponent bit.
 *
 * r        A single precision number that is the result of the operation.
 *          2 * 27 digits are written.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When non-zero a is reduced modulo m before use (2 * 27 digits of
 *          a are read by the reduction); when zero a is used as given.
 * returns MP_OKAY on success and MEMORY_E on dynamic memory allocation
 * failure.
 */
static int sp_3072_mod_exp_27(sp_digit* r, sp_digit* a, sp_digit* e, int bits,
    sp_digit* m, int reduceA)
{
#ifdef WOLFSSL_SP_SMALL
    sp_digit* td;
    sp_digit* t[3];
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

    td = (sp_digit*)XMALLOC(sizeof(*td) * 3 * 27 * 2, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        XMEMSET(td, 0, sizeof(*td) * 3 * 27 * 2);

        /* Three double-width temporaries carved from one allocation. */
        norm = t[0] = td;
        t[1] = &td[27 * 2];
        t[2] = &td[2 * 27 * 2];

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_27(norm, m);

        if (reduceA)
            err = sp_3072_mod_27(t[1], a, m);
        else
            XMEMCPY(t[1], a, sizeof(sp_digit) * 27);
    }
    if (err == MP_OKAY) {
        /* t[1] = (a * norm) mod m. */
        sp_3072_mul_27(t[1], t[1], norm);
        err = sp_3072_mod_27(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        /* Start at the digit holding the top exponent bit; c counts the
         * bits left in the current digit. */
        i = bits / 57;
        c = bits % 57;
        n = e[i--] << (57 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1)
                    break;

                n = e[i--];
                c = 57;
            }

            /* Next exponent bit, most significant first. */
            y = (n >> 56) & 1;
            n <<= 1;

            sp_3072_mont_mul_27(t[y^1], t[0], t[1], m, mp);

            /* Square t[y] via t[2]; the source/destination address is
             * formed with masks instead of an index dependent branch. */
            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                    sizeof(*t[2]) * 27 * 2);
            sp_3072_mont_sqr_27(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                    sizeof(*t[2]) * 27 * 2);
        }

        /* Final Montgomery reduction and conditional subtract of m. */
        sp_3072_mont_reduce_27(t[0], m, mp);
        n = sp_3072_cmp_27(t[0], m);
        sp_3072_cond_sub_27(t[0], t[0], m, (n < 0) - 1);
        XMEMCPY(r, t[0], sizeof(*r) * 27 * 2);

    }

    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);

    return err;
#elif defined(WOLFSSL_SP_CACHE_RESISTANT)
#ifndef WOLFSSL_SMALL_STACK
    sp_digit t[3][54];
#else
    sp_digit* td;
    sp_digit* t[3];
#endif
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    td = (sp_digit*)XMALLOC(sizeof(*td) * 3 * 27 * 2, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        t[0] = td;
        t[1] = &td[27 * 2];
        t[2] = &td[2 * 27 * 2];
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_27(norm, m);

        /* t[1] = (a * norm) mod m, reducing a first when requested. */
        if (reduceA) {
            err = sp_3072_mod_27(t[1], a, m);
            if (err == MP_OKAY) {
                sp_3072_mul_27(t[1], t[1], norm);
                err = sp_3072_mod_27(t[1], t[1], m);
            }
        }
        else {
            sp_3072_mul_27(t[1], a, norm);
            err = sp_3072_mod_27(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        i = bits / 57;
        c = bits % 57;
        n = e[i--] << (57 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1)
                    break;

                n = e[i--];
                c = 57;
            }

            y = (n >> 56) & 1;
            n <<= 1;

            sp_3072_mont_mul_27(t[y^1], t[0], t[1], m, mp);

            /* The copy length is spelled out in digits: with
             * WOLFSSL_SMALL_STACK t[] holds pointers, so sizeof(t[2]) would
             * be the size of a pointer rather than of a temporary. */
            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                    sizeof(sp_digit) * 27 * 2);
            sp_3072_mont_sqr_27(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                    sizeof(sp_digit) * 27 * 2);
        }

        sp_3072_mont_reduce_27(t[0], m, mp);
        n = sp_3072_cmp_27(t[0], m);
        sp_3072_cond_sub_27(t[0], t[0], m, (n < 0) - 1);
        XMEMCPY(r, t[0], sizeof(sp_digit) * 27 * 2);
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#else
#ifndef WOLFSSL_SMALL_STACK
    sp_digit t[32][54];
#else
    sp_digit* t[32];
    sp_digit* td;
#endif
    sp_digit* norm;
    sp_digit rt[54];
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 54, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        for (i=0; i<32; i++)
            t[i] = td + i * 54;
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_27(norm, m);

        /* t[1] = (a * norm) mod m, reducing a first when requested. */
        if (reduceA) {
            err = sp_3072_mod_27(t[1], a, m);
            if (err == MP_OKAY) {
                sp_3072_mul_27(t[1], t[1], norm);
                err = sp_3072_mod_27(t[1], t[1], m);
            }
        }
        else {
            sp_3072_mul_27(t[1], a, norm);
            err = sp_3072_mod_27(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Fill the table entries 2..31 in index order:
         * t[2k] = t[k]^2 and t[2k+1] = t[k+1] * t[k]. */
        for (i = 1; i < 16; i++) {
            sp_3072_mont_sqr_27(t[2 * i], t[i], m, mp);
            sp_3072_mont_mul_27(t[2 * i + 1], t[i + 1], t[i], m, mp);
        }

        /* Round the bit count up to a whole number of 5-bit windows and
         * left-align the top exponent digit in the 64-bit word n. */
        bits = ((bits + 4) / 5) * 5;
        i = ((bits + 56) / 57) - 1;
        c = bits % 57;
        if (c == 0)
            c = 57;
        if (i < 27)
            n = e[i--] << (64 - c);
        else {
            n = 0;
            i--;
        }
        if (c < 5) {
            n |= e[i--] << (7 - c);
            c += 57;
        }
        /* Mask the first window as the loop below does: sp_digit is signed,
         * so an unmasked shift of a value with the top bit set would give a
         * negative table index. */
        y = (n >> 59) & 0x1f;
        n <<= 5;
        c -= 5;
        XMEMCPY(rt, t[y], sizeof(rt));
        for (; i>=0 || c>=5; ) {
            if (c < 5) {
                /* Refill n with the next exponent digit below the bits
                 * still pending. */
                n |= e[i--] << (7 - c);
                c += 57;
            }
            y = (n >> 59) & 0x1f;
            n <<= 5;
            c -= 5;

            /* Five squarings, then multiply by the table entry. */
            sp_3072_mont_sqr_27(rt, rt, m, mp);
            sp_3072_mont_sqr_27(rt, rt, m, mp);
            sp_3072_mont_sqr_27(rt, rt, m, mp);
            sp_3072_mont_sqr_27(rt, rt, m, mp);
            sp_3072_mont_sqr_27(rt, rt, m, mp);

            sp_3072_mont_mul_27(rt, rt, t[y], m, mp);
        }

        sp_3072_mont_reduce_27(rt, m, mp);
        n = sp_3072_cmp_27(rt, m);
        sp_3072_cond_sub_27(rt, rt, m, (n < 0) - 1);
        XMEMCPY(r, rt, sizeof(rt));
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#endif
}

#endif /* !SP_RSA_PRIVATE_EXP_D && WOLFSSL_HAVE_SP_RSA */

/* r = 2^n mod m where n is the number of bits to reduce by.
 * Given m must be 3072 bits, just need to subtract.
 *
 * r  A single precision number (54 digits are written).
 * m  A single precision number: the modulus.
 */
static void sp_3072_mont_norm_54(sp_digit* r, sp_digit* m)
{
    int k;

    /* Set r = 2^n - 1: fifty-three full 57-bit digits plus a 51-bit top
     * digit (53 * 57 + 51 = 3072). */
#ifdef WOLFSSL_SP_SMALL
    for (k = 0; k < 53; k++) {
        r[k] = 0x1ffffffffffffffl;
    }
#else
    for (k = 0; k < 48; k += 8) {
        sp_digit* out = &r[k];

        out[0] = 0x1ffffffffffffffl;
        out[1] = 0x1ffffffffffffffl;
        out[2] = 0x1ffffffffffffffl;
        out[3] = 0x1ffffffffffffffl;
        out[4] = 0x1ffffffffffffffl;
        out[5] = 0x1ffffffffffffffl;
        out[6] = 0x1ffffffffffffffl;
        out[7] = 0x1ffffffffffffffl;
    }
    for (k = 48; k < 53; k++) {
        r[k] = 0x1ffffffffffffffl;
    }
#endif
    r[53] = 0x7ffffffffffffl;

    /* r = (2^n - 1) mod m */
    sp_3072_sub_54(r, r, m);

    /* Add one so r = 2^n mod m */
    r[0] += 1;
}

/* Compare two 54-digit numbers without branching on the data.
 *
 * Digits are visited from most to least significant; the first non-zero
 * difference is latched and later differences are masked out.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static sp_digit sp_3072_cmp_54(const sp_digit* a, const sp_digit* b)
{
    sp_digit r = 0;
    sp_digit diff;
    int top;

#ifdef WOLFSSL_SP_SMALL
    for (top = 53; top >= 0; top--) {
        diff = a[top] - b[top];
        r |= diff & (0 - !r);
    }
#else
    /* Leading digits 53..48 that do not fill a group of eight. */
    for (top = 53; top >= 48; top--) {
        diff = a[top] - b[top];
        r |= diff & (0 - !r);
    }
    /* Digits 47..0 in unrolled groups of eight, highest first. */
    for (top = 47; top > 0; top -= 8) {
        diff = a[top - 0] - b[top - 0]; r |= diff & (0 - !r);
        diff = a[top - 1] - b[top - 1]; r |= diff & (0 - !r);
        diff = a[top - 2] - b[top - 2]; r |= diff & (0 - !r);
        diff = a[top - 3] - b[top - 3]; r |= diff & (0 - !r);
        diff = a[top - 4] - b[top - 4]; r |= diff & (0 - !r);
        diff = a[top - 5] - b[top - 5]; r |= diff & (0 - !r);
        diff = a[top - 6] - b[top - 6]; r |= diff & (0 - !r);
        diff = a[top - 7] - b[top - 7]; r |= diff & (0 - !r);
    }
#endif /* WOLFSSL_SP_SMALL */

    return r;
}

/* Masked subtraction of two 54-digit numbers: r = a - (b & m), digit by
 * digit. No borrows are propagated between digits.
 *
 * r  Receives the 54 result digits.
 * a  Number to subtract from (54 digits).
 * b  Number to subtract (54 digits); each digit is ANDed with m first.
 * m  Mask: -1 (all bits set) to subtract b, 0 to leave a unchanged.
 */
static void sp_3072_cond_sub_54(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int k;

#ifdef WOLFSSL_SP_SMALL
    for (k = 0; k < 54; k++) {
        r[k] = a[k] - (b[k] & m);
    }
#else
    /* Digits 0..47 in six unrolled groups of eight. */
    for (k = 0; k < 48; k += 8) {
        sp_digit* out = &r[k];
        const sp_digit* lhs = &a[k];
        const sp_digit* rhs = &b[k];

        out[0] = lhs[0] - (rhs[0] & m);
        out[1] = lhs[1] - (rhs[1] & m);
        out[2] = lhs[2] - (rhs[2] & m);
        out[3] = lhs[3] - (rhs[3] & m);
        out[4] = lhs[4] - (rhs[4] & m);
        out[5] = lhs[5] - (rhs[5] & m);
        out[6] = lhs[6] - (rhs[6] & m);
        out[7] = lhs[7] - (rhs[7] & m);
    }
    /* Remaining digits 48..53. */
    for (k = 48; k < 54; k++) {
        r[k] = a[k] - (b[k] & m);
    }
#endif /* WOLFSSL_SP_SMALL */
}

/* Multiply a by the scalar b and accumulate into r. (r += a * b)
 *
 * Each 128-bit product is split at bit 57: the low part is added to the
 * digit at the same index and the high part to the next digit.
 *
 * r  A single precision integer; digits 0..54 are updated.
 * a  A single precision integer (54 digits).
 * b  A scalar.
 */
static void sp_3072_mul_add_54(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
#ifdef WOLFSSL_SP_SMALL
    int128_t scalar = b;
    int128_t acc = 0;
    int k;

    for (k = 0; k < 54; k++) {
        acc += (scalar * a[k]) + r[k];
        r[k] = acc & 0x1ffffffffffffffl;
        acc >>= 57;
    }
    r[54] += acc;
#else
    int128_t scalar = b;
    int128_t even;    /* Product that currently supplies the high part. */
    int128_t odd;     /* Product computed alternately with 'even'. */
    int k;

    even = scalar * a[0];
    r[0] += even & 0x1ffffffffffffffl;
    for (k = 0; k < 48; k += 8) {
        odd = scalar * a[k+1];
        r[k+1] += (even >> 57) + (odd & 0x1ffffffffffffffl);
        even = scalar * a[k+2];
        r[k+2] += (odd >> 57) + (even & 0x1ffffffffffffffl);
        odd = scalar * a[k+3];
        r[k+3] += (even >> 57) + (odd & 0x1ffffffffffffffl);
        even = scalar * a[k+4];
        r[k+4] += (odd >> 57) + (even & 0x1ffffffffffffffl);
        odd = scalar * a[k+5];
        r[k+5] += (even >> 57) + (odd & 0x1ffffffffffffffl);
        even = scalar * a[k+6];
        r[k+6] += (odd >> 57) + (even & 0x1ffffffffffffffl);
        odd = scalar * a[k+7];
        r[k+7] += (even >> 57) + (odd & 0x1ffffffffffffffl);
        even = scalar * a[k+8];
        r[k+8] += (odd >> 57) + (even & 0x1ffffffffffffffl);
    }
    /* Digits 49..53, then the final high part into digit 54. */
    odd = scalar * a[49];
    r[49] += (even >> 57) + (odd & 0x1ffffffffffffffl);
    even = scalar * a[50];
    r[50] += (odd >> 57) + (even & 0x1ffffffffffffffl);
    odd = scalar * a[51];
    r[51] += (even >> 57) + (odd & 0x1ffffffffffffffl);
    even = scalar * a[52];
    r[52] += (odd >> 57) + (even & 0x1ffffffffffffffl);
    odd = scalar * a[53];
    r[53] += (even >> 57) + (odd & 0x1ffffffffffffffl);
    r[54] +=  odd >> 57;
#endif /* WOLFSSL_SP_SMALL */
}

/* Normalize the values in each word to 57 bits.
 *
 * For digits 0..52 the bits above bit 56 are carried into the next digit
 * and the digit is masked to 57 bits. The top digit a[53] only receives the
 * carry; it is not masked.
 *
 * a  Array of sp_digit to normalize (54 digits).
 */
static void sp_3072_norm_54(sp_digit* a)
{
    int i;

#ifdef WOLFSSL_SP_SMALL
    for (i = 0; i < 53; i++) {
        a[i+1] += a[i] >> 57;
        a[i] &= 0x1ffffffffffffffl;
    }
#else
    /* Eight carry steps per iteration. Digit i+8 is handled as the first
     * step of the next iteration (or by the tail below), so it is not
     * processed again here. */
    for (i = 0; i < 48; i += 8) {
        a[i+1] += a[i+0] >> 57; a[i+0] &= 0x1ffffffffffffffl;
        a[i+2] += a[i+1] >> 57; a[i+1] &= 0x1ffffffffffffffl;
        a[i+3] += a[i+2] >> 57; a[i+2] &= 0x1ffffffffffffffl;
        a[i+4] += a[i+3] >> 57; a[i+3] &= 0x1ffffffffffffffl;
        a[i+5] += a[i+4] >> 57; a[i+4] &= 0x1ffffffffffffffl;
        a[i+6] += a[i+5] >> 57; a[i+5] &= 0x1ffffffffffffffl;
        a[i+7] += a[i+6] >> 57; a[i+6] &= 0x1ffffffffffffffl;
        a[i+8] += a[i+7] >> 57; a[i+7] &= 0x1ffffffffffffffl;
    }
    /* Tail: digits 48..52 carry into 49..53. */
    for (i = 48; i < 53; i++) {
        a[i+1] += a[i] >> 57;
        a[i] &= 0x1ffffffffffffffl;
    }
#endif
}

/* Shift the result in the high 3072 bits down to the bottom.
 *
 * The value starting at bit 51 of a[53] is repacked into 57-bit digits
 * r[0..53]; r[54..107] are then cleared.
 *
 * r  A single precision number (108 digits are written).
 * a  A single precision number (digits 53..107 are read).
 */
static void sp_3072_mont_shift_54(sp_digit* r, const sp_digit* a)
{
    int k;
    /* Digits above a[54], consumed one per output digit. */
    const sp_digit* src = a + 55;
    int128_t acc = a[53] >> 51;

    acc += ((int128_t)a[54]) << 6;
#ifdef WOLFSSL_SP_SMALL
    for (k = 0; k < 53; k++) {
        r[k] = acc & 0x1ffffffffffffffl;
        acc >>= 57;
        acc += ((int128_t)src[k]) << 6;
    }
#else
    for (k = 0; k < 48; k += 8) {
        r[k + 0] = acc & 0x1ffffffffffffffl;
        acc >>= 57; acc += ((int128_t)src[k + 0]) << 6;
        r[k + 1] = acc & 0x1ffffffffffffffl;
        acc >>= 57; acc += ((int128_t)src[k + 1]) << 6;
        r[k + 2] = acc & 0x1ffffffffffffffl;
        acc >>= 57; acc += ((int128_t)src[k + 2]) << 6;
        r[k + 3] = acc & 0x1ffffffffffffffl;
        acc >>= 57; acc += ((int128_t)src[k + 3]) << 6;
        r[k + 4] = acc & 0x1ffffffffffffffl;
        acc >>= 57; acc += ((int128_t)src[k + 4]) << 6;
        r[k + 5] = acc & 0x1ffffffffffffffl;
        acc >>= 57; acc += ((int128_t)src[k + 5]) << 6;
        r[k + 6] = acc & 0x1ffffffffffffffl;
        acc >>= 57; acc += ((int128_t)src[k + 6]) << 6;
        r[k + 7] = acc & 0x1ffffffffffffffl;
        acc >>= 57; acc += ((int128_t)src[k + 7]) << 6;
    }
    /* Output digits 48..52 consume a[103]..a[107]. */
    for (k = 48; k < 53; k++) {
        r[k] = acc & 0x1ffffffffffffffl;
        acc >>= 57;
        acc += ((int128_t)src[k]) << 6;
    }
#endif /* WOLFSSL_SP_SMALL */
    r[53] = (sp_digit)acc;
    XMEMSET(&r[54], 0, sizeof(*r) * 54);
}

/* Reduce the number back to 3072 bits using Montgomery reduction.
 *
 * For each of the 54 low digits a multiple of m is added at that digit
 * position and the carry is moved up; the top digit uses a 51-bit mask.
 * The result is then shifted down, m is conditionally subtracted and the
 * digits are normalized.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
static void sp_3072_mont_reduce_54(sp_digit* a, sp_digit* m, sp_digit mp)
{
    int k = 0;
    sp_digit q;

    if (mp == 1) {
        /* Multiplying by 1 is skipped. */
        while (k < 53) {
            q = a[k] & 0x1ffffffffffffffl;
            sp_3072_mul_add_54(&a[k], m, q);
            a[k + 1] += a[k] >> 57;
            k++;
        }
        q = a[k] & 0x7ffffffffffffl;
    }
    else {
        while (k < 53) {
            q = (a[k] * mp) & 0x1ffffffffffffffl;
            sp_3072_mul_add_54(&a[k], m, q);
            a[k + 1] += a[k] >> 57;
            k++;
        }
        q = (a[k] * mp) & 0x7ffffffffffffl;
    }

    /* Top digit (k == 53): same step, and the digit itself is masked. */
    sp_3072_mul_add_54(&a[k], m, q);
    a[k + 1] += a[k] >> 57;
    a[k] &= 0x1ffffffffffffffl;

    sp_3072_mont_shift_54(a, a);
    /* Subtract m when bits above the 51-bit top digit are set. */
    sp_3072_cond_sub_54(a, a, m, 0 - ((a[53] >> 51) > 0));
    sp_3072_norm_54(a);
}

/* Multiply two Montgomery form numbers mod the modulus.
 * (r = a * b mod m)
 *
 * The full product is written to r and then Montgomery-reduced in place.
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus.
 * mp  Montgomery multiplier.
 */
static void sp_3072_mont_mul_54(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit* m, sp_digit mp)
{
    /* Step 1: plain multiplication into r. */
    sp_3072_mul_54(r, a, b);

    /* Step 2: bring the product back into range. */
    sp_3072_mont_reduce_54(r, m, mp);
}

/* Square a Montgomery form number mod the modulus. (r = a * a mod m)
 *
 * The full square is written to r and then Montgomery-reduced in place.
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus.
 * mp  Montgomery multiplier.
 */
static void sp_3072_mont_sqr_54(sp_digit* r, sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    /* Step 1: plain squaring into r. */
    sp_3072_sqr_54(r, a);

    /* Step 2: bring the square back into range. */
    sp_3072_mont_reduce_54(r, m, mp);
}

/* Multiply a by the scalar b into r. (r = a * b)
 *
 * Each 128-bit product is split at bit 57: the low part goes to the digit
 * at the same index and the high part to the next digit.
 *
 * r  A single precision integer; 55 digits are written.
 * a  A single precision integer (54 digits).
 * b  A scalar.
 */
static void sp_3072_mul_d_54(sp_digit* r, const sp_digit* a, const sp_digit b)
{
#ifdef WOLFSSL_SP_SMALL
    int128_t scalar = b;
    int128_t acc = 0;
    int k;

    for (k = 0; k < 54; k++) {
        acc += scalar * a[k];
        r[k] = acc & 0x1ffffffffffffffl;
        acc >>= 57;
    }
    r[54] = (sp_digit)acc;
#else
    int128_t scalar = b;
    int128_t even;    /* Product that currently supplies the high part. */
    int128_t odd;     /* Product computed alternately with 'even'. */
    int k;

    even = scalar * a[0];
    r[0] = even & 0x1ffffffffffffffl;
    for (k = 0; k < 48; k += 8) {
        odd = scalar * a[k+1];
        r[k+1] = (sp_digit)(even >> 57) + (odd & 0x1ffffffffffffffl);
        even = scalar * a[k+2];
        r[k+2] = (sp_digit)(odd >> 57) + (even & 0x1ffffffffffffffl);
        odd = scalar * a[k+3];
        r[k+3] = (sp_digit)(even >> 57) + (odd & 0x1ffffffffffffffl);
        even = scalar * a[k+4];
        r[k+4] = (sp_digit)(odd >> 57) + (even & 0x1ffffffffffffffl);
        odd = scalar * a[k+5];
        r[k+5] = (sp_digit)(even >> 57) + (odd & 0x1ffffffffffffffl);
        even = scalar * a[k+6];
        r[k+6] = (sp_digit)(odd >> 57) + (even & 0x1ffffffffffffffl);
        odd = scalar * a[k+7];
        r[k+7] = (sp_digit)(even >> 57) + (odd & 0x1ffffffffffffffl);
        even = scalar * a[k+8];
        r[k+8] = (sp_digit)(odd >> 57) + (even & 0x1ffffffffffffffl);
    }
    /* Digits 49..53, then the final high part into digit 54. */
    odd = scalar * a[49];
    r[49] = (sp_digit)(even >> 57) + (odd & 0x1ffffffffffffffl);
    even = scalar * a[50];
    r[50] = (sp_digit)(odd >> 57) + (even & 0x1ffffffffffffffl);
    odd = scalar * a[51];
    r[51] = (sp_digit)(even >> 57) + (odd & 0x1ffffffffffffffl);
    even = scalar * a[52];
    r[52] = (sp_digit)(odd >> 57) + (even & 0x1ffffffffffffffl);
    odd = scalar * a[53];
    r[53] = (sp_digit)(even >> 57) + (odd & 0x1ffffffffffffffl);
    r[54] =  (sp_digit)(odd >> 57);
#endif /* WOLFSSL_SP_SMALL */
}

/* Masked addition of two 54-digit numbers: r = a + (b & m), digit by digit.
 * No carries are propagated between digits.
 *
 * r  Receives the 54 result digits.
 * a  First operand (54 digits).
 * b  Second operand (54 digits); each digit is ANDed with m before adding.
 * m  Mask: -1 (all bits set) to add b, 0 to leave a unchanged.
 */
static void sp_3072_cond_add_54(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int k;

#ifdef WOLFSSL_SP_SMALL
    for (k = 0; k < 54; k++) {
        r[k] = a[k] + (b[k] & m);
    }
#else
    /* Digits 0..47 in six unrolled groups of eight. */
    for (k = 0; k < 48; k += 8) {
        sp_digit* out = &r[k];
        const sp_digit* lhs = &a[k];
        const sp_digit* rhs = &b[k];

        out[0] = lhs[0] + (rhs[0] & m);
        out[1] = lhs[1] + (rhs[1] & m);
        out[2] = lhs[2] + (rhs[2] & m);
        out[3] = lhs[3] + (rhs[3] & m);
        out[4] = lhs[4] + (rhs[4] & m);
        out[5] = lhs[5] + (rhs[5] & m);
        out[6] = lhs[6] + (rhs[6] & m);
        out[7] = lhs[7] + (rhs[7] & m);
    }
    /* Remaining digits 48..53. */
    for (k = 48; k < 54; k++) {
        r[k] = a[k] + (b[k] & m);
    }
#endif /* WOLFSSL_SP_SMALL */
}

/* Divide d in a and put remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * The dividend is copied into a temporary and reduced in place, one digit
 * position at a time from the top, using the most significant digit of d as
 * the trial divisor.
 *
 * a  Number to be divided. 2 * 54 digits are read.
 * d  Number to divide with. d[53] is used as the trial divisor.
 * m  Multiplier result. Unused.
 * r  Remainder from the division. 2 * 54 digits are written.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_3072_div_54(sp_digit* a, sp_digit* d, sp_digit* m,
        sp_digit* r)
{
    int i;
    int128_t d1;
    sp_digit div, r1;
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    sp_digit* td;
#else
    sp_digit t1d[108], t2d[108];
#endif
    sp_digit* t1;
    sp_digit* t2;
    int err = MP_OKAY;

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    /* One heap buffer is split in two: t1 holds the working copy of the
     * dividend, t2 holds the scaled divisor. */
    td = XMALLOC(sizeof(sp_digit) * 4 * 54, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    if (td != NULL) {
        t1 = td;
        t2 = td + 2 * 54;
    }
    else
        err = MEMORY_E;
#else
    t1 = t1d;
    t2 = t2d;
#endif

    /* The quotient is not produced. */
    (void)m;

    if (err == MP_OKAY) {
        /* Top digit of the divisor is used for every quotient estimate. */
        div = d[53];
        XMEMCPY(t1, a, sizeof(*t1) * 2 * 54);
        for (i=53; i>=0; i--) {
            /* Carry into the top digit of the current window and build a
             * two-digit value from the top two digits. */
            t1[54 + i] += t1[54 + i - 1] >> 57;
            t1[54 + i - 1] &= 0x1ffffffffffffffl;
            d1 = t1[54 + i];
            d1 <<= 57;
            d1 += t1[54 + i - 1];
            /* Estimate the quotient digit. */
            r1 = (sp_digit)(d1 / div);

            /* Subtract d * r1 from the window starting at digit i; the
             * 55th digit of the product is applied to the window's top. */
            sp_3072_mul_d_54(t2, d, r1);
            sp_3072_sub_54(&t1[i], &t1[i], t2);
            t1[54 + i] -= t2[54];
            t1[54 + i] += t1[54 + i - 1] >> 57;
            t1[54 + i - 1] &= 0x1ffffffffffffffl;
            /* Second estimate, from the negated top two digits, plus one;
             * d * r1 is then added back to the window. */
            r1 = (((-t1[54 + i]) << 57) - t1[54 + i - 1]) / div;
            r1++;
            sp_3072_mul_d_54(t2, d, r1);
            sp_3072_add_54(&t1[i], &t1[i], t2);
            t1[54 + i] += t1[54 + i - 1] >> 57;
            t1[54 + i - 1] &= 0x1ffffffffffffffl;
        }
        /* Final step on the low 54 digits using only the top digit. */
        t1[54 - 1] += t1[54 - 2] >> 57;
        t1[54 - 2] &= 0x1ffffffffffffffl;
        d1 = t1[54 - 1];
        r1 = (sp_digit)(d1 / div);

        sp_3072_mul_d_54(t2, d, r1);
        sp_3072_sub_54(t1, t1, t2);
        XMEMCPY(r, t1, sizeof(*r) * 2 * 54);
        /* Propagate carries through the low digits of the remainder.
         * NOTE(review): the loop stops at i < 52, so r[52] is never carried
         * into r[53] before the sign test below - confirm this is intended. */
        for (i=0; i<52; i++) {
            r[i+1] += r[i] >> 57;
            r[i] &= 0x1ffffffffffffffl;
        }
        /* Add d back when the top digit of the remainder is negative. */
        sp_3072_cond_add_54(r, r, d, 0 - (r[53] < 0));
    }

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

/* Compute r = a mod m by delegating to the 54-digit division routine.
 *
 * r  Receives the reduced value.
 * a  Value to reduce.
 * m  Modulus to reduce by.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_3072_mod_54(sp_digit* r, sp_digit* a, sp_digit* m)
{
    /* Only the remainder is wanted, so no quotient buffer is supplied. */
    sp_digit* const quotient = NULL;

    return sp_3072_div_54(a, m, quotient, r);
}

#if defined(SP_RSA_PRIVATE_EXP_D) || defined(WOLFSSL_HAVE_SP_DH)
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * r     A single precision number that is the result of the operation.
 * a     A single precision number being exponentiated.
 * e     A single precision number that is the exponent.
 * bits  The number of bits in the exponent.
 * m     A single precision number that is the modulus.
 * returns 0 on success and MEMORY_E on dynamic memory allocation failure.
 */
static int sp_3072_mod_exp_54(sp_digit* r, sp_digit* a, sp_digit* e, int bits,
    sp_digit* m, int reduceA)
{
#ifdef WOLFSSL_SP_SMALL
    sp_digit* td;
    sp_digit* t[3];
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

    td = (sp_digit*)XMALLOC(sizeof(*td) * 3 * 54 * 2, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        XMEMSET(td, 0, sizeof(*td) * 3 * 54 * 2);

        norm = t[0] = td;
        t[1] = &td[54 * 2];
        t[2] = &td[2 * 54 * 2];

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_54(norm, m);

        if (reduceA)
            err = sp_3072_mod_54(t[1], a, m);
        else
            XMEMCPY(t[1], a, sizeof(sp_digit) * 54);
    }
    if (err == MP_OKAY) {
        sp_3072_mul_54(t[1], t[1], norm);
        err = sp_3072_mod_54(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        i = bits / 57;
        c = bits % 57;
        n = e[i--] << (57 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1)
                    break;

                n = e[i--];
                c = 57;
            }

            y = (n >> 56) & 1;
            n <<= 1;

            sp_3072_mont_mul_54(t[y^1], t[0], t[1], m, mp);

            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                    sizeof(*t[2]) * 54 * 2);
            sp_3072_mont_sqr_54(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                    sizeof(*t[2]) * 54 * 2);
        }

        sp_3072_mont_reduce_54(t[0], m, mp);
        n = sp_3072_cmp_54(t[0], m);
        sp_3072_cond_sub_54(t[0], t[0], m, (n < 0) - 1);
        XMEMCPY(r, t[0], sizeof(*r) * 54 * 2);

    }

    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);

    return err;
#elif defined(WOLFSSL_SP_CACHE_RESISTANT)
#ifndef WOLFSSL_SMALL_STACK
    sp_digit t[3][108];
#else
    sp_digit* td;
    sp_digit* t[3];
#endif
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    td = (sp_digit*)XMALLOC(sizeof(*td) * 3 * 54 * 2, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        t[0] = td;
        t[1] = &td[54 * 2];
        t[2] = &td[2 * 54 * 2];
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_54(norm, m);

        if (reduceA) {
            err = sp_3072_mod_54(t[1], a, m);
            if (err == MP_OKAY) {
                sp_3072_mul_54(t[1], t[1], norm);
                err = sp_3072_mod_54(t[1], t[1], m);
            }
        }
        else {
            sp_3072_mul_54(t[1], a, norm);
            err = sp_3072_mod_54(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        i = bits / 57;
        c = bits % 57;
        n = e[i--] << (57 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1)
                    break;

                n = e[i--];
                c = 57;
            }

            y = (n >> 56) & 1;
            n <<= 1;

            sp_3072_mont_mul_54(t[y^1], t[0], t[1], m, mp);

            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                 ((size_t)t[1] & addr_mask[y])), sizeof(t[2]));
            sp_3072_mont_sqr_54(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                           ((size_t)t[1] & addr_mask[y])), t[2], sizeof(t[2]));
        }

        sp_3072_mont_reduce_54(t[0], m, mp);
        n = sp_3072_cmp_54(t[0], m);
        sp_3072_cond_sub_54(t[0], t[0], m, (n < 0) - 1);
        XMEMCPY(r, t[0], sizeof(t[0]));
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#else
#ifndef WOLFSSL_SMALL_STACK
    sp_digit t[32][108];
#else
    sp_digit* t[32];
    sp_digit* td;
#endif
    sp_digit* norm;
    sp_digit rt[108];
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 108, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        for (i=0; i<32; i++)
            t[i] = td + i * 108;
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_54(norm, m);

        if (reduceA) {
            err = sp_3072_mod_54(t[1], a, m);
            if (err == MP_OKAY) {
                sp_3072_mul_54(t[1], t[1], norm);
                err = sp_3072_mod_54(t[1], t[1], m);
            }
        }
        else {
            sp_3072_mul_54(t[1], a, norm);
            err = sp_3072_mod_54(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        sp_3072_mont_sqr_54(t[ 2], t[ 1], m, mp);
        sp_3072_mont_mul_54(t[ 3], t[ 2], t[ 1], m, mp);
        sp_3072_mont_sqr_54(t[ 4], t[ 2], m, mp);
        sp_3072_mont_mul_54(t[ 5], t[ 3], t[ 2], m, mp);
        sp_3072_mont_sqr_54(t[ 6], t[ 3], m, mp);
        sp_3072_mont_mul_54(t[ 7], t[ 4], t[ 3], m, mp);
        sp_3072_mont_sqr_54(t[ 8], t[ 4], m, mp);
        sp_3072_mont_mul_54(t[ 9], t[ 5], t[ 4], m, mp);
        sp_3072_mont_sqr_54(t[10], t[ 5], m, mp);
        sp_3072_mont_mul_54(t[11], t[ 6], t[ 5], m, mp);
        sp_3072_mont_sqr_54(t[12], t[ 6], m, mp);
        sp_3072_mont_mul_54(t[13], t[ 7], t[ 6], m, mp);
        sp_3072_mont_sqr_54(t[14], t[ 7], m, mp);
        sp_3072_mont_mul_54(t[15], t[ 8], t[ 7], m, mp);
        sp_3072_mont_sqr_54(t[16], t[ 8], m, mp);
        sp_3072_mont_mul_54(t[17], t[ 9], t[ 8], m, mp);
        sp_3072_mont_sqr_54(t[18], t[ 9], m, mp);
        sp_3072_mont_mul_54(t[19], t[10], t[ 9], m, mp);
        sp_3072_mont_sqr_54(t[20], t[10], m, mp);
        sp_3072_mont_mul_54(t[21], t[11], t[10], m, mp);
        sp_3072_mont_sqr_54(t[22], t[11], m, mp);
        sp_3072_mont_mul_54(t[23], t[12], t[11], m, mp);
        sp_3072_mont_sqr_54(t[24], t[12], m, mp);
        sp_3072_mont_mul_54(t[25], t[13], t[12], m, mp);
        sp_3072_mont_sqr_54(t[26], t[13], m, mp);
        sp_3072_mont_mul_54(t[27], t[14], t[13], m, mp);
        sp_3072_mont_sqr_54(t[28], t[14], m, mp);
        sp_3072_mont_mul_54(t[29], t[15], t[14], m, mp);
        sp_3072_mont_sqr_54(t[30], t[15], m, mp);
        sp_3072_mont_mul_54(t[31], t[16], t[15], m, mp);

        bits = ((bits + 4) / 5) * 5;
        i = ((bits + 56) / 57) - 1;
        c = bits % 57;
        if (c == 0)
            c = 57;
        if (i < 54)
            n = e[i--] << (64 - c);
        else {
            n = 0;
            i--;
        }
        if (c < 5) {
            n |= e[i--] << (7 - c);
            c += 57;
        }
        y = n >> 59;
        n <<= 5;
        c -= 5;
        XMEMCPY(rt, t[y], sizeof(rt));
        for (; i>=0 || c>=5; ) {
            if (c < 5) {
                n |= e[i--] << (7 - c);
                c += 57;
            }
            y = (n >> 59) & 0x1f;
            n <<= 5;
            c -= 5;

            sp_3072_mont_sqr_54(rt, rt, m, mp);
            sp_3072_mont_sqr_54(rt, rt, m, mp);
            sp_3072_mont_sqr_54(rt, rt, m, mp);
            sp_3072_mont_sqr_54(rt, rt, m, mp);
            sp_3072_mont_sqr_54(rt, rt, m, mp);

            sp_3072_mont_mul_54(rt, rt, t[y], m, mp);
        }

        sp_3072_mont_reduce_54(rt, m, mp);
        n = sp_3072_cmp_54(rt, m);
        sp_3072_cond_sub_54(rt, rt, m, (n < 0) - 1);
        XMEMCPY(r, rt, sizeof(rt));
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#endif
}
#endif /* SP_RSA_PRIVATE_EXP_D || WOLFSSL_HAVE_SP_DH */

#if defined(WOLFSSL_HAVE_SP_RSA) && !defined(SP_RSA_PRIVATE_EXP_D)
/* Mask every digit of a 27 digit number: r[i] = a[i] & m.
 *
 * r  Result. A single precision integer of 27 digits.
 * a  Input. A single precision integer of 27 digits.
 * m  Mask that is ANDed with each digit of a.
 */
static void sp_3072_mask_27(sp_digit* r, sp_digit* a, sp_digit m)
{
    int idx = 0;

#ifndef WOLFSSL_SP_SMALL
    /* Unrolled by eight: handles digits 0..23 in three passes. */
    while (idx < 24) {
        sp_digit* dst = r + idx;
        sp_digit* src = a + idx;

        dst[0] = src[0] & m;
        dst[1] = src[1] & m;
        dst[2] = src[2] & m;
        dst[3] = src[3] & m;
        dst[4] = src[4] & m;
        dst[5] = src[5] & m;
        dst[6] = src[6] & m;
        dst[7] = src[7] & m;

        idx += 8;
    }
#endif

    /* Small build: all 27 digits. Otherwise: the trailing digits 24..26. */
    for (; idx < 27; idx++)
        r[idx] = a[idx] & m;
}

#endif
#ifdef WOLFSSL_HAVE_SP_RSA
/* RSA public key operation.
 *
 * Computes in ^ em mod mm and writes the result as 384 big-endian bytes.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base. At most 384.
 * em      Public exponent. Must be non-zero and no longer than 57 bits.
 * mm      Modulus. Must be exactly 3072 bits long.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 384 bytes long.
 * outLen  On entry, number of bytes available in out.
 *         On success, set to 384.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * the exponent or input is too long or the modulus is not 3072 bits,
 * MP_EXPTMOD_E when the exponent is zero and MEMORY_E when dynamic memory
 * allocation fails.
 */
int sp_RsaPublic_3072(const byte* in, word32 inLen, mp_int* em, mp_int* mm,
    byte* out, word32* outLen)
{
#ifdef WOLFSSL_SP_SMALL
    /* Must start as NULL: the cleanup at the end tests this pointer even when
     * the parameter checks fail before any allocation has been attempted. */
    sp_digit* d = NULL;
    sp_digit* a;
    sp_digit* m;
    sp_digit* r;
    sp_digit* norm;
    sp_digit e[1];
    sp_digit mp;
    int i;
    int err = MP_OKAY;

    if (*outLen < 384)
        err = MP_TO_E;
    if (err == MP_OKAY && (mp_count_bits(em) > 57 || inLen > 384 ||
                                                     mp_count_bits(mm) != 3072))
        err = MP_READ_E;

    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 54 * 5, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Buffer layout: a[108] | r[108] | m[54]. norm shares r's space. */
        a = d;
        r = a + 54 * 2;
        m = r + 54 * 2;
        norm = r;

        sp_3072_from_bin(a, 54, in, inLen);
        /* Exponent is at most 57 bits so it fits in a single sp_digit. */
#if DIGIT_BIT >= 57
        e[0] = em->dp[0];
#else
        e[0] = em->dp[0];
        if (em->used > 1)
            e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT;
#endif
        if (e[0] == 0)
            err = MP_EXPTMOD_E;
    }

    if (err == MP_OKAY) {
        sp_3072_from_mp(m, 54, mm);

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_54(norm, m);
    }
    if (err == MP_OKAY) {
        /* Convert the base to Montgomery form: a = a * norm mod m. */
        sp_3072_mul_54(a, a, norm);
        err = sp_3072_mod_54(a, a, m);
    }
    if (err == MP_OKAY) {
        /* Find the index of the top set bit of the exponent. */
        for (i=56; i>=0; i--)
            if (e[0] >> i)
                break;

        /* Left-to-right square and multiply over the remaining bits. */
        XMEMCPY(r, a, sizeof(sp_digit) * 54 * 2);
        for (i--; i>=0; i--) {
            sp_3072_mont_sqr_54(r, r, m, mp);

            if (((e[0] >> i) & 1) == 1)
                sp_3072_mont_mul_54(r, r, a, m, mp);
        }
        /* Leave Montgomery form and do the final conditional subtraction.
         * mp is reused to hold the comparison result. */
        sp_3072_mont_reduce_54(r, m, mp);
        mp = sp_3072_cmp_54(r, m);
        sp_3072_cond_sub_54(r, r, m, (mp < 0) - 1);

        sp_3072_to_bin(r, out);
        *outLen = 384;
    }

    if (d != NULL)
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);

    return err;
#else
#if !defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SMALL_STACK)
    sp_digit ad[108], md[54], rd[108];
#else
    sp_digit* d = NULL;
#endif
    sp_digit* a;
    sp_digit* m;
    sp_digit* r;
    sp_digit e[1];
    int err = MP_OKAY;

    if (*outLen < 384)
        err = MP_TO_E;
    if (err == MP_OKAY && (mp_count_bits(em) > 57 || inLen > 384 ||
                                                     mp_count_bits(mm) != 3072))
        err = MP_READ_E;

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 54 * 5, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Buffer layout: a[108] | r[108] | m[54]. */
        a = d;
        r = a + 54 * 2;
        m = r + 54 * 2;
    }
#else
    a = ad;
    m = md;
    r = rd;
#endif

    if (err == MP_OKAY) {
        sp_3072_from_bin(a, 54, in, inLen);
        /* Exponent is at most 57 bits so it fits in a single sp_digit. */
#if DIGIT_BIT >= 57
        e[0] = em->dp[0];
#else
        e[0] = em->dp[0];
        if (em->used > 1)
            e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT;
#endif
        if (e[0] == 0)
            err = MP_EXPTMOD_E;
    }
    if (err == MP_OKAY) {
        sp_3072_from_mp(m, 54, mm);

        if (e[0] == 0x3) {
            /* Special case e = 3: r = a^2 mod m, then r = a * r mod m. */
            if (err == MP_OKAY) {
                sp_3072_sqr_54(r, a);
                err = sp_3072_mod_54(r, r, m);
            }
            if (err == MP_OKAY) {
                sp_3072_mul_54(r, a, r);
                err = sp_3072_mod_54(r, r, m);
            }
        }
        else {
            /* norm shares r's space; it is consumed before r is written. */
            sp_digit* norm = r;
            int i;
            sp_digit mp;

            sp_3072_mont_setup(m, &mp);
            sp_3072_mont_norm_54(norm, m);

            if (err == MP_OKAY) {
                /* Convert the base to Montgomery form. */
                sp_3072_mul_54(a, a, norm);
                err = sp_3072_mod_54(a, a, m);
            }

            if (err == MP_OKAY) {
                /* Find the index of the top set bit of the exponent. */
                for (i=56; i>=0; i--)
                    if (e[0] >> i)
                        break;

                /* Left-to-right square and multiply. */
                XMEMCPY(r, a, sizeof(sp_digit) * 108);
                for (i--; i>=0; i--) {
                    sp_3072_mont_sqr_54(r, r, m, mp);

                    if (((e[0] >> i) & 1) == 1)
                        sp_3072_mont_mul_54(r, r, a, m, mp);
                }
                /* Leave Montgomery form; mp is reused for the comparison. */
                sp_3072_mont_reduce_54(r, m, mp);
                mp = sp_3072_cmp_54(r, m);
                sp_3072_cond_sub_54(r, r, m, (mp < 0) - 1);
            }
        }
    }

    if (err == MP_OKAY) {
        sp_3072_to_bin(r, out);
        *outLen = 384;
    }

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (d != NULL)
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#endif /* WOLFSSL_SP_SMALL */
}

/* RSA private key operation.
 *
 * When SP_RSA_PRIVATE_EXP_D is defined the result is in ^ dm mod mm and the
 * CRT parameters are ignored. Otherwise the CRT parameters are used and dm is
 * ignored.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base. At most 384.
 * dm      Private exponent.
 * pm      First prime.
 * qm      Second prime.
 * dpm     First prime's CRT exponent.
 * dqm     Second prime's CRT exponent.
 * qim     Inverse of second prime mod p.
 * mm      Modulus. Must be exactly 3072 bits long.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 384 bytes long.
 * outLen  On entry, number of bytes available in out.
 *         On success, set to 384.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long or the modulus is not 3072 bits and MEMORY_E when
 * dynamic memory allocation fails.
 */
int sp_RsaPrivate_3072(const byte* in, word32 inLen, mp_int* dm,
    mp_int* pm, mp_int* qm, mp_int* dpm, mp_int* dqm, mp_int* qim, mp_int* mm,
    byte* out, word32* outLen)
{
#ifdef SP_RSA_PRIVATE_EXP_D
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    sp_digit* a;
    sp_digit* d = NULL;
    sp_digit* m;
    sp_digit* r;
    int err = MP_OKAY;

    (void)pm;
    (void)qm;
    (void)dpm;
    (void)dqm;
    (void)qim;

    if (*outLen < 384)
        err = MP_TO_E;
    if (err == MP_OKAY && (mp_count_bits(dm) > 3072 || inLen > 384 ||
                                                     mp_count_bits(mm) != 3072))
        err = MP_READ_E;

    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 54 * 4, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }
    if (err == MP_OKAY) {
        /* Buffer layout: d[54] | a[108] | m[54].
         * a doubles as the result and is given 108 digits, matching the
         * stack based variant, so that it does not overlap the modulus. */
        a = d + 54;
        m = a + 54 * 2;
        r = a;

        sp_3072_from_bin(a, 54, in, inLen);
        sp_3072_from_mp(d, 54, dm);
        sp_3072_from_mp(m, 54, mm);
        err = sp_3072_mod_exp_54(r, a, d, 3072, m, 0);
    }
    if (err == MP_OKAY) {
        sp_3072_to_bin(r, out);
        *outLen = 384;
    }

    if (d != NULL) {
        /* Wipe the private exponent before releasing the buffer. */
        XMEMSET(d, 0, sizeof(sp_digit) * 54);
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }

    return err;
#else
    sp_digit a[108], d[54], m[54];
    sp_digit* r = a;
    int err = MP_OKAY;

    (void)pm;
    (void)qm;
    (void)dpm;
    (void)dqm;
    (void)qim;

    if (*outLen < 384)
        err = MP_TO_E;
    if (err == MP_OKAY && (mp_count_bits(dm) > 3072 || inLen > 384 ||
                                                     mp_count_bits(mm) != 3072))
        err = MP_READ_E;

    if (err == MP_OKAY) {
        sp_3072_from_bin(a, 54, in, inLen);
        sp_3072_from_mp(d, 54, dm);
        sp_3072_from_mp(m, 54, mm);
        err = sp_3072_mod_exp_54(r, a, d, 3072, m, 0);
    }

    if (err == MP_OKAY) {
        sp_3072_to_bin(r, out);
        *outLen = 384;
    }

    /* Wipe the private exponent. */
    XMEMSET(d, 0, sizeof(sp_digit) * 54);

    return err;
#endif /* WOLFSSL_SP_SMALL || defined(WOLFSSL_SMALL_STACK) */
#else
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    sp_digit* t = NULL;
    sp_digit* a;
    sp_digit* p;
    sp_digit* q;
    sp_digit* dp;
    sp_digit* dq;
    sp_digit* qi;
    sp_digit* tmp;
    sp_digit* tmpa;
    sp_digit* tmpb;
    sp_digit* r;
    int err = MP_OKAY;

    (void)dm;
    (void)mm;

    if (*outLen < 384)
        err = MP_TO_E;
    if (err == MP_OKAY && (inLen > 384 || mp_count_bits(mm) != 3072))
        err = MP_READ_E;

    if (err == MP_OKAY) {
        t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 27 * 11, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (t == NULL)
            err = MEMORY_E;
    }
    if (err == MP_OKAY) {
        /* Buffer layout (297 digits):
         *   a[108] | p[27] | q[27] | dp/dq/qi[27] | tmpa[54] | tmpb[54]
         * dp, dq and qi share one slot; each is loaded just before use.
         * tmp and r reuse a's space once the base is no longer needed. */
        a = t;
        p = a + 54 * 2;
        q = p + 27;
        qi = dq = dp = q + 27;
        tmpa = qi + 27;
        tmpb = tmpa + 54;

        tmp = t;
        r = tmp + 54;

        sp_3072_from_bin(a, 54, in, inLen);
        sp_3072_from_mp(p, 27, pm);
        sp_3072_from_mp(q, 27, qm);
        sp_3072_from_mp(dp, 27, dpm);
        /* tmpa = a ^ dp mod p */
        err = sp_3072_mod_exp_27(tmpa, a, dp, 1536, p, 1);
    }
    if (err == MP_OKAY) {
        sp_3072_from_mp(dq, 27, dqm);
        /* tmpb = a ^ dq mod q */
        err = sp_3072_mod_exp_27(tmpb, a, dq, 1536, q, 1);
    }
    if (err == MP_OKAY) {
        /* tmpa = tmpa - tmpb, adding p back when the top digit's sign bit
         * shows the difference went negative. */
        sp_3072_sub_27(tmpa, tmpa, tmpb);
        sp_3072_mask_27(tmp, p, tmpa[26] >> 63);
        sp_3072_add_27(tmpa, tmpa, tmp);

        /* tmpa = (tmpa * qi) mod p */
        sp_3072_from_mp(qi, 27, qim);
        sp_3072_mul_27(tmpa, tmpa, qi);
        err = sp_3072_mod_27(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        /* r = tmpb + q * tmpa */
        sp_3072_mul_27(tmpa, q, tmpa);
        sp_3072_add_54(r, tmpb, tmpa);
        sp_3072_norm_54(r);

        sp_3072_to_bin(r, out);
        *outLen = 384;
    }

    if (t != NULL) {
        /* Wipe all private key material and intermediates before freeing. */
        XMEMSET(t, 0, sizeof(sp_digit) * 27 * 11);
        XFREE(t, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }

    return err;
#else
    sp_digit a[54 * 2];
    sp_digit p[27], q[27], dp[27], dq[27], qi[27];
    sp_digit tmp[54], tmpa[54], tmpb[54];
    sp_digit* r = a;
    int err = MP_OKAY;

    (void)dm;
    (void)mm;

    if (*outLen < 384)
        err = MP_TO_E;
    if (err == MP_OKAY && (inLen > 384 || mp_count_bits(mm) != 3072))
        err = MP_READ_E;

    if (err == MP_OKAY) {
        sp_3072_from_bin(a, 54, in, inLen);
        sp_3072_from_mp(p, 27, pm);
        sp_3072_from_mp(q, 27, qm);
        sp_3072_from_mp(dp, 27, dpm);
        sp_3072_from_mp(dq, 27, dqm);
        sp_3072_from_mp(qi, 27, qim);

        /* tmpa = a ^ dp mod p */
        err = sp_3072_mod_exp_27(tmpa, a, dp, 1536, p, 1);
    }
    if (err == MP_OKAY)
        /* tmpb = a ^ dq mod q */
        err = sp_3072_mod_exp_27(tmpb, a, dq, 1536, q, 1);

    if (err == MP_OKAY) {
        /* tmpa = tmpa - tmpb, adding p back when the top digit's sign bit
         * shows the difference went negative. */
        sp_3072_sub_27(tmpa, tmpa, tmpb);
        sp_3072_mask_27(tmp, p, tmpa[26] >> 63);
        sp_3072_add_27(tmpa, tmpa, tmp);
        /* tmpa = (tmpa * qi) mod p */
        sp_3072_mul_27(tmpa, tmpa, qi);
        err = sp_3072_mod_27(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        /* r = tmpb + q * tmpa */
        sp_3072_mul_27(tmpa, tmpa, q);
        sp_3072_add_54(r, tmpb, tmpa);
        sp_3072_norm_54(r);

        sp_3072_to_bin(r, out);
        *outLen = 384;
    }

    /* Wipe private key material and intermediates held on the stack. */
    XMEMSET(tmpa, 0, sizeof(tmpa));
    XMEMSET(tmpb, 0, sizeof(tmpb));
    XMEMSET(p, 0, sizeof(p));
    XMEMSET(q, 0, sizeof(q));
    XMEMSET(dp, 0, sizeof(dp));
    XMEMSET(dq, 0, sizeof(dq));
    XMEMSET(qi, 0, sizeof(qi));

    return err;
#endif /* WOLFSSL_SP_SMALL || defined(WOLFSSL_SMALL_STACK) */
#endif /* SP_RSA_PRIVATE_EXP_D */
}

#endif /* WOLFSSL_HAVE_SP_RSA */
#ifdef WOLFSSL_HAVE_SP_DH
/* Convert an array of sp_digit to an mp_int.
 *
 * The input is read as 54 digits of 57 bits each and repacked into digits of
 * DIGIT_BIT bits in r.
 *
 * a  A single precision integer.
 * r  A multi-precision integer.
 * returns MP_OKAY on success, otherwise the error returned by mp_grow.
 */
static int sp_3072_to_mp(sp_digit* a, mp_int* r)
{
    int err;

    /* Make room for 3072 bits worth of DIGIT_BIT sized digits. */
    err = mp_grow(r, (3072 + DIGIT_BIT - 1) / DIGIT_BIT);
    if (err == MP_OKAY) {
#if DIGIT_BIT == 57
        /* Same digit size on both sides: straight copy. */
        XMEMCPY(r->dp, a, sizeof(sp_digit) * 54);
        r->used = 54;
        mp_clamp(r);
#elif DIGIT_BIT < 57
        /* Output digits are narrower: each input digit spans two or more
         * output digits.
         * j is the current output digit, s the bit offset being tracked.
         * NOTE(review): the mask uses a long constant (1l) - assumes long is
         * wide enough to hold DIGIT_BIT + 1 bits; confirm for the target. */
        int i, j = 0, s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 54; i++) {
            /* Fill the rest of the current output digit. */
            r->dp[j] |= a[i] << s;
            r->dp[j] &= (1l << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
            /* Start the next output digit with the bits not yet used. */
            r->dp[++j] = a[i] >> s;
            /* Emit further whole output digits while the input digit still
             * has at least DIGIT_BIT bits left. */
            while (s + DIGIT_BIT <= 57) {
                s += DIGIT_BIT;
                r->dp[j] &= (1l << DIGIT_BIT) - 1;
                r->dp[++j] = a[i] >> s;
            }
            /* Number of bits of this input digit placed in the current
             * output digit. */
            s = 57 - s;
        }
        r->used = (3072 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#else
        /* Output digits are wider: several input digits may land in one
         * output digit. j is the current output digit, s the bit offset
         * within it. */
        int i, j = 0, s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 54; i++) {
            r->dp[j] |= ((mp_digit)a[i]) << s;
            if (s + 57 >= DIGIT_BIT) {
                /* Input digit reaches the end of the output digit: mask it
                 * and carry the remaining high bits into the next one. */
    #if DIGIT_BIT < 64
                r->dp[j] &= (1l << DIGIT_BIT) - 1;
    #endif
                s = DIGIT_BIT - s;
                r->dp[++j] = a[i] >> s;
                s = 57 - s;
            }
            else
                s += 57;
        }
        r->used = (3072 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#endif
    }

    return err;
}

/* Perform the modular exponentiation for Diffie-Hellman.
 *
 * Computes res = base ^ exp mod mod for a 3072-bit modulus.
 *
 * base  Base. MP integer. At most 3072 bits.
 * exp   Exponent. MP integer. At most 3072 bits.
 * mod   Modulus. MP integer. Must be exactly 3072 bits.
 * res   Result. MP integer.
 * returns 0 on success, MP_READ_E if there are too many bits in the base or
 * exponent or the modulus is not 3072 bits and MEMORY_E if memory allocation
 * fails.
 */
int sp_ModExp_3072(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res)
{
#ifdef WOLFSSL_SP_SMALL
    int err = MP_OKAY;
    sp_digit* d = NULL;
    sp_digit* b;
    sp_digit* e;
    sp_digit* m;
    sp_digit* r;
    int expBits = mp_count_bits(exp);

    if (mp_count_bits(base) > 3072 || expBits > 3072 ||
                                                   mp_count_bits(mod) != 3072) {
        err = MP_READ_E;
    }

    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(*d) * 54 * 4, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Buffer layout: b[108] | e[54] | m[54]. Result reuses b. */
        b = d;
        e = b + 54 * 2;
        m = e + 54;
        r = b;

        sp_3072_from_mp(b, 54, base);
        sp_3072_from_mp(e, 54, exp);
        sp_3072_from_mp(m, 54, mod);

        err = sp_3072_mod_exp_54(r, b, e, expBits, m, 0);
    }

    if (err == MP_OKAY) {
        err = sp_3072_to_mp(r, res);
    }

    if (d != NULL) {
        /* d is only non-NULL after e has been assigned. Wipe the exponent. */
        XMEMSET(e, 0, sizeof(sp_digit) * 54);
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }
    return err;
#else
#ifndef WOLFSSL_SMALL_STACK
    sp_digit bd[108], ed[54], md[54];
#else
    sp_digit* d = NULL;
#endif
    sp_digit* b;
    sp_digit* e;
    sp_digit* m;
    sp_digit* r;
    int err = MP_OKAY;
    int expBits = mp_count_bits(exp);

    if (mp_count_bits(base) > 3072 || expBits > 3072 ||
                                                   mp_count_bits(mod) != 3072) {
        err = MP_READ_E;
    }

#ifdef WOLFSSL_SMALL_STACK
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(*d) * 54 * 4, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Buffer layout: b[108] | e[54] | m[54]. Result reuses b. */
        b = d;
        e = b + 54 * 2;
        m = e + 54;
        r = b;
    }
#else
    r = b = bd;
    e = ed;
    m = md;
#endif

    if (err == MP_OKAY) {
        sp_3072_from_mp(b, 54, base);
        sp_3072_from_mp(e, 54, exp);
        sp_3072_from_mp(m, 54, mod);

        err = sp_3072_mod_exp_54(r, b, e, expBits, m, 0);
    }

    if (err == MP_OKAY) {
        err = sp_3072_to_mp(r, res);
    }

    /* Wipe the exponent. With a heap buffer, e is only valid when the
     * allocation succeeded, so it must not be touched otherwise. */
#ifdef WOLFSSL_SMALL_STACK
    if (d != NULL) {
        XMEMSET(e, 0, sizeof(sp_digit) * 54);
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }
#else
    XMEMSET(e, 0, sizeof(sp_digit) * 54);
#endif

    return err;
#endif
}

/* Perform the modular exponentiation for Diffie-Hellman.
 *
 * Computes base ^ exp mod mod for a 3072-bit modulus and writes the result
 * as big-endian bytes with leading zero bytes removed.
 *
 * base     Base. At most 3072 bits.
 * exp      Array of bytes that is the exponent.
 * expLen   Length of data, in bytes, in exponent. At most 384.
 * mod      Modulus. Must be exactly 3072 bits.
 * out      Buffer to hold big-endian bytes of exponentiation result.
 *          Must be at least 384 bytes long.
 * outLen   Length, in bytes, of exponentiation result.
 * returns 0 on success, MP_READ_E if there are too many bytes in an array or
 * the modulus is not 3072 bits and MEMORY_E if memory allocation fails.
 */
int sp_DhExp_3072(mp_int* base, const byte* exp, word32 expLen,
    mp_int* mod, byte* out, word32* outLen)
{
#ifdef WOLFSSL_SP_SMALL
    int err = MP_OKAY;
    sp_digit* d = NULL;
    sp_digit* b;
    sp_digit* e;
    sp_digit* m;
    sp_digit* r;
    word32 i;

    if (mp_count_bits(base) > 3072 || expLen > 384 ||
                                                   mp_count_bits(mod) != 3072) {
        err = MP_READ_E;
    }

    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(*d) * 54 * 4, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Buffer layout: b[108] | e[54] | m[54]. Result reuses b. */
        b = d;
        e = b + 54 * 2;
        m = e + 54;
        r = b;

        sp_3072_from_mp(b, 54, base);
        sp_3072_from_bin(e, 54, exp, expLen);
        sp_3072_from_mp(m, 54, mod);

        err = sp_3072_mod_exp_54(r, b, e, expLen * 8, m, 0);
    }

    if (err == MP_OKAY) {
        sp_3072_to_bin(r, out);
        *outLen = 384;
        /* Count and strip the leading zero bytes of the result. */
        for (i=0; i<384 && out[i] == 0; i++) {
        }
        *outLen -= i;
        XMEMMOVE(out, out + i, *outLen);
    }

    if (d != NULL) {
        /* d is only non-NULL after e has been assigned. Wipe the exponent. */
        XMEMSET(e, 0, sizeof(sp_digit) * 54);
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }
    return err;
#else
#ifndef WOLFSSL_SMALL_STACK
    sp_digit bd[108], ed[54], md[54];
#else
    sp_digit* d = NULL;
#endif
    sp_digit* b;
    sp_digit* e;
    sp_digit* m;
    sp_digit* r;
    word32 i;
    int err = MP_OKAY;

    if (mp_count_bits(base) > 3072 || expLen > 384 ||
                                                   mp_count_bits(mod) != 3072) {
        err = MP_READ_E;
    }

#ifdef WOLFSSL_SMALL_STACK
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(*d) * 54 * 4, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Buffer layout: b[108] | e[54] | m[54]. Result reuses b. */
        b = d;
        e = b + 54 * 2;
        m = e + 54;
        r = b;
    }
#else
    r = b = bd;
    e = ed;
    m = md;
#endif

    if (err == MP_OKAY) {
        sp_3072_from_mp(b, 54, base);
        sp_3072_from_bin(e, 54, exp, expLen);
        sp_3072_from_mp(m, 54, mod);

        err = sp_3072_mod_exp_54(r, b, e, expLen * 8, m, 0);
    }

    if (err == MP_OKAY) {
        sp_3072_to_bin(r, out);
        *outLen = 384;
        /* Count and strip the leading zero bytes of the result. */
        for (i=0; i<384 && out[i] == 0; i++) {
        }
        *outLen -= i;
        XMEMMOVE(out, out + i, *outLen);
    }

    /* Wipe the exponent. With a heap buffer, e is only valid when the
     * allocation succeeded, so it must not be touched otherwise. */
#ifdef WOLFSSL_SMALL_STACK
    if (d != NULL) {
        XMEMSET(e, 0, sizeof(sp_digit) * 54);
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }
#else
    XMEMSET(e, 0, sizeof(sp_digit) * 54);
#endif

    return err;
#endif
}
#endif /* WOLFSSL_HAVE_SP_DH */

#endif /* WOLFSSL_SP_NO_3072 */
#endif /* SP_WORD_SIZE == 64 */

#endif
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
#if SP_WORD_SIZE == 64
#ifndef WOLFSSL_SP_NO_2048
/* Read a big endian unsigned byte array into r.
 *
 * Bytes are consumed from the end of the array (least significant first) and
 * packed into 64-bit digits. Digits not reached by the input are zeroed.
 *
 * r    A single precision integer.
 * max  Maximum number of digits to write to r.
 * a    Byte array.
 * n    Number of bytes in array to read.
 */
static void sp_2048_from_bin(sp_digit* r, int max, const byte* a, int n)
{
    int idx;          /* Index of the byte being consumed. */
    int word = 0;     /* Index of the digit being filled. */
    int shift = 0;    /* Bit position in the current digit for the next byte. */

    r[0] = 0;
    idx = n - 1;
    while (idx >= 0) {
        r[word] |= ((sp_digit)a[idx]) << shift;
        if (shift < 56) {
            /* Still room in this digit for another byte. */
            shift += 8;
        }
        else {
            /* Digit is full: move on, carrying any bits that did not fit. */
            r[word] &= 0xffffffffffffffffl;
            shift = 64 - shift;
            if (word + 1 >= max)
                break;
            word++;
            r[word] = a[idx] >> shift;
            shift = 8 - shift;
        }
        idx--;
    }

    /* Clear the digits above the last one written. */
    word++;
    while (word < max) {
        r[word] = 0;
        word++;
    }
}

/* Convert an mp_int to an array of sp_digit.
 *
 * At most max digits are written to r. Digits of r not reached by the input
 * are zeroed.
 *
 * r    A single precision integer.
 * max  Maximum number of digits to write to r.
 * a    A multi-precision integer.
 */
static void sp_2048_from_mp(sp_digit* r, int max, mp_int* a)
{
#if DIGIT_BIT == 64
    /* Same digit size on both sides: straight copy, bounded by max so that
     * an over-long input cannot write beyond the end of r. */
    int j;
    int used = a->used;

    if (used > max)
        used = max;

    XMEMCPY(r, a->dp, sizeof(sp_digit) * used);

    for (j = used; j < max; j++)
        r[j] = 0;
#elif DIGIT_BIT > 64
    /* Input digits are wider: each one is split across several 64-bit
     * digits. j is the current output digit, s the bit offset tracked. */
    int i, j = 0, s = 0;

    r[0] = 0;
    for (i = 0; i < a->used && j < max; i++) {
        r[j] |= a->dp[i] << s;
        r[j] &= 0xffffffffffffffffl;
        s = 64 - s;
        if (j + 1 >= max)
            break;
        r[++j] = a->dp[i] >> s;
        while (s + 64 <= DIGIT_BIT) {
            s += 64;
            r[j] &= 0xffffffffffffffffl;
            if (j + 1 >= max)
                break;
            if (s < DIGIT_BIT)
                r[++j] = a->dp[i] >> s;
            else
                r[++j] = 0;
        }
        s = DIGIT_BIT - s;
    }

    for (j++; j < max; j++)
        r[j] = 0;
#else
    /* Input digits are narrower: several are packed into each 64-bit digit.
     * j is the current output digit, s the bit offset within it. */
    int i, j = 0, s = 0;

    r[0] = 0;
    for (i = 0; i < a->used && j < max; i++) {
        r[j] |= ((sp_digit)a->dp[i]) << s;
        if (s + DIGIT_BIT >= 64) {
            /* Output digit is full: carry the remaining high bits of the
             * input digit into the next output digit. */
            r[j] &= 0xffffffffffffffffl;
            if (j + 1 >= max)
                break;
            s = 64 - s;
            r[++j] = a->dp[i] >> s;
            s = DIGIT_BIT - s;
        }
        else
            s += DIGIT_BIT;
    }

    for (j++; j < max; j++)
        r[j] = 0;
#endif
}

/* Write r as big endian to byte array.
 * Fixed length number of bytes written: 256
 *
 * The 32 digits of r are emitted least significant first, filling the byte
 * array from its last element backwards.
 *
 * r  A single precision integer.
 * a  Byte array.
 */
static void sp_2048_to_bin(sp_digit* r, byte* a)
{
    /* i: digit index, j: byte index (counts down), s: bit offset carried
     * between digits, b: bits of the current digit emitted so far. */
    int i, j, s = 0, b;

    /* Start at the last (least significant) byte. */
    j = 2048 / 8 - 1;
    a[j] = 0;
    for (i=0; i<32 && j>=0; i++) {
        b = 0;
        /* OR the low bits of this digit into the byte left open by the
         * previous digit. */
        a[j--] |= r[i] << s; b += 8 - s;
        if (j < 0)
            break;
        /* Emit the rest of the digit one byte at a time. */
        while (b < 64) {
            a[j--] = r[i] >> b; b += 8;
            if (j < 0)
                break;
        }
        if (j < 0)
            break;
        /* Work out the offset for the next digit and clear the byte at j.
         * When s is non-zero, step j back so the next digit's first OR
         * targets the byte just written. */
        s = 8 - (b - 64);
        a[j] = 0;
        if (s != 0)
            j++;
    }
}

/* Multiply a and b into r. (r = a * b)
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 */
static void sp_2048_mul_16(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    sp_digit tmp[16];

    __asm__ __volatile__ (
        "#  A[0] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "movq	%%rax, (%[tmp])\n\t"
        "movq	%%rdx, %%rcx\n\t"
        "#  A[0] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 8(%[tmp])\n\t"
        "#  A[0] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 16(%[tmp])\n\t"
        "#  A[0] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 24(%[tmp])\n\t"
        "#  A[0] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 32(%[tmp])\n\t"
        "#  A[0] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 40(%[tmp])\n\t"
        "#  A[0] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 48(%[tmp])\n\t"
        "#  A[0] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 56(%[tmp])\n\t"
        "#  A[0] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 64(%[tmp])\n\t"
        "#  A[0] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 72(%[tmp])\n\t"
        "#  A[0] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 80(%[tmp])\n\t"
        "#  A[0] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 88(%[tmp])\n\t"
        "#  A[0] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 96(%[tmp])\n\t"
        "#  A[0] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 104(%[tmp])\n\t"
        "#  A[0] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 112(%[tmp])\n\t"
        "#  A[0] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 120(%[tmp])\n\t"
        "#  A[1] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 128(%[r])\n\t"
        "#  A[2] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 136(%[r])\n\t"
        "#  A[3] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 144(%[r])\n\t"
        "#  A[4] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 152(%[r])\n\t"
        "#  A[5] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 160(%[r])\n\t"
        "#  A[6] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 168(%[r])\n\t"
        "#  A[7] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 176(%[r])\n\t"
        "#  A[8] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 184(%[r])\n\t"
        "#  A[9] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 192(%[r])\n\t"
        "#  A[10] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 200(%[r])\n\t"
        "#  A[11] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 208(%[r])\n\t"
        "#  A[12] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 216(%[r])\n\t"
        "#  A[13] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 224(%[r])\n\t"
        "#  A[14] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 232(%[r])\n\t"
        "#  A[15] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "movq	%%rbx, 240(%[r])\n\t"
        "movq	%%rcx, 248(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [tmp] "r" (tmp)
        : "memory", "rax", "rdx", "rbx", "rcx", "r8"
    );

    XMEMCPY(r, tmp, sizeof(tmp));
}

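/* Square a and put result in r. (r = a * a)
 *
 * r  A single precision integer that receives the result.
 * a  A single precision integer to square; the assembly reads it as sixteen
 *    64-bit words (byte offsets 0 to 120).
 */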
static void sp_2048_sqr_16(sp_digit* r, const sp_digit* a)
{
    sp_digit tmp[16];

    __asm__ __volatile__ (
        "#  A[0] * A[0]\n\t"
        "movq	0(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "movq	%%rax, (%[tmp])\n\t"
        "movq	%%rdx, %%r8\n\t"
        "#  A[0] * A[1]\n\t"
        "movq	8(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 8(%[tmp])\n\t"
        "#  A[0] * A[2]\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * A[1]\n\t"
        "movq	8(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%r9, 16(%[tmp])\n\t"
        "#  A[0] * A[3]\n\t"
        "movq	24(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "#  A[1] * A[2]\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "movq	%%rcx, 24(%[tmp])\n\t"
        "#  A[0] * A[4]\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * A[3]\n\t"
        "movq	24(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * A[2]\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 32(%[tmp])\n\t"
        "#  A[0] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[4]\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[3]\n\t"
        "movq	24(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 40(%[tmp])\n\t"
        "#  A[0] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[4]\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[3]\n\t"
        "movq	24(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 48(%[tmp])\n\t"
        "#  A[0] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[4]\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 56(%[tmp])\n\t"
        "#  A[0] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[4]\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 64(%[tmp])\n\t"
        "#  A[0] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 72(%[tmp])\n\t"
        "#  A[0] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 80(%[tmp])\n\t"
        "#  A[0] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 88(%[tmp])\n\t"
        "#  A[0] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 96(%[tmp])\n\t"
        "#  A[0] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 104(%[tmp])\n\t"
        "#  A[0] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 112(%[tmp])\n\t"
        "#  A[0] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 120(%[tmp])\n\t"
        "#  A[1] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[2] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 128(%[r])\n\t"
        "#  A[2] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[3] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 136(%[r])\n\t"
        "#  A[3] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[4] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 144(%[r])\n\t"
        "#  A[4] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[5] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 152(%[r])\n\t"
        "#  A[5] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[6] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 160(%[r])\n\t"
        "#  A[6] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[7] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 168(%[r])\n\t"
        "#  A[7] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[8] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 176(%[r])\n\t"
        "#  A[8] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[9] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 184(%[r])\n\t"
        "#  A[9] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[10] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 192(%[r])\n\t"
        "#  A[10] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[11] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 200(%[r])\n\t"
        "#  A[11] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%r9, 208(%[r])\n\t"
        "#  A[12] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "#  A[13] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "movq	%%rcx, 216(%[r])\n\t"
        "#  A[13] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 224(%[r])\n\t"
        "#  A[14] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%r9, 232(%[r])\n\t"
        "#  A[15] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "movq	%%rcx, 240(%[r])\n\t"
        "movq	%%r8, 248(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [tmp] "r" (tmp)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10", "r11", "r12"
    );

    XMEMCPY(r, tmp, sizeof(tmp));
}

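/* Multiply a and b into r. (r = a * b)
 *
 * Implemented with the MULX (BMI2) and ADCX/ADOX (ADX) instructions, so it
 * must only be called on CPUs that support those extensions.
 *
 * r   Result of multiplication.
 * a   First number to multiply.
 * b   Second number to multiply.
 */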
static void sp_2048_mul_avx2_16(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit tmp[2*16];

    __asm__ __volatile__ (
        "movq	0(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "# A[0] * B[0]\n\t"
        "mulx	0(%[b]), %%r10, %%r11\n\t"
        "# A[0] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "# A[0] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "# A[0] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "movq	%%r10, 0(%[t])\n\t"
        "movq	%%r11, 8(%[t])\n\t"
        "movq	%%r12, 16(%[t])\n\t"
        "movq	%%r13, 24(%[t])\n\t"
        "# A[0] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "# A[0] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "# A[0] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "# A[0] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "movq	%%r14, 32(%[t])\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "# A[0] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "# A[0] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "# A[0] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "# A[0] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "# A[0] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "# A[0] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "# A[0] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "# A[0] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adcxq	%%r15, %%r14\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	8(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	8(%[t]), %%r11\n\t"
        "movq	16(%[t]), %%r12\n\t"
        "movq	24(%[t]), %%r13\n\t"
        "movq	32(%[t]), %%r14\n\t"
        "movq	40(%[t]), %%rax\n\t"
        "# A[1] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[1] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[1] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[1] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 8(%[t])\n\t"
        "movq	%%r12, 16(%[t])\n\t"
        "movq	%%r13, 24(%[t])\n\t"
        "movq	%%r14, 32(%[t])\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "# A[1] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[1] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[1] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[1] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "# A[1] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[1] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[1] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[1] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "# A[1] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[1] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[1] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[1] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "adcxq	%%rcx, %%rax\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	16(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	16(%[t]), %%r12\n\t"
        "movq	24(%[t]), %%r13\n\t"
        "movq	32(%[t]), %%r14\n\t"
        "movq	40(%[t]), %%rax\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "# A[2] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[2] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[2] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[2] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 16(%[t])\n\t"
        "movq	%%r13, 24(%[t])\n\t"
        "movq	%%r14, 32(%[t])\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "# A[2] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[2] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[2] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[2] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "# A[2] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[2] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[2] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[2] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "# A[2] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[2] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[2] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[2] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "adcxq	%%rcx, %%r10\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	24(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	24(%[t]), %%r13\n\t"
        "movq	32(%[t]), %%r14\n\t"
        "movq	40(%[t]), %%rax\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "# A[3] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[3] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[3] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[3] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 24(%[t])\n\t"
        "movq	%%r14, 32(%[t])\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "# A[3] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[3] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[3] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[3] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "# A[3] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[3] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[3] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[3] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "# A[3] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[3] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[3] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[3] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	32(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	32(%[t]), %%r14\n\t"
        "movq	40(%[t]), %%rax\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "# A[4] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[4] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[4] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[4] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 32(%[t])\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "# A[4] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[4] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[4] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[4] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "# A[4] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[4] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[4] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[4] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "# A[4] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[4] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[4] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[4] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "adcxq	%%rcx, %%r12\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	40(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	40(%[t]), %%rax\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "# A[5] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[5] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[5] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[5] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "# A[5] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[5] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[5] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[5] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "# A[5] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[5] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[5] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[5] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "# A[5] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[5] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[5] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[5] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "adcxq	%%rcx, %%r13\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	48(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "# A[6] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[6] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[6] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[6] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "# A[6] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[6] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[6] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[6] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "# A[6] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[6] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[6] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[6] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "# A[6] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[6] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[6] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[6] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "adcxq	%%rcx, %%r14\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	56(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "# A[7] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[7] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[7] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[7] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "# A[7] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[7] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[7] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[7] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "# A[7] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[7] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[7] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[7] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "# A[7] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[7] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[7] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[7] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "adcxq	%%rcx, %%rax\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	64(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "# A[8] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[8] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[8] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[8] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "# A[8] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[8] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[8] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[8] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "# A[8] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[8] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[8] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[8] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "# A[8] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[8] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[8] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[8] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "adcxq	%%rcx, %%r10\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	72(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "# A[9] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[9] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[9] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[9] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "# A[9] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[9] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[9] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[9] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "# A[9] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[9] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[9] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[9] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "# A[9] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[9] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[9] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[9] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	80(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "# A[10] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[10] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[10] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[10] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "# A[10] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[10] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[10] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[10] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "# A[10] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[10] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[10] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[10] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "# A[10] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[10] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[10] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[10] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "adcxq	%%rcx, %%r12\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	88(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "# A[11] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[11] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[11] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[11] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "# A[11] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[11] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[11] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[11] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "# A[11] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[11] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[11] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[11] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "# A[11] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[11] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[11] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[11] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "adcxq	%%rcx, %%r13\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	96(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "# A[12] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[12] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[12] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[12] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "# A[12] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[12] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[12] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[12] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "# A[12] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[12] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[12] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[12] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "# A[12] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[12] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[12] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[12] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "adcxq	%%rcx, %%r14\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	104(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "# A[13] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[13] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[13] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[13] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "# A[13] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[13] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[13] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[13] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "# A[13] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[13] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[13] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[13] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "# A[13] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[13] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[13] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[13] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "adcxq	%%rcx, %%rax\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	112(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "# A[14] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[14] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[14] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[14] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "# A[14] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[14] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[14] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[14] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "# A[14] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[14] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[14] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[14] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "# A[14] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[14] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[14] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[14] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "adcxq	%%rcx, %%r10\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	120(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "# A[15] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[15] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[15] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[15] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "# A[15] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[15] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[15] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[15] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "# A[15] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[15] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[15] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[15] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "# A[15] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[15] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[15] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[15] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        :
        : [a] "r" (a), [b] "r" (b), [t] "r" (tmp)
        : "memory", "rax", "rdx", "rcx",
          "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
    );

    XMEMCPY(r, tmp, sizeof(tmp));
}

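/* Square a and put the result in r. (r = a * a)
 *
 * The implementation uses the mulx, adcx and adox instructions.
 *
 * r  Output. A single precision integer that receives the square of a.
 * a  Input. A single precision integer; digits a[0] to a[15] are read.
 */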
static void sp_2048_sqr_avx2_16(sp_digit* r, const sp_digit* a)
{
    sp_digit tmp[32];

    __asm__ __volatile__ (
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 1\n\t"
        "xorq	%%r10, %%r10\n\t"
        "xorq	%%r11, %%r11\n\t"
        "xorq	%%r12, %%r12\n\t"
        "xorq	%%r13, %%r13\n\t"
        "xorq	%%r14, %%r14\n\t"
        "xorq	%%r15, %%r15\n\t"
        "# A[1] x A[0]\n\t"
        "movq	0(%[a]), %%rdx\n\t"
        "mulxq	8(%[a]), %%r10, %%r11\n\t"
        "# A[2] x A[0]\n\t"
        "mulxq	16(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[3] x A[0]\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[4] x A[0]\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[5] x A[0]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r10, 8(%[tmp])\n\t"
        "movq	%%r11, 16(%[tmp])\n\t"
        "movq	%%r12, 24(%[tmp])\n\t"
        "movq	%%r13, 32(%[tmp])\n\t"
        "movq	%%r14, 40(%[tmp])\n\t"
        "movq	%%r8, %%r10\n\t"
        "movq	%%r8, %%r11\n\t"
        "movq	%%r8, %%r12\n\t"
        "movq	%%r8, %%r13\n\t"
        "movq	%%r8, %%r14\n\t"
        "# A[6] x A[0]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[7] x A[0]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[8] x A[0]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[9] x A[0]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[10] x A[0]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r15, 48(%[tmp])\n\t"
        "movq	%%r10, 56(%[tmp])\n\t"
        "movq	%%r11, 64(%[tmp])\n\t"
        "movq	%%r12, 72(%[tmp])\n\t"
        "movq	%%r13, 80(%[tmp])\n\t"
        "movq	%%r8, %%r15\n\t"
        "movq	%%r8, %%r10\n\t"
        "movq	%%r8, %%r11\n\t"
        "movq	%%r8, %%r12\n\t"
        "movq	%%r8, %%r13\n\t"
        "# A[11] x A[0]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[12] x A[0]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[13] x A[0]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[14] x A[0]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[15] x A[0]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r14, 88(%[tmp])\n\t"
        "movq	%%r15, 96(%[tmp])\n\t"
        "movq	%%r10, 104(%[tmp])\n\t"
        "movq	%%r11, 112(%[tmp])\n\t"
        "movq	%%r12, 120(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r13, 128(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 2\n\t"
        "movq	24(%[tmp]), %%r13\n\t"
        "movq	32(%[tmp]), %%r14\n\t"
        "movq	40(%[tmp]), %%r15\n\t"
        "movq	48(%[tmp]), %%r10\n\t"
        "movq	56(%[tmp]), %%r11\n\t"
        "movq	64(%[tmp]), %%r12\n\t"
        "# A[2] x A[1]\n\t"
        "movq	8(%[a]), %%rdx\n\t"
        "mulxq	16(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[3] x A[1]\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[4] x A[1]\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[5] x A[1]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[6] x A[1]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r13, 24(%[tmp])\n\t"
        "movq	%%r14, 32(%[tmp])\n\t"
        "movq	%%r15, 40(%[tmp])\n\t"
        "movq	%%r10, 48(%[tmp])\n\t"
        "movq	%%r11, 56(%[tmp])\n\t"
        "movq	72(%[tmp]), %%r13\n\t"
        "movq	80(%[tmp]), %%r14\n\t"
        "movq	88(%[tmp]), %%r15\n\t"
        "movq	96(%[tmp]), %%r10\n\t"
        "movq	104(%[tmp]), %%r11\n\t"
        "# A[7] x A[1]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[8] x A[1]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[9] x A[1]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[10] x A[1]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[11] x A[1]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "movq	%%r12, 64(%[tmp])\n\t"
        "movq	%%r13, 72(%[tmp])\n\t"
        "movq	%%r14, 80(%[tmp])\n\t"
        "movq	%%r15, 88(%[tmp])\n\t"
        "movq	%%r10, 96(%[tmp])\n\t"
        "movq	112(%[tmp]), %%r12\n\t"
        "movq	120(%[tmp]), %%r13\n\t"
        "movq	128(%[tmp]), %%r14\n\t"
        "movq	%%r8, %%r15\n\t"
        "movq	%%r8, %%r10\n\t"
        "# A[12] x A[1]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[13] x A[1]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[14] x A[1]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[15] x A[1]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[15] x A[2]\n\t"
        "movq	16(%[a]), %%rdx\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "movq	%%r11, 104(%[tmp])\n\t"
        "movq	%%r12, 112(%[tmp])\n\t"
        "movq	%%r13, 120(%[tmp])\n\t"
        "movq	%%r14, 128(%[tmp])\n\t"
        "movq	%%r15, 136(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r10\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r10, 144(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 3\n\t"
        "movq	40(%[tmp]), %%r10\n\t"
        "movq	48(%[tmp]), %%r11\n\t"
        "movq	56(%[tmp]), %%r12\n\t"
        "movq	64(%[tmp]), %%r13\n\t"
        "movq	72(%[tmp]), %%r14\n\t"
        "movq	80(%[tmp]), %%r15\n\t"
        "# A[3] x A[2]\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[4] x A[2]\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[5] x A[2]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[6] x A[2]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[7] x A[2]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r10, 40(%[tmp])\n\t"
        "movq	%%r11, 48(%[tmp])\n\t"
        "movq	%%r12, 56(%[tmp])\n\t"
        "movq	%%r13, 64(%[tmp])\n\t"
        "movq	%%r14, 72(%[tmp])\n\t"
        "movq	88(%[tmp]), %%r10\n\t"
        "movq	96(%[tmp]), %%r11\n\t"
        "movq	104(%[tmp]), %%r12\n\t"
        "movq	112(%[tmp]), %%r13\n\t"
        "movq	120(%[tmp]), %%r14\n\t"
        "# A[8] x A[2]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[9] x A[2]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[10] x A[2]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[11] x A[2]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[12] x A[2]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r15, 80(%[tmp])\n\t"
        "movq	%%r10, 88(%[tmp])\n\t"
        "movq	%%r11, 96(%[tmp])\n\t"
        "movq	%%r12, 104(%[tmp])\n\t"
        "movq	%%r13, 112(%[tmp])\n\t"
        "movq	128(%[tmp]), %%r15\n\t"
        "movq	136(%[tmp]), %%r10\n\t"
        "movq	144(%[tmp]), %%r11\n\t"
        "movq	%%r8, %%r12\n\t"
        "movq	%%r8, %%r13\n\t"
        "# A[13] x A[2]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[14] x A[2]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[14] x A[3]\n\t"
        "movq	112(%[a]), %%rdx\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[14] x A[4]\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[14] x A[5]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r14, 120(%[tmp])\n\t"
        "movq	%%r15, 128(%[tmp])\n\t"
        "movq	%%r10, 136(%[tmp])\n\t"
        "movq	%%r11, 144(%[tmp])\n\t"
        "movq	%%r12, 152(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r13\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r13, 160(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 4\n\t"
        "movq	56(%[tmp]), %%r13\n\t"
        "movq	64(%[tmp]), %%r14\n\t"
        "movq	72(%[tmp]), %%r15\n\t"
        "movq	80(%[tmp]), %%r10\n\t"
        "movq	88(%[tmp]), %%r11\n\t"
        "movq	96(%[tmp]), %%r12\n\t"
        "# A[4] x A[3]\n\t"
        "movq	24(%[a]), %%rdx\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[5] x A[3]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[6] x A[3]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[7] x A[3]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[8] x A[3]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r13, 56(%[tmp])\n\t"
        "movq	%%r14, 64(%[tmp])\n\t"
        "movq	%%r15, 72(%[tmp])\n\t"
        "movq	%%r10, 80(%[tmp])\n\t"
        "movq	%%r11, 88(%[tmp])\n\t"
        "movq	104(%[tmp]), %%r13\n\t"
        "movq	112(%[tmp]), %%r14\n\t"
        "movq	120(%[tmp]), %%r15\n\t"
        "movq	128(%[tmp]), %%r10\n\t"
        "movq	136(%[tmp]), %%r11\n\t"
        "# A[9] x A[3]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[10] x A[3]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[11] x A[3]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[12] x A[3]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[13] x A[3]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "movq	%%r12, 96(%[tmp])\n\t"
        "movq	%%r13, 104(%[tmp])\n\t"
        "movq	%%r14, 112(%[tmp])\n\t"
        "movq	%%r15, 120(%[tmp])\n\t"
        "movq	%%r10, 128(%[tmp])\n\t"
        "movq	144(%[tmp]), %%r12\n\t"
        "movq	152(%[tmp]), %%r13\n\t"
        "movq	160(%[tmp]), %%r14\n\t"
        "movq	%%r8, %%r15\n\t"
        "movq	%%r8, %%r10\n\t"
        "# A[13] x A[4]\n\t"
        "movq	104(%[a]), %%rdx\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[13] x A[5]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[13] x A[6]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[13] x A[7]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[13] x A[8]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "movq	%%r11, 136(%[tmp])\n\t"
        "movq	%%r12, 144(%[tmp])\n\t"
        "movq	%%r13, 152(%[tmp])\n\t"
        "movq	%%r14, 160(%[tmp])\n\t"
        "movq	%%r15, 168(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r10\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r10, 176(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 5\n\t"
        "movq	72(%[tmp]), %%r10\n\t"
        "movq	80(%[tmp]), %%r11\n\t"
        "movq	88(%[tmp]), %%r12\n\t"
        "movq	96(%[tmp]), %%r13\n\t"
        "movq	104(%[tmp]), %%r14\n\t"
        "movq	112(%[tmp]), %%r15\n\t"
        "# A[5] x A[4]\n\t"
        "movq	32(%[a]), %%rdx\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[6] x A[4]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[7] x A[4]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[8] x A[4]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[9] x A[4]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r10, 72(%[tmp])\n\t"
        "movq	%%r11, 80(%[tmp])\n\t"
        "movq	%%r12, 88(%[tmp])\n\t"
        "movq	%%r13, 96(%[tmp])\n\t"
        "movq	%%r14, 104(%[tmp])\n\t"
        "movq	120(%[tmp]), %%r10\n\t"
        "movq	128(%[tmp]), %%r11\n\t"
        "movq	136(%[tmp]), %%r12\n\t"
        "movq	144(%[tmp]), %%r13\n\t"
        "movq	152(%[tmp]), %%r14\n\t"
        "# A[10] x A[4]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[11] x A[4]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[12] x A[4]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[12] x A[5]\n\t"
        "movq	96(%[a]), %%rdx\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[12] x A[6]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r15, 112(%[tmp])\n\t"
        "movq	%%r10, 120(%[tmp])\n\t"
        "movq	%%r11, 128(%[tmp])\n\t"
        "movq	%%r12, 136(%[tmp])\n\t"
        "movq	%%r13, 144(%[tmp])\n\t"
        "movq	160(%[tmp]), %%r15\n\t"
        "movq	168(%[tmp]), %%r10\n\t"
        "movq	176(%[tmp]), %%r11\n\t"
        "movq	%%r8, %%r12\n\t"
        "movq	%%r8, %%r13\n\t"
        "# A[12] x A[7]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[12] x A[8]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[12] x A[9]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[12] x A[10]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[12] x A[11]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r14, 152(%[tmp])\n\t"
        "movq	%%r15, 160(%[tmp])\n\t"
        "movq	%%r10, 168(%[tmp])\n\t"
        "movq	%%r11, 176(%[tmp])\n\t"
        "movq	%%r12, 184(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r13\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r13, 192(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 6\n\t"
        "movq	88(%[tmp]), %%r13\n\t"
        "movq	96(%[tmp]), %%r14\n\t"
        "movq	104(%[tmp]), %%r15\n\t"
        "movq	112(%[tmp]), %%r10\n\t"
        "movq	120(%[tmp]), %%r11\n\t"
        "movq	128(%[tmp]), %%r12\n\t"
        "# A[6] x A[5]\n\t"
        "movq	40(%[a]), %%rdx\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[7] x A[5]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[8] x A[5]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[9] x A[5]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[10] x A[5]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r13, 88(%[tmp])\n\t"
        "movq	%%r14, 96(%[tmp])\n\t"
        "movq	%%r15, 104(%[tmp])\n\t"
        "movq	%%r10, 112(%[tmp])\n\t"
        "movq	%%r11, 120(%[tmp])\n\t"
        "movq	136(%[tmp]), %%r13\n\t"
        "movq	144(%[tmp]), %%r14\n\t"
        "movq	152(%[tmp]), %%r15\n\t"
        "movq	160(%[tmp]), %%r10\n\t"
        "movq	168(%[tmp]), %%r11\n\t"
        "# A[11] x A[5]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[11] x A[6]\n\t"
        "movq	88(%[a]), %%rdx\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[11] x A[7]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[11] x A[8]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[11] x A[9]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "movq	%%r12, 128(%[tmp])\n\t"
        "movq	%%r13, 136(%[tmp])\n\t"
        "movq	%%r14, 144(%[tmp])\n\t"
        "movq	%%r15, 152(%[tmp])\n\t"
        "movq	%%r10, 160(%[tmp])\n\t"
        "movq	176(%[tmp]), %%r12\n\t"
        "movq	184(%[tmp]), %%r13\n\t"
        "movq	192(%[tmp]), %%r14\n\t"
        "movq	%%r8, %%r15\n\t"
        "movq	%%r8, %%r10\n\t"
        "# A[11] x A[10]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[13] x A[9]\n\t"
        "movq	104(%[a]), %%rdx\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[13] x A[10]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[13] x A[11]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[13] x A[12]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "movq	%%r11, 168(%[tmp])\n\t"
        "movq	%%r12, 176(%[tmp])\n\t"
        "movq	%%r13, 184(%[tmp])\n\t"
        "movq	%%r14, 192(%[tmp])\n\t"
        "movq	%%r15, 200(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r10\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r10, 208(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 7\n\t"
        "movq	104(%[tmp]), %%r10\n\t"
        "movq	112(%[tmp]), %%r11\n\t"
        "movq	120(%[tmp]), %%r12\n\t"
        "movq	128(%[tmp]), %%r13\n\t"
        "movq	136(%[tmp]), %%r14\n\t"
        "movq	144(%[tmp]), %%r15\n\t"
        "# A[7] x A[6]\n\t"
        "movq	48(%[a]), %%rdx\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[8] x A[6]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[9] x A[6]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[10] x A[6]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[10] x A[7]\n\t"
        "movq	80(%[a]), %%rdx\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r10, 104(%[tmp])\n\t"
        "movq	%%r11, 112(%[tmp])\n\t"
        "movq	%%r12, 120(%[tmp])\n\t"
        "movq	%%r13, 128(%[tmp])\n\t"
        "movq	%%r14, 136(%[tmp])\n\t"
        "movq	152(%[tmp]), %%r10\n\t"
        "movq	160(%[tmp]), %%r11\n\t"
        "movq	168(%[tmp]), %%r12\n\t"
        "movq	176(%[tmp]), %%r13\n\t"
        "movq	184(%[tmp]), %%r14\n\t"
        "# A[10] x A[8]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[10] x A[9]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[14] x A[6]\n\t"
        "movq	112(%[a]), %%rdx\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[14] x A[7]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[14] x A[8]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r15, 144(%[tmp])\n\t"
        "movq	%%r10, 152(%[tmp])\n\t"
        "movq	%%r11, 160(%[tmp])\n\t"
        "movq	%%r12, 168(%[tmp])\n\t"
        "movq	%%r13, 176(%[tmp])\n\t"
        "movq	192(%[tmp]), %%r15\n\t"
        "movq	200(%[tmp]), %%r10\n\t"
        "movq	208(%[tmp]), %%r11\n\t"
        "movq	%%r8, %%r12\n\t"
        "movq	%%r8, %%r13\n\t"
        "# A[14] x A[9]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[14] x A[10]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[14] x A[11]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[14] x A[12]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[14] x A[13]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r14, 184(%[tmp])\n\t"
        "movq	%%r15, 192(%[tmp])\n\t"
        "movq	%%r10, 200(%[tmp])\n\t"
        "movq	%%r11, 208(%[tmp])\n\t"
        "movq	%%r12, 216(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r13\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r13, 224(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 8\n\t"
        "movq	120(%[tmp]), %%r13\n\t"
        "movq	128(%[tmp]), %%r14\n\t"
        "movq	136(%[tmp]), %%r15\n\t"
        "movq	144(%[tmp]), %%r10\n\t"
        "movq	152(%[tmp]), %%r11\n\t"
        "movq	160(%[tmp]), %%r12\n\t"
        "# A[8] x A[7]\n\t"
        "movq	56(%[a]), %%rdx\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[9] x A[7]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[9] x A[8]\n\t"
        "movq	64(%[a]), %%rdx\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[15] x A[3]\n\t"
        "movq	120(%[a]), %%rdx\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[15] x A[4]\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r13, 120(%[tmp])\n\t"
        "movq	%%r14, 128(%[tmp])\n\t"
        "movq	%%r15, 136(%[tmp])\n\t"
        "movq	%%r10, 144(%[tmp])\n\t"
        "movq	%%r11, 152(%[tmp])\n\t"
        "movq	168(%[tmp]), %%r13\n\t"
        "movq	176(%[tmp]), %%r14\n\t"
        "movq	184(%[tmp]), %%r15\n\t"
        "movq	192(%[tmp]), %%r10\n\t"
        "movq	200(%[tmp]), %%r11\n\t"
        "# A[15] x A[5]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[15] x A[6]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[15] x A[7]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[15] x A[8]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[15] x A[9]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "movq	%%r12, 160(%[tmp])\n\t"
        "movq	%%r13, 168(%[tmp])\n\t"
        "movq	%%r14, 176(%[tmp])\n\t"
        "movq	%%r15, 184(%[tmp])\n\t"
        "movq	%%r10, 192(%[tmp])\n\t"
        "movq	208(%[tmp]), %%r12\n\t"
        "movq	216(%[tmp]), %%r13\n\t"
        "movq	224(%[tmp]), %%r14\n\t"
        "movq	%%r8, %%r15\n\t"
        "movq	%%r8, %%r10\n\t"
        "# A[15] x A[10]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[15] x A[11]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[15] x A[12]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[15] x A[13]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[15] x A[14]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "movq	%%r11, 200(%[tmp])\n\t"
        "movq	%%r12, 208(%[tmp])\n\t"
        "movq	%%r13, 216(%[tmp])\n\t"
        "movq	%%r14, 224(%[tmp])\n\t"
        "movq	%%r15, 232(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r10\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r10, 240(%[tmp])\n\t"
        "movq	%%r9, 248(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Double and Add in A[i] x A[i]\n\t"
        "movq	8(%[tmp]), %%r11\n\t"
        "# A[0] x A[0]\n\t"
        "movq	0(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "movq	%%rax, 0(%[tmp])\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r11, 8(%[tmp])\n\t"
        "movq	16(%[tmp]), %%r10\n\t"
        "movq	24(%[tmp]), %%r11\n\t"
        "# A[1] x A[1]\n\t"
        "movq	8(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 16(%[tmp])\n\t"
        "movq	%%r11, 24(%[tmp])\n\t"
        "movq	32(%[tmp]), %%r10\n\t"
        "movq	40(%[tmp]), %%r11\n\t"
        "# A[2] x A[2]\n\t"
        "movq	16(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 32(%[tmp])\n\t"
        "movq	%%r11, 40(%[tmp])\n\t"
        "movq	48(%[tmp]), %%r10\n\t"
        "movq	56(%[tmp]), %%r11\n\t"
        "# A[3] x A[3]\n\t"
        "movq	24(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 48(%[tmp])\n\t"
        "movq	%%r11, 56(%[tmp])\n\t"
        "movq	64(%[tmp]), %%r10\n\t"
        "movq	72(%[tmp]), %%r11\n\t"
        "# A[4] x A[4]\n\t"
        "movq	32(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 64(%[tmp])\n\t"
        "movq	%%r11, 72(%[tmp])\n\t"
        "movq	80(%[tmp]), %%r10\n\t"
        "movq	88(%[tmp]), %%r11\n\t"
        "# A[5] x A[5]\n\t"
        "movq	40(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 80(%[tmp])\n\t"
        "movq	%%r11, 88(%[tmp])\n\t"
        "movq	96(%[tmp]), %%r10\n\t"
        "movq	104(%[tmp]), %%r11\n\t"
        "# A[6] x A[6]\n\t"
        "movq	48(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 96(%[tmp])\n\t"
        "movq	%%r11, 104(%[tmp])\n\t"
        "movq	112(%[tmp]), %%r10\n\t"
        "movq	120(%[tmp]), %%r11\n\t"
        "# A[7] x A[7]\n\t"
        "movq	56(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 112(%[tmp])\n\t"
        "movq	%%r11, 120(%[tmp])\n\t"
        "movq	128(%[tmp]), %%r10\n\t"
        "movq	136(%[tmp]), %%r11\n\t"
        "# A[8] x A[8]\n\t"
        "movq	64(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 128(%[r])\n\t"
        "movq	%%r11, 136(%[r])\n\t"
        "movq	144(%[tmp]), %%r10\n\t"
        "movq	152(%[tmp]), %%r11\n\t"
        "# A[9] x A[9]\n\t"
        "movq	72(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 144(%[r])\n\t"
        "movq	%%r11, 152(%[r])\n\t"
        "movq	160(%[tmp]), %%r10\n\t"
        "movq	168(%[tmp]), %%r11\n\t"
        "# A[10] x A[10]\n\t"
        "movq	80(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 160(%[r])\n\t"
        "movq	%%r11, 168(%[r])\n\t"
        "movq	176(%[tmp]), %%r10\n\t"
        "movq	184(%[tmp]), %%r11\n\t"
        "# A[11] x A[11]\n\t"
        "movq	88(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 176(%[r])\n\t"
        "movq	%%r11, 184(%[r])\n\t"
        "movq	192(%[tmp]), %%r10\n\t"
        "movq	200(%[tmp]), %%r11\n\t"
        "# A[12] x A[12]\n\t"
        "movq	96(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 192(%[r])\n\t"
        "movq	%%r11, 200(%[r])\n\t"
        "movq	208(%[tmp]), %%r10\n\t"
        "movq	216(%[tmp]), %%r11\n\t"
        "# A[13] x A[13]\n\t"
        "movq	104(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 208(%[r])\n\t"
        "movq	%%r11, 216(%[r])\n\t"
        "movq	224(%[tmp]), %%r10\n\t"
        "movq	232(%[tmp]), %%r11\n\t"
        "# A[14] x A[14]\n\t"
        "movq	112(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 224(%[r])\n\t"
        "movq	%%r11, 232(%[r])\n\t"
        "movq	240(%[tmp]), %%r10\n\t"
        "movq	248(%[tmp]), %%r11\n\t"
        "# A[15] x A[15]\n\t"
        "movq	120(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 240(%[r])\n\t"
        "movq	%%r11, 248(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [tmp] "r" (tmp)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10", "r11",
          "r12", "r13", "r14", "r15"
    );

    XMEMCPY(r, tmp, sizeof(tmp)/2);
}

/* Add b to a into r. (r = a + b)
 *
 * Works on 16 digits: the assembly touches byte offsets 0..120 of each
 * operand in 8-byte steps. Every digit is loaded from a, added to the
 * matching digit of b and stored to r before the next digit is read, so r
 * may be the same pointer as a and/or b.
 *
 * r  A single precision integer. Receives the 16-digit sum.
 * a  A single precision integer. 16 digits are read.
 * b  A single precision integer. 16 digits are read.
 *
 * returns the carry out of the most significant digit (0 or 1).
 */
static sp_digit sp_2048_add_16(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    /* Starts at zero; the final adcq folds the carry flag into it. */
    sp_digit c = 0;

    __asm__ __volatile__ (
        /* Digit 0: addq starts the carry chain. */
        "movq	(%[a]), %%rax\n\t"
        "addq	(%[b]), %%rax\n\t"
        "movq	%%rax, (%[r])\n\t"
        /* Digits 1-15: adcq adds the carry from the digit below. The movq
         * loads and stores in between do not modify the flags, so the carry
         * survives from one adcq to the next. */
        "movq	8(%[a]), %%rax\n\t"
        "adcq	8(%[b]), %%rax\n\t"
        "movq	%%rax, 8(%[r])\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "adcq	16(%[b]), %%rax\n\t"
        "movq	%%rax, 16(%[r])\n\t"
        "movq	24(%[a]), %%rax\n\t"
        "adcq	24(%[b]), %%rax\n\t"
        "movq	%%rax, 24(%[r])\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "adcq	32(%[b]), %%rax\n\t"
        "movq	%%rax, 32(%[r])\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "adcq	40(%[b]), %%rax\n\t"
        "movq	%%rax, 40(%[r])\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "adcq	48(%[b]), %%rax\n\t"
        "movq	%%rax, 48(%[r])\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "adcq	56(%[b]), %%rax\n\t"
        "movq	%%rax, 56(%[r])\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "adcq	64(%[b]), %%rax\n\t"
        "movq	%%rax, 64(%[r])\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "adcq	72(%[b]), %%rax\n\t"
        "movq	%%rax, 72(%[r])\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "adcq	80(%[b]), %%rax\n\t"
        "movq	%%rax, 80(%[r])\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "adcq	88(%[b]), %%rax\n\t"
        "movq	%%rax, 88(%[r])\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "adcq	96(%[b]), %%rax\n\t"
        "movq	%%rax, 96(%[r])\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "adcq	104(%[b]), %%rax\n\t"
        "movq	%%rax, 104(%[r])\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "adcq	112(%[b]), %%rax\n\t"
        "movq	%%rax, 112(%[r])\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "adcq	120(%[b]), %%rax\n\t"
        "movq	%%rax, 120(%[r])\n\t"
        /* c = 0 + 0 + carry flag: capture the carry out of the top digit. */
        "adcq	$0, %[c]\n\t"
        : [c] "+r" (c)
        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
        : "memory", "rax"
    );

    return c;
}

/* Sub b from a into a. (a -= b)
 *
 * Works on 32 digits: the assembly touches byte offsets 0..248 of each
 * operand in 8-byte steps.
 *
 * a  A single precision integer and result. 32 digits are read and written.
 * b  A single precision integer. 32 digits are read.
 *
 * returns 0 when there is no borrow out of the most significant digit and
 * 0 - 1 (all bits set) when there is one.
 */
static sp_digit sp_2048_sub_in_place_32(sp_digit* a, const sp_digit* b)
{
    /* Starts at zero; the final sbbq subtracts the borrow flag from it. */
    sp_digit c = 0;

    __asm__ __volatile__ (
        /* Two register pairs alternate by digit: even digits use r8 (from a)
         * and rdx (from b), odd digits use r9 and rcx. The loads for the
         * following digits are issued between a subtract and the store of
         * its result; movq does not modify the flags, so the borrow is
         * carried from subq through every sbbq. */
        "movq	0(%[a]), %%r8\n\t"
        "movq	8(%[a]), %%r9\n\t"
        "movq	0(%[b]), %%rdx\n\t"
        "movq	8(%[b]), %%rcx\n\t"
        /* Digit 0: subq starts the borrow chain. */
        "subq	%%rdx, %%r8\n\t"
        "movq	16(%[b]), %%rdx\n\t"
        "movq	%%r8, 0(%[a])\n\t"
        "movq	16(%[a]), %%r8\n\t"
        /* Digits 1-31: sbbq subtracts with the borrow from the digit below. */
        "sbbq	%%rcx, %%r9\n\t"
        "movq	24(%[b]), %%rcx\n\t"
        "movq	%%r9, 8(%[a])\n\t"
        "movq	24(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	32(%[b]), %%rdx\n\t"
        "movq	%%r8, 16(%[a])\n\t"
        "movq	32(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	40(%[b]), %%rcx\n\t"
        "movq	%%r9, 24(%[a])\n\t"
        "movq	40(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	48(%[b]), %%rdx\n\t"
        "movq	%%r8, 32(%[a])\n\t"
        "movq	48(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	56(%[b]), %%rcx\n\t"
        "movq	%%r9, 40(%[a])\n\t"
        "movq	56(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	64(%[b]), %%rdx\n\t"
        "movq	%%r8, 48(%[a])\n\t"
        "movq	64(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	72(%[b]), %%rcx\n\t"
        "movq	%%r9, 56(%[a])\n\t"
        "movq	72(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	80(%[b]), %%rdx\n\t"
        "movq	%%r8, 64(%[a])\n\t"
        "movq	80(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	88(%[b]), %%rcx\n\t"
        "movq	%%r9, 72(%[a])\n\t"
        "movq	88(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	96(%[b]), %%rdx\n\t"
        "movq	%%r8, 80(%[a])\n\t"
        "movq	96(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	104(%[b]), %%rcx\n\t"
        "movq	%%r9, 88(%[a])\n\t"
        "movq	104(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	112(%[b]), %%rdx\n\t"
        "movq	%%r8, 96(%[a])\n\t"
        "movq	112(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	120(%[b]), %%rcx\n\t"
        "movq	%%r9, 104(%[a])\n\t"
        "movq	120(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	128(%[b]), %%rdx\n\t"
        "movq	%%r8, 112(%[a])\n\t"
        "movq	128(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	136(%[b]), %%rcx\n\t"
        "movq	%%r9, 120(%[a])\n\t"
        "movq	136(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	144(%[b]), %%rdx\n\t"
        "movq	%%r8, 128(%[a])\n\t"
        "movq	144(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	152(%[b]), %%rcx\n\t"
        "movq	%%r9, 136(%[a])\n\t"
        "movq	152(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	160(%[b]), %%rdx\n\t"
        "movq	%%r8, 144(%[a])\n\t"
        "movq	160(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	168(%[b]), %%rcx\n\t"
        "movq	%%r9, 152(%[a])\n\t"
        "movq	168(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	176(%[b]), %%rdx\n\t"
        "movq	%%r8, 160(%[a])\n\t"
        "movq	176(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	184(%[b]), %%rcx\n\t"
        "movq	%%r9, 168(%[a])\n\t"
        "movq	184(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	192(%[b]), %%rdx\n\t"
        "movq	%%r8, 176(%[a])\n\t"
        "movq	192(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	200(%[b]), %%rcx\n\t"
        "movq	%%r9, 184(%[a])\n\t"
        "movq	200(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	208(%[b]), %%rdx\n\t"
        "movq	%%r8, 192(%[a])\n\t"
        "movq	208(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	216(%[b]), %%rcx\n\t"
        "movq	%%r9, 200(%[a])\n\t"
        "movq	216(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	224(%[b]), %%rdx\n\t"
        "movq	%%r8, 208(%[a])\n\t"
        "movq	224(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	232(%[b]), %%rcx\n\t"
        "movq	%%r9, 216(%[a])\n\t"
        "movq	232(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	240(%[b]), %%rdx\n\t"
        "movq	%%r8, 224(%[a])\n\t"
        "movq	240(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	248(%[b]), %%rcx\n\t"
        "movq	%%r9, 232(%[a])\n\t"
        "movq	248(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	%%r8, 240(%[a])\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	%%r9, 248(%[a])\n\t"
        /* c = 0 - 0 - borrow flag: 0, or all ones when the top digit
         * borrowed. */
        "sbbq	$0, %[c]\n\t"
        : [c] "+r" (c)
        : [a] "r" (a), [b] "r" (b)
        : "memory", "rdx", "rcx", "r8", "r9"
    );

    return c;
}

/* Add b to a into r. (r = a + b)
 *
 * Works on 32 digits: the assembly touches byte offsets 0..248 of each
 * operand in 8-byte steps. Every digit is loaded from a, added to the
 * matching digit of b and stored to r before the next digit is read, so r
 * may be the same pointer as a and/or b.
 *
 * r  A single precision integer. Receives the 32-digit sum.
 * a  A single precision integer. 32 digits are read.
 * b  A single precision integer. 32 digits are read.
 *
 * returns the carry out of the most significant digit (0 or 1).
 */
static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    /* Starts at zero; the final adcq folds the carry flag into it. */
    sp_digit c = 0;

    __asm__ __volatile__ (
        /* Digit 0: addq starts the carry chain. */
        "movq	(%[a]), %%rax\n\t"
        "addq	(%[b]), %%rax\n\t"
        "movq	%%rax, (%[r])\n\t"
        /* Digits 1-31: adcq adds the carry from the digit below. The movq
         * loads and stores in between do not modify the flags, so the carry
         * survives from one adcq to the next. */
        "movq	8(%[a]), %%rax\n\t"
        "adcq	8(%[b]), %%rax\n\t"
        "movq	%%rax, 8(%[r])\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "adcq	16(%[b]), %%rax\n\t"
        "movq	%%rax, 16(%[r])\n\t"
        "movq	24(%[a]), %%rax\n\t"
        "adcq	24(%[b]), %%rax\n\t"
        "movq	%%rax, 24(%[r])\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "adcq	32(%[b]), %%rax\n\t"
        "movq	%%rax, 32(%[r])\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "adcq	40(%[b]), %%rax\n\t"
        "movq	%%rax, 40(%[r])\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "adcq	48(%[b]), %%rax\n\t"
        "movq	%%rax, 48(%[r])\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "adcq	56(%[b]), %%rax\n\t"
        "movq	%%rax, 56(%[r])\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "adcq	64(%[b]), %%rax\n\t"
        "movq	%%rax, 64(%[r])\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "adcq	72(%[b]), %%rax\n\t"
        "movq	%%rax, 72(%[r])\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "adcq	80(%[b]), %%rax\n\t"
        "movq	%%rax, 80(%[r])\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "adcq	88(%[b]), %%rax\n\t"
        "movq	%%rax, 88(%[r])\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "adcq	96(%[b]), %%rax\n\t"
        "movq	%%rax, 96(%[r])\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "adcq	104(%[b]), %%rax\n\t"
        "movq	%%rax, 104(%[r])\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "adcq	112(%[b]), %%rax\n\t"
        "movq	%%rax, 112(%[r])\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "adcq	120(%[b]), %%rax\n\t"
        "movq	%%rax, 120(%[r])\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "adcq	128(%[b]), %%rax\n\t"
        "movq	%%rax, 128(%[r])\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "adcq	136(%[b]), %%rax\n\t"
        "movq	%%rax, 136(%[r])\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "adcq	144(%[b]), %%rax\n\t"
        "movq	%%rax, 144(%[r])\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "adcq	152(%[b]), %%rax\n\t"
        "movq	%%rax, 152(%[r])\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "adcq	160(%[b]), %%rax\n\t"
        "movq	%%rax, 160(%[r])\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "adcq	168(%[b]), %%rax\n\t"
        "movq	%%rax, 168(%[r])\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "adcq	176(%[b]), %%rax\n\t"
        "movq	%%rax, 176(%[r])\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "adcq	184(%[b]), %%rax\n\t"
        "movq	%%rax, 184(%[r])\n\t"
        "movq	192(%[a]), %%rax\n\t"
        "adcq	192(%[b]), %%rax\n\t"
        "movq	%%rax, 192(%[r])\n\t"
        "movq	200(%[a]), %%rax\n\t"
        "adcq	200(%[b]), %%rax\n\t"
        "movq	%%rax, 200(%[r])\n\t"
        "movq	208(%[a]), %%rax\n\t"
        "adcq	208(%[b]), %%rax\n\t"
        "movq	%%rax, 208(%[r])\n\t"
        "movq	216(%[a]), %%rax\n\t"
        "adcq	216(%[b]), %%rax\n\t"
        "movq	%%rax, 216(%[r])\n\t"
        "movq	224(%[a]), %%rax\n\t"
        "adcq	224(%[b]), %%rax\n\t"
        "movq	%%rax, 224(%[r])\n\t"
        "movq	232(%[a]), %%rax\n\t"
        "adcq	232(%[b]), %%rax\n\t"
        "movq	%%rax, 232(%[r])\n\t"
        "movq	240(%[a]), %%rax\n\t"
        "adcq	240(%[b]), %%rax\n\t"
        "movq	%%rax, 240(%[r])\n\t"
        "movq	248(%[a]), %%rax\n\t"
        "adcq	248(%[b]), %%rax\n\t"
        "movq	%%rax, 248(%[r])\n\t"
        /* c = 0 + 0 + carry flag: capture the carry out of the top digit. */
        "adcq	$0, %[c]\n\t"
        : [c] "+r" (c)
        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
        : "memory", "rax"
    );

    return c;
}

/* Mask a 16-digit number: r[i] = a[i] & m for each of the 16 digits.
 *
 * Digits are processed from index 0 upwards, each one read from a before
 * the matching digit of r is written.
 *
 * r  A single precision integer. Receives the 16 masked digits.
 * a  A single precision integer. 16 digits are read.
 * m  Mask to AND against each digit.
 */
static void sp_2048_mask_16(sp_digit* r, sp_digit* a, sp_digit m)
{
#ifdef WOLFSSL_SP_SMALL
    int idx = 0;

    /* Compact form: one digit per iteration. */
    while (idx < 16) {
        r[idx] = a[idx] & m;
        idx++;
    }
#else
    sp_digit* dst = r;
    sp_digit* src = a;
    int done = 0;

    /* Unrolled form: eight digits per iteration, two iterations. */
    while (done < 16) {
        dst[0] = src[0] & m;
        dst[1] = src[1] & m;
        dst[2] = src[2] & m;
        dst[3] = src[3] & m;
        dst[4] = src[4] & m;
        dst[5] = src[5] & m;
        dst[6] = src[6] & m;
        dst[7] = src[7] & m;
        dst += 8;
        src += 8;
        done += 8;
    }
#endif
}

/* Multiply a and b into r. (r = a * b)
 *
 * One level of Karatsuba: each 32-digit operand is split into two 16-digit
 * halves and the 64-digit product is assembled from three 16x16 products
 * (low * low, high * high and (low + high) * (low + high)).
 *
 * r  A single precision integer. Receives the 64-digit product.
 * a  A single precision integer. 32 digits are read.
 * b  A single precision integer. 32 digits are read.
 */
static void sp_2048_mul_32(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit sum_a[16];           /* a low half + a high half, 16 digits  */
    sp_digit sum_b[16];           /* b low half + b high half, 16 digits  */
    sp_digit mid[32];             /* sum_a * sum_b, then the middle term  */
    sp_digit high[32];            /* a high half * b high half            */
    sp_digit carry_a;
    sp_digit carry_b;
    sp_digit top;
    sp_digit* r_mid  = &r[16];    /* where the middle term is added       */
    sp_digit* r_high = &r[32];    /* where the high product is added      */

    /* Half sums; the carries are the 17th digit of each sum. */
    carry_a = sp_2048_add_16(sum_a, a, a + 16);
    carry_b = sp_2048_add_16(sum_b, b, b + 16);
    top = carry_a & carry_b;

    /* The three half-size products; the low one lands directly in r. */
    sp_2048_mul_16(mid, sum_a, sum_b);
    sp_2048_mul_16(high, a + 16, b + 16);
    sp_2048_mul_16(r, a, b);

    /* Account for the dropped carries: add sum_a when carry_b is set and
     * sum_b when carry_a is set, 16 digits above the middle term. */
    sp_2048_mask_16(r_high, sum_a, 0 - carry_b);
    sp_2048_mask_16(sum_b, sum_b, 0 - carry_a);
    top += sp_2048_add_16(r_high, r_high, sum_b);

    /* Middle term = mid - high - low; borrows come back as 0 - 1. */
    top += sp_2048_sub_in_place_32(mid, high);
    top += sp_2048_sub_in_place_32(mid, r);
    top += sp_2048_add_32(r_mid, r_mid, mid);

    /* Store the accumulated carry, clear the digits above it, then add in
     * the high product. */
    r[48] = top;
    XMEMSET(&r[49], 0, 15 * sizeof(sp_digit));
    sp_2048_add_32(r_high, r_high, high);
}

/* Square a and put result in r. (r = a * a)
 *
 * One level of Karatsuba: a is split into two 16-digit halves and the
 * 64-digit square is assembled from three 16-digit squares (low, high and
 * low + high).
 *
 * r  A single precision integer. Receives the 64-digit square.
 * a  A single precision integer. 32 digits are read.
 */
static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a)
{
    sp_digit sum[16];             /* low half + high half, 16 digits      */
    sp_digit mid[32];             /* sum squared, then the middle term    */
    sp_digit high[32];            /* high half squared                    */
    sp_digit top;
    sp_digit* r_mid  = &r[16];    /* where the middle term is added       */
    sp_digit* r_high = &r[32];    /* where the high square is added       */

    /* Half sum; top starts as its carry (the 17th digit). */
    top = sp_2048_add_16(sum, a, a + 16);

    /* The three half-size squares; the low one lands directly in r. */
    sp_2048_sqr_16(mid, sum);
    sp_2048_sqr_16(high, a + 16);
    sp_2048_sqr_16(r, a);

    /* Account for the dropped carry: when it is set add 2 * sum, 16 digits
     * above the middle term (sum is masked in, then doubled in place). */
    sp_2048_mask_16(r_high, sum, 0 - top);
    top += sp_2048_add_16(r_high, r_high, r_high);

    /* Middle term = mid - high - low; borrows come back as 0 - 1. */
    top += sp_2048_sub_in_place_32(mid, high);
    top += sp_2048_sub_in_place_32(mid, r);
    top += sp_2048_add_32(r_mid, r_mid, mid);

    /* Store the accumulated carry, clear the digits above it, then add in
     * the high square. */
    r[48] = top;
    XMEMSET(&r[49], 0, 15 * sizeof(sp_digit));
    sp_2048_add_32(r_high, r_high, high);
}

/* Multiply a and b into r. (r = a * b)
 *
 * One level of Karatsuba built on sp_2048_mul_avx2_16: each 32-digit
 * operand is split into two 16-digit halves and the 64-digit product is
 * assembled from three 16x16 products.
 *
 * r  A single precision integer. Receives the 64-digit product.
 * a  A single precision integer. 32 digits are read.
 * b  A single precision integer. 32 digits are read.
 */
static void sp_2048_mul_avx2_32(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit sum_a[16];           /* a low half + a high half, 16 digits  */
    sp_digit sum_b[16];           /* b low half + b high half, 16 digits  */
    sp_digit mid[32];             /* sum_a * sum_b, then the middle term  */
    sp_digit high[32];            /* a high half * b high half            */
    sp_digit carry_a;
    sp_digit carry_b;
    sp_digit top;
    sp_digit* r_mid  = &r[16];    /* where the middle term is added       */
    sp_digit* r_high = &r[32];    /* where the high product is added      */

    /* Half sums; the carries are the 17th digit of each sum. */
    carry_a = sp_2048_add_16(sum_a, a, a + 16);
    carry_b = sp_2048_add_16(sum_b, b, b + 16);
    top = carry_a & carry_b;

    /* The three half-size products; the low one lands directly in r. */
    sp_2048_mul_avx2_16(mid, sum_a, sum_b);
    sp_2048_mul_avx2_16(high, a + 16, b + 16);
    sp_2048_mul_avx2_16(r, a, b);

    /* Account for the dropped carries: add sum_a when carry_b is set and
     * sum_b when carry_a is set, 16 digits above the middle term. */
    sp_2048_mask_16(r_high, sum_a, 0 - carry_b);
    sp_2048_mask_16(sum_b, sum_b, 0 - carry_a);
    top += sp_2048_add_16(r_high, r_high, sum_b);

    /* Middle term = mid - high - low; borrows come back as 0 - 1. */
    top += sp_2048_sub_in_place_32(mid, high);
    top += sp_2048_sub_in_place_32(mid, r);
    top += sp_2048_add_32(r_mid, r_mid, mid);

    /* Store the accumulated carry, clear the digits above it, then add in
     * the high product. */
    r[48] = top;
    XMEMSET(&r[49], 0, 15 * sizeof(sp_digit));
    sp_2048_add_32(r_high, r_high, high);
}

/* Square a and put result in r. (r = a * a)
 *
 * One level of Karatsuba built on sp_2048_sqr_avx2_16: a is split into two
 * 16-digit halves and the 64-digit square is assembled from three 16-digit
 * squares (low, high and low + high).
 *
 * r  A single precision integer. Receives the 64-digit square.
 * a  A single precision integer. 32 digits are read.
 */
static void sp_2048_sqr_avx2_32(sp_digit* r, const sp_digit* a)
{
    sp_digit sum[16];             /* low half + high half, 16 digits      */
    sp_digit mid[32];             /* sum squared, then the middle term    */
    sp_digit high[32];            /* high half squared                    */
    sp_digit top;
    sp_digit* r_mid  = &r[16];    /* where the middle term is added       */
    sp_digit* r_high = &r[32];    /* where the high square is added       */

    /* Half sum; top starts as its carry (the 17th digit). */
    top = sp_2048_add_16(sum, a, a + 16);

    /* The three half-size squares; the low one lands directly in r. */
    sp_2048_sqr_avx2_16(mid, sum);
    sp_2048_sqr_avx2_16(high, a + 16);
    sp_2048_sqr_avx2_16(r, a);

    /* Account for the dropped carry: when it is set add 2 * sum, 16 digits
     * above the middle term (sum is masked in, then doubled in place). */
    sp_2048_mask_16(r_high, sum, 0 - top);
    top += sp_2048_add_16(r_high, r_high, r_high);

    /* Middle term = mid - high - low; borrows come back as 0 - 1. */
    top += sp_2048_sub_in_place_32(mid, high);
    top += sp_2048_sub_in_place_32(mid, r);
    top += sp_2048_add_32(r_mid, r_mid, mid);

    /* Store the accumulated carry, clear the digits above it, then add in
     * the high square. */
    r[48] = top;
    XMEMSET(&r[49], 0, 15 * sizeof(sp_digit));
    sp_2048_add_32(r_high, r_high, high);
}

#if !defined(SP_RSA_PRIVATE_EXP_D) && defined(WOLFSSL_HAVE_SP_RSA)
#endif /* !SP_RSA_PRIVATE_EXP_D && WOLFSSL_HAVE_SP_RSA */

/* Calculate the bottom digit of -1/a mod 2^n.
 *
 * Only the least significant digit of a is used. An inverse of that digit
 * is seeded to 4 bits and refined by four Newton steps, each doubling the
 * number of correct low bits (8, 16, 32, 64), then negated.
 * Assumes a[0] is odd, as needed for an inverse modulo a power of two to
 * exist.
 *
 * a    A single precision number.
 * rho  Bottom word of inverse.
 */
static void sp_2048_mont_setup(sp_digit* a, sp_digit* rho)
{
    sp_digit low = a[0];
    sp_digit inv;
    int step;

    /* Seed: inv * low == 1 mod 2**4 */
    inv = (((low + 2) & 4) << 1) + low;

    /* After the steps inv * low == 1 mod 2**8, 2**16, 2**32, 2**64 */
    for (step = 0; step < 4; step++) {
        inv *= 2 - low * inv;
    }

    /* rho = -1/a[0] modulo the digit size */
    *rho = -inv;
}

#if !defined(SP_RSA_PRIVATE_EXP_D) && defined(WOLFSSL_HAVE_SP_RSA)
/* Sub b from a into a. (a -= b)
 *
 * Works on 16 digits: the assembly touches byte offsets 0..120 of each
 * operand in 8-byte steps.
 *
 * a  A single precision integer and result. 16 digits are read and written.
 * b  A single precision integer. 16 digits are read.
 *
 * returns 0 when there is no borrow out of the most significant digit and
 * 0 - 1 (all bits set) when there is one.
 */
static sp_digit sp_2048_sub_in_place_16(sp_digit* a, const sp_digit* b)
{
    /* Starts at zero; the final sbbq subtracts the borrow flag from it. */
    sp_digit c = 0;

    __asm__ __volatile__ (
        /* Two register pairs alternate by digit: even digits use r8 (from a)
         * and rdx (from b), odd digits use r9 and rcx. The loads for the
         * following digits are issued between a subtract and the store of
         * its result; movq does not modify the flags, so the borrow is
         * carried from subq through every sbbq. */
        "movq	0(%[a]), %%r8\n\t"
        "movq	8(%[a]), %%r9\n\t"
        "movq	0(%[b]), %%rdx\n\t"
        "movq	8(%[b]), %%rcx\n\t"
        /* Digit 0: subq starts the borrow chain. */
        "subq	%%rdx, %%r8\n\t"
        "movq	16(%[b]), %%rdx\n\t"
        "movq	%%r8, 0(%[a])\n\t"
        "movq	16(%[a]), %%r8\n\t"
        /* Digits 1-15: sbbq subtracts with the borrow from the digit below. */
        "sbbq	%%rcx, %%r9\n\t"
        "movq	24(%[b]), %%rcx\n\t"
        "movq	%%r9, 8(%[a])\n\t"
        "movq	24(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	32(%[b]), %%rdx\n\t"
        "movq	%%r8, 16(%[a])\n\t"
        "movq	32(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	40(%[b]), %%rcx\n\t"
        "movq	%%r9, 24(%[a])\n\t"
        "movq	40(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	48(%[b]), %%rdx\n\t"
        "movq	%%r8, 32(%[a])\n\t"
        "movq	48(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	56(%[b]), %%rcx\n\t"
        "movq	%%r9, 40(%[a])\n\t"
        "movq	56(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	64(%[b]), %%rdx\n\t"
        "movq	%%r8, 48(%[a])\n\t"
        "movq	64(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	72(%[b]), %%rcx\n\t"
        "movq	%%r9, 56(%[a])\n\t"
        "movq	72(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	80(%[b]), %%rdx\n\t"
        "movq	%%r8, 64(%[a])\n\t"
        "movq	80(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	88(%[b]), %%rcx\n\t"
        "movq	%%r9, 72(%[a])\n\t"
        "movq	88(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	96(%[b]), %%rdx\n\t"
        "movq	%%r8, 80(%[a])\n\t"
        "movq	96(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	104(%[b]), %%rcx\n\t"
        "movq	%%r9, 88(%[a])\n\t"
        "movq	104(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	112(%[b]), %%rdx\n\t"
        "movq	%%r8, 96(%[a])\n\t"
        "movq	112(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	120(%[b]), %%rcx\n\t"
        "movq	%%r9, 104(%[a])\n\t"
        "movq	120(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	%%r8, 112(%[a])\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	%%r9, 120(%[a])\n\t"
        /* c = 0 - 0 - borrow flag: 0, or all ones when the top digit
         * borrowed. */
        "sbbq	$0, %[c]\n\t"
        : [c] "+r" (c)
        : [a] "r" (a), [b] "r" (b)
        : "memory", "rdx", "rcx", "r8", "r9"
    );

    return c;
}

/* r = 2^n mod m where n is the number of bits to reduce by.
 *
 * Computes 0 - m over 16 digits, i.e. 2^n - m with n = 16 digits of 8 bytes
 * (1024 bits). That equals 2^n mod m assuming the most significant bit of
 * m's top digit is set, so a single subtraction is enough.
 *
 * r  A single precision number. Receives 16 digits.
 * m  A single precision number. 16 digits are read.
 */
static void sp_2048_mont_norm_16(sp_digit* r, sp_digit* m)
{
    /* Start from zero; wrapping past zero gives 2^n - m. */
    XMEMSET(r, 0, 16 * sizeof(*r));

    /* The borrow returned by the subtraction is not needed here. */
    (void)sp_2048_sub_in_place_16(r, m);
}

/* Conditionally subtract b from a using the mask m.
 * m is -1 to subtract and 0 when not subtracting.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 */
static sp_digit sp_2048_cond_sub_16(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit m)
{
    sp_digit t[16];
    sp_digit c = 0;

    __asm__ __volatile__ (
        "movq	0(%[b]), %%rax\n\t"
        "movq	8(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 0(%[t])\n\t"
        "movq	%%rcx, 8(%[t])\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "movq	24(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 16(%[t])\n\t"
        "movq	%%rcx, 24(%[t])\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "movq	40(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 32(%[t])\n\t"
        "movq	%%rcx, 40(%[t])\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "movq	56(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 48(%[t])\n\t"
        "movq	%%rcx, 56(%[t])\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "movq	72(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 64(%[t])\n\t"
        "movq	%%rcx, 72(%[t])\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "movq	88(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 80(%[t])\n\t"
        "movq	%%rcx, 88(%[t])\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "movq	104(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 96(%[t])\n\t"
        "movq	%%rcx, 104(%[t])\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "movq	120(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 112(%[t])\n\t"
        "movq	%%rcx, 120(%[t])\n\t"
        "movq	(%[a]), %%rax\n\t"
        "movq	(%[t]), %%rdx\n\t"
        "subq	%%rdx,%%rax\n\t"
        "movq	8(%[a]), %%rcx\n\t"
        "movq	8(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 0(%[r])\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "movq	16(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 8(%[r])\n\t"
        "movq	24(%[a]), %%rcx\n\t"
        "movq	24(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 16(%[r])\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "movq	32(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 24(%[r])\n\t"
        "movq	40(%[a]), %%rcx\n\t"
        "movq	40(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 32(%[r])\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "movq	48(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 40(%[r])\n\t"
        "movq	56(%[a]), %%rcx\n\t"
        "movq	56(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 48(%[r])\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "movq	64(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 56(%[r])\n\t"
        "movq	72(%[a]), %%rcx\n\t"
        "movq	72(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 64(%[r])\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "movq	80(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 72(%[r])\n\t"
        "movq	88(%[a]), %%rcx\n\t"
        "movq	88(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 80(%[r])\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "movq	96(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 88(%[r])\n\t"
        "movq	104(%[a]), %%rcx\n\t"
        "movq	104(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 96(%[r])\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "movq	112(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 104(%[r])\n\t"
        "movq	120(%[a]), %%rcx\n\t"
        "movq	120(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 112(%[r])\n\t"
        "movq	%%rcx, 120(%[r])\n\t"
        "sbbq	$0, %[c]\n\t"
        : [c] "+r" (c)
        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m), [t] "r" (t)
        : "memory", "rax", "rcx", "rdx"
    );

    return c;
}

/* Reduce the number back to the width of the modulus using Montgomery
 * reduction.
 *
 * The loop runs 16 times. Each pass computes mu = a[i] * mp, adds m * mu to
 * a[i..i+15] and propagates the carry into a[i+16], so the asm reads and
 * writes up to a[31]: a must hold 32 digits. The two lowest digits of the
 * current window are kept in r12/r13 between passes and only written back
 * after the loop. The asm advances the pointer a by one digit per pass, so on
 * exit a points at the upper 16 digits; the final conditional subtraction of
 * m (taken when the last carry ca is set) writes its result to a - 16, i.e.
 * the start of the caller's buffer.
 *
 * NOTE(review): the loop label is a plain assembler label rather than a
 * numbered local label; SP_NOINLINE presumably keeps the function from being
 * inlined so the label is not emitted twice - confirm.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    /* Carry out of the top digit, carried from one pass to the next. */
    sp_digit ca = 0;

    __asm__ __volatile__ (
        "# i = 0\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "movq	0(%[a]), %%r12\n\t"
        "movq	8(%[a]), %%r13\n\t"
        "\nL_mont_loop_16:\n\t"
        "# mu = a[i] * mp\n\t"
        "movq	%%r12, %%r10\n\t"
        "imulq	%[mp], %%r10\n\t"
        "# a[i+0] += m[0] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	0(%[m])\n\t"
        "addq	%%rax, %%r12\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "# a[i+1] += m[1] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	8(%[m])\n\t"
        "movq	%%r13, %%r12\n\t"
        "addq	%%rax, %%r12\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r12\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+2] += m[2] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	16(%[m])\n\t"
        "movq	16(%[a]), %%r13\n\t"
        "addq	%%rax, %%r13\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r13\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+3] += m[3] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	24(%[m])\n\t"
        "movq	24(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 24(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+4] += m[4] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	32(%[m])\n\t"
        "movq	32(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 32(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+5] += m[5] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	40(%[m])\n\t"
        "movq	40(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 40(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+6] += m[6] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	48(%[m])\n\t"
        "movq	48(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 48(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+7] += m[7] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	56(%[m])\n\t"
        "movq	56(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 56(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+8] += m[8] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	64(%[m])\n\t"
        "movq	64(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 64(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+9] += m[9] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	72(%[m])\n\t"
        "movq	72(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 72(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+10] += m[10] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	80(%[m])\n\t"
        "movq	80(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 80(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+11] += m[11] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	88(%[m])\n\t"
        "movq	88(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 88(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+12] += m[12] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	96(%[m])\n\t"
        "movq	96(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 96(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+13] += m[13] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	104(%[m])\n\t"
        "movq	104(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 104(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+14] += m[14] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	112(%[m])\n\t"
        "movq	112(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 112(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+15] += m[15] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "mulq	120(%[m])\n\t"
        "movq	120(%[a]), %%r11\n\t"
        "addq	%%rax, %%r9\n\t"
        /* Fold the carry from the previous pass into the high word, then
         * collect this pass's carry out of a[i+16] back into ca. */
        "adcq	%[ca], %%rdx\n\t"
        "movq	$0, %[ca]\n\t"
        "adcq	$0, %[ca]\n\t"
        "addq	%%r9, %%r11\n\t"
        "movq	%%r11, 120(%[a])\n\t"
        "adcq	%%rdx, 128(%[a])\n\t"
        "adcq	$0, %[ca]\n\t"
        "# i += 1\n\t"
        "addq	$8, %[a]\n\t"
        "addq	$8, %%rcx\n\t"
        "cmpq	$128, %%rcx\n\t"
        "jl	L_mont_loop_16\n\t"
        /* Write back the two digits that were cached in registers. */
        "movq	%%r12, 0(%[a])\n\t"
        "movq	%%r13, 8(%[a])\n\t"
        : [ca] "+r" (ca), [a] "+r" (a)
        : [m] "r" (m), [mp] "r" (mp)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10", "r11",
          "r12", "r13"
    );

    /* a now points at the upper 16 digits; store (upper - m) or upper into
     * the lower 16 digits depending on the final carry. */
    sp_2048_cond_sub_16(a - 16, a, m, (sp_digit)0 - ca);
}

/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * The product is written to r and then Montgomery reduced in place; the
 * reduction works on 32 digits of r, so r must have room for the double
 * width product.
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_2048_mont_mul_16(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit* m, sp_digit mp)
{
    sp_2048_mul_16(r, a, b);
    sp_2048_mont_reduce_16(r, m, mp);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * The square is written to r and then Montgomery reduced in place; the
 * reduction works on 32 digits of r, so r must have room for the double
 * width square.
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_2048_mont_sqr_16(sp_digit* r, sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    sp_2048_sqr_16(r, a);
    sp_2048_mont_reduce_16(r, m, mp);
}

/* Mul a by digit b into r. (r = a * b)
 *
 * Multiplies each of the 16 digits of a by b with mulq and accumulates the
 * partial products through three rotating registers (rbx, rcx, r8). The top
 * word of the product is stored at r[16], so r receives 17 digits.
 *
 * r  A single precision integer that receives the 17 digit product.
 * a  A single precision integer (16 digits).
 * b  A single precision digit.
 */
static void sp_2048_mul_d_16(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
    __asm__ __volatile__ (
        "# A[0] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	(%[a])\n\t"
        "movq	%%rax, %%rbx\n\t"
        "movq	%%rdx, %%rcx\n\t"
        "movq	%%rbx, 0(%[r])\n\t"
        "# A[1] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 8(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[2] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 16(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[3] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 24(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[4] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 32(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[5] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 40(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[6] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 48(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[7] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 56(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[8] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 64(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[9] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 72(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[10] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 80(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[11] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 88(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[12] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 96(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[13] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 104(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[14] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 112(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[15] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "movq	%%rbx, 120(%[r])\n\t"
        /* Top word of the product goes into the 17th digit. */
        "movq	%%rcx, 128(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
        : "memory", "rax", "rdx", "rbx", "rcx", "r8"
    );
}

/* Mul a by digit b into r. (r = a * b)
 *
 * Variant that uses the mulx, adcx and adox instructions; the caller visible
 * in this file only selects it when both the BMI2 and ADX CPU flags are set.
 * b is held in rdx for mulx, the low halves of the partial products are
 * accumulated on the carry flag chain (adcx) and the high halves on the
 * overflow flag chain (adox). The top word of the product is stored at
 * r[16], so r receives 17 digits.
 *
 * r  A single precision integer that receives the 17 digit product.
 * a  A single precision integer (16 digits).
 * b  A single precision digit.
 */
static void sp_2048_mul_d_avx2_16(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
    __asm__ __volatile__ (
        "# A[0] * B\n\t"
        "movq	%[b], %%rdx\n\t"
        /* r10 is the constant zero; the xor also clears both the carry and
         * the overflow flag before the adcx/adox chains start. */
        "xorq	%%r10, %%r10\n\t"
        "mulxq	(%[a]), %%r8, %%r9\n\t"
        "movq	%%r8, 0(%[r])\n\t"
        "# A[1] * B\n\t"
        "mulxq	8(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 8(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[2] * B\n\t"
        "mulxq	16(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 16(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[3] * B\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 24(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[4] * B\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 32(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[5] * B\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 40(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[6] * B\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 48(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[7] * B\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 56(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[8] * B\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 64(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[9] * B\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 72(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[10] * B\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 80(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[11] * B\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 88(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[12] * B\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 96(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[13] * B\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 104(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[14] * B\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 112(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[15] * B\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        /* Fold the last pending carry into the top word. */
        "adcxq	%%r10, %%r8\n\t"
        "movq	%%r9, 120(%[r])\n\t"
        "movq	%%r8, 128(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10"
    );
}

/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
 *
 * Uses the x86_64 divq instruction with d1 in rdx and d0 in rax; only the
 * quotient is returned, the remainder left in rdx is discarded.
 * divq raises a divide error when the quotient does not fit in one digit, so
 * callers must pass d1 < div (which also rules out div == 0).
 *
 * d1   The high order half of the number to divide.
 * d0   The low order half of the number to divide.
 * div  The divisor.
 * returns the result of the division.
 */
static sp_digit div_2048_word_16(sp_digit d1, sp_digit d0, sp_digit div)
{
    sp_digit r;

    __asm__ __volatile__ (
        "movq	%[d0], %%rax\n\t"
        "movq	%[d1], %%rdx\n\t"
        "divq	%[div]\n\t"
        "movq	%%rax, %[r]\n\t"
        : [r] "=r" (r)
        : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div)
        : "rax", "rdx"
    );

    return r;
}

/* Compare a with b in constant time.
 *
 * The digits are compared from most significant (index 15) down to least
 * significant. rdx is a mask that starts as all ones and is cleared at the
 * first pair of digits that differ; both digits are ANDed with the mask
 * before every comparison, so once a difference has been seen all lower
 * digits compare as equal and cannot change the result. The same instructions
 * execute for every input: there are no data dependent branches.
 *
 * The asm reads memory through a and b without naming it as an operand, so
 * the "memory" clobber is required to tell the compiler that all pending
 * stores to those arrays must be visible before the statement runs. "cc" is
 * listed because the flags are modified.
 *
 * a  A single precision integer (16 digits).
 * b  A single precision integer (16 digits).
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static int64_t sp_2048_cmp_16(sp_digit* a, sp_digit* b)
{
    /* Starts as -1; the final xor with the mask turns it into 0 when every
     * digit was equal. */
    sp_digit r = -1;
    sp_digit one = 1;

    __asm__ __volatile__ (
        /* rcx = 0 (used to clear the mask), rdx = all ones mask. */
        "xorq	%%rcx, %%rcx\n\t"
        "movq	$-1, %%rdx\n\t"
        "movq	120(%[a]), %%rbx\n\t"
        "movq	120(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        /* a > b: r = 1; a < b: r = mask (-1); not equal: mask = 0. */
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	112(%[a]), %%rbx\n\t"
        "movq	112(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	104(%[a]), %%rbx\n\t"
        "movq	104(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	96(%[a]), %%rbx\n\t"
        "movq	96(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	88(%[a]), %%rbx\n\t"
        "movq	88(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	80(%[a]), %%rbx\n\t"
        "movq	80(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	72(%[a]), %%rbx\n\t"
        "movq	72(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	64(%[a]), %%rbx\n\t"
        "movq	64(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	56(%[a]), %%rbx\n\t"
        "movq	56(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	48(%[a]), %%rbx\n\t"
        "movq	48(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	40(%[a]), %%rbx\n\t"
        "movq	40(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	32(%[a]), %%rbx\n\t"
        "movq	32(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	24(%[a]), %%rbx\n\t"
        "movq	24(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	16(%[a]), %%rbx\n\t"
        "movq	16(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	8(%[a]), %%rbx\n\t"
        "movq	8(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	0(%[a]), %%rbx\n\t"
        "movq	0(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        /* All digits equal: mask is still -1 and r is still -1, so r
         * becomes 0. Otherwise the mask is 0 and r is left as 1 or -1. */
        "xorq	%%rdx, %[r]\n\t"
        : [r] "+r" (r)
        : [a] "r" (a), [b] "r" (b), [one] "r" (one)
        : "memory", "cc", "rax", "rdx", "rcx", "rbx", "r8"
    );

    return r;
}

/* Divide d in a and put remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * Schoolbook long division, one quotient digit per pass from the top down:
 * the quotient digit is estimated by dividing the top two digits of the
 * running remainder by the top digit of d, estimate * d is subtracted, and d
 * is then added back up to twice (under a mask taken from the top digit of
 * the remainder) to undo an estimate that was too large. A last conditional
 * subtraction of d brings the remainder below d.
 *
 * NOTE(review): div_2048_word_16 uses divq, which faults when the high word
 * is not smaller than the divisor digit; callers presumably guarantee that
 * the upper 16 digits of a are less than d - confirm.
 *
 * a  Number to be divided (32 digits).
 * d  Number to divide with (16 digits).
 * m  Multiplier result. Unused.
 * r  Remainder from the division (16 digits).
 * returns MP_OKAY indicating success.
 */
static INLINE int sp_2048_div_16(sp_digit* a, sp_digit* d, sp_digit* m,
        sp_digit* r)
{
    sp_digit t1[32], t2[17];
    sp_digit div, r1;
    int i;
#ifdef HAVE_INTEL_AVX2
    word32 cpuid_flags = cpuid_get_flags();
#endif

    (void)m;

    div = d[15];
    /* Work on a copy so that a is left untouched. */
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 16);
    for (i=15; i>=0; i--) {
        /* Estimate the quotient digit from the top two remainder digits. */
        r1 = div_2048_word_16(t1[16 + i], t1[16 + i - 1], div);

        /* t2 = d * r1 (17 digits). */
#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            sp_2048_mul_d_avx2_16(t2, d, r1);
        else
#endif
            sp_2048_mul_d_16(t2, d, r1);
        /* t1[i..i+16] -= t2; the borrow and the 17th digit are applied to
         * the top digit by hand. */
        t1[16 + i] += sp_2048_sub_in_place_16(&t1[i], t2);
        t1[16 + i] -= t2[16];
        /* Add d back, masked by the top digit, twice: the estimate can be
         * too large but is corrected without branching. */
        sp_2048_mask_16(t2, d, t1[16 + i]);
        t1[16 + i] += sp_2048_add_16(&t1[i], &t1[i], t2);
        sp_2048_mask_16(t2, d, t1[16 + i]);
        t1[16 + i] += sp_2048_add_16(&t1[i], &t1[i], t2);
    }

    /* Final correction: r = t1 - d when t1 >= d, otherwise r = t1.
     * The divisor d must be subtracted here; t2 only holds the masked copy
     * of d left over from the last add-back (all zero when no add-back was
     * needed), so using it would silently skip the correction. */
    r1 = sp_2048_cmp_16(t1, d) >= 0;
    sp_2048_cond_sub_16(r, t1, d, (sp_digit)0 - r1);

    return MP_OKAY;
}

/* Reduce a modulo m into r. (r = a mod m)
 *
 * Thin wrapper around sp_2048_div_16 that discards the quotient; the
 * division copies 32 digits from a.
 *
 * r  A single precision number that is the reduced result.
 * a  A single precision number that is to be reduced.
 * m  A single precision number that is the modulus to reduce with.
 * returns MP_OKAY indicating success.
 */
static INLINE int sp_2048_mod_16(sp_digit* r, sp_digit* a, sp_digit* m)
{
    return sp_2048_div_16(a, m, NULL, r);
}

/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Fixed 5-bit window exponentiation in Montgomery form: a table of the
 * powers a^0..a^31 is built, then the exponent is consumed five bits at a
 * time from the most significant end, with five squarings and one table
 * multiplication per window. Any bits left over at the bottom of the exponent
 * (fewer than five) are handled after the loop with one squaring per bit.
 *
 * r        A single precision number that is the result of the operation.
 *          Used as working space for double width products, so it must have
 *          room for 32 digits.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When non-zero, a is first reduced modulo m (the reduction reads
 *          32 digits from a). When zero, 16 digits of a are used as they are.
 * returns 0 on success and MEMORY_E on dynamic memory allocation failure.
 */
static int sp_2048_mod_exp_16(sp_digit* r, sp_digit* a, sp_digit* e,
        int bits, sp_digit* m, int reduceA)
{
#ifndef WOLFSSL_SMALL_STACK
    sp_digit t[32][32];
#else
    sp_digit* t[32];
    sp_digit* td;
#endif
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int i;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 32, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        for (i=0; i<32; i++)
            t[i] = td + i * 32;
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_2048_mont_setup(m, &mp);
        /* t[0] = Montgomery form of 1. */
        sp_2048_mont_norm_16(norm, m);

        /* t[1] = Montgomery form of a: place a in the upper 16 digits over
         * a zeroed lower half and reduce the 32 digit value modulo m. */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 16);
        if (reduceA) {
            err = sp_2048_mod_16(t[1] + 16, a, m);
            if (err == MP_OKAY)
                err = sp_2048_mod_16(t[1], t[1], m);
        }
        else {
            XMEMCPY(t[1] + 16, a, sizeof(sp_digit) * 16);
            err = sp_2048_mod_16(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Fill the table: t[k] = a^k in Montgomery form for k = 2..31. */
        sp_2048_mont_sqr_16(t[ 2], t[ 1], m, mp);
        sp_2048_mont_mul_16(t[ 3], t[ 2], t[ 1], m, mp);
        sp_2048_mont_sqr_16(t[ 4], t[ 2], m, mp);
        sp_2048_mont_mul_16(t[ 5], t[ 3], t[ 2], m, mp);
        sp_2048_mont_sqr_16(t[ 6], t[ 3], m, mp);
        sp_2048_mont_mul_16(t[ 7], t[ 4], t[ 3], m, mp);
        sp_2048_mont_sqr_16(t[ 8], t[ 4], m, mp);
        sp_2048_mont_mul_16(t[ 9], t[ 5], t[ 4], m, mp);
        sp_2048_mont_sqr_16(t[10], t[ 5], m, mp);
        sp_2048_mont_mul_16(t[11], t[ 6], t[ 5], m, mp);
        sp_2048_mont_sqr_16(t[12], t[ 6], m, mp);
        sp_2048_mont_mul_16(t[13], t[ 7], t[ 6], m, mp);
        sp_2048_mont_sqr_16(t[14], t[ 7], m, mp);
        sp_2048_mont_mul_16(t[15], t[ 8], t[ 7], m, mp);
        sp_2048_mont_sqr_16(t[16], t[ 8], m, mp);
        sp_2048_mont_mul_16(t[17], t[ 9], t[ 8], m, mp);
        sp_2048_mont_sqr_16(t[18], t[ 9], m, mp);
        sp_2048_mont_mul_16(t[19], t[10], t[ 9], m, mp);
        sp_2048_mont_sqr_16(t[20], t[10], m, mp);
        sp_2048_mont_mul_16(t[21], t[11], t[10], m, mp);
        sp_2048_mont_sqr_16(t[22], t[11], m, mp);
        sp_2048_mont_mul_16(t[23], t[12], t[11], m, mp);
        sp_2048_mont_sqr_16(t[24], t[12], m, mp);
        sp_2048_mont_mul_16(t[25], t[13], t[12], m, mp);
        sp_2048_mont_sqr_16(t[26], t[13], m, mp);
        sp_2048_mont_mul_16(t[27], t[14], t[13], m, mp);
        sp_2048_mont_sqr_16(t[28], t[14], m, mp);
        sp_2048_mont_mul_16(t[29], t[15], t[14], m, mp);
        sp_2048_mont_sqr_16(t[30], t[15], m, mp);
        sp_2048_mont_mul_16(t[31], t[16], t[15], m, mp);

        /* n holds the unconsumed bits of the current exponent word, left
         * aligned; c is how many of them remain. The first window is the top
         * five bits of the top word. */
        i = (bits - 1) / 64;
        n = e[i--];
        y = n >> 59;
        n <<= 5;
        c = 59;
        XMEMCPY(r, t[y], sizeof(sp_digit) * 16);
        for (; i>=0 || c>=5; ) {
            if (c == 0) {
                /* Current word used up: start the next one. */
                n = e[i--];
                y = n >> 59;
                n <<= 5;
                c = 59;
            }
            else if (c < 5) {
                /* Window straddles two words: take the c bits left in this
                 * word and the remaining 5 - c from the next. */
                y = n >> 59;
                n = e[i--];
                c = 5 - c;
                y |= n >> (64 - c);
                n <<= c;
                c = 64 - c;
            }
            else {
                y = (n >> 59) & 0x1f;
                n <<= 5;
                c -= 5;
            }

            sp_2048_mont_sqr_16(r, r, m, mp);
            sp_2048_mont_sqr_16(r, r, m, mp);
            sp_2048_mont_sqr_16(r, r, m, mp);
            sp_2048_mont_sqr_16(r, r, m, mp);
            sp_2048_mont_sqr_16(r, r, m, mp);

            sp_2048_mont_mul_16(r, r, t[y], m, mp);
        }
        /* Between 0 and 4 exponent bits are left at the top of n. Process
         * exactly that many instead of assuming four: four is only right
         * when the number of exponent words is 1 mod 5 (e.g. 1024 bits), and
         * for that case this is identical to using e[0] & 0xf. The count
         * depends only on the exponent length, not on its value. */
        if (c > 0) {
            y = n >> (64 - c);
            for (; c > 0; c--)
                sp_2048_mont_sqr_16(r, r, m, mp);
            sp_2048_mont_mul_16(r, r, t[y], m, mp);
        }

        /* Convert out of Montgomery form: reduce r with a zero upper half,
         * then make sure the result is below m. */
        XMEMSET(&r[16], 0, sizeof(sp_digit) * 16);
        sp_2048_mont_reduce_16(r, m, mp);

        mask = 0 - (sp_2048_cmp_16(r, m) >= 0);
        sp_2048_cond_sub_16(r, r, m, mask);
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

/* Montgomery reduce a 32-digit number to 16 digits, in place.
 *
 * This is the 16-digit (64 bits per digit) reduction used by the 2048-bit
 * code. It is written with the mulx, adcx and adox instructions.
 * NOTE(review): callers presumably check CPU support for these instructions
 * before selecting this variant - confirm at the call sites.
 *
 * Each of the 16 loop iterations computes mu = a[i] * mp (low 64 bits) and
 * adds mu * m into a[i..i+16]. The carry out of the top digit is kept in ca
 * and added in on the next iteration. The loop advances the local pointer a
 * by one digit per iteration, so afterwards it points 16 digits past where
 * it started. The final call then passes the original address as the
 * destination and the advanced address as the value to subtract from.
 *
 * a   A single precision number to reduce in place. Digits 0..31 are
 *     accessed.
 * m   The single precision number representing the modulus (16 digits).
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
SP_NOINLINE static void sp_2048_mont_reduce_avx2_16(sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    /* Carry out of the top digit, passed from one iteration to the next. */
    sp_digit ca = 0;

    /* Register use in the loop below:
     *   rcx       loop counter i (0..15)
     *   rdx       mu, the implicit multiplicand of mulx
     *   rax, r8   low and high halves of each mu * m[k] product
     *   r9        zero; the xor that clears it also clears CF and OF, which
     *             then carry the adcx (low halves) and adox (high halves)
     *             addition chains
     *   r10, r11  alternate as the digit currently being accumulated
     *   r12       updated a[i+1], i.e. the next iteration's a[i]; it is kept
     *             in a register and only stored once, after the loop
     */
    __asm__ __volatile__ (
        "# i = 0\n\t"
        "movq	0(%[a]), %%r12\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "\nL_mont_loop_avx2_16:\n\t"
        "xorq	%%r9, %%r9\n\t"
        "movq	%%r12, %%r10\n\t"
        "# mu = a[i] * mp\n\t"
        "movq	%%r12, %%rdx\n\t"
        "mulxq	%[mp], %%rdx, %%r8\n\t"
        "# a[i+0] += m[0] * mu\n\t"
        "mulxq	0(%[m]), %%rax, %%r8\n\t"
        "movq	8(%[a]), %%r12\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r12\n\t"
        "# a[i+1] += m[1] * mu\n\t"
        "mulxq	8(%[m]), %%rax, %%r8\n\t"
        "movq	16(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "# a[i+2] += m[2] * mu\n\t"
        "mulxq	16(%[m]), %%rax, %%r8\n\t"
        "movq	24(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 16(%[a])\n\t"
        "# a[i+3] += m[3] * mu\n\t"
        "mulxq	24(%[m]), %%rax, %%r8\n\t"
        "movq	32(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 24(%[a])\n\t"
        "# a[i+4] += m[4] * mu\n\t"
        "mulxq	32(%[m]), %%rax, %%r8\n\t"
        "movq	40(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 32(%[a])\n\t"
        "# a[i+5] += m[5] * mu\n\t"
        "mulxq	40(%[m]), %%rax, %%r8\n\t"
        "movq	48(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 40(%[a])\n\t"
        "# a[i+6] += m[6] * mu\n\t"
        "mulxq	48(%[m]), %%rax, %%r8\n\t"
        "movq	56(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 48(%[a])\n\t"
        "# a[i+7] += m[7] * mu\n\t"
        "mulxq	56(%[m]), %%rax, %%r8\n\t"
        "movq	64(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 56(%[a])\n\t"
        "# a[i+8] += m[8] * mu\n\t"
        "mulxq	64(%[m]), %%rax, %%r8\n\t"
        "movq	72(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 64(%[a])\n\t"
        "# a[i+9] += m[9] * mu\n\t"
        "mulxq	72(%[m]), %%rax, %%r8\n\t"
        "movq	80(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 72(%[a])\n\t"
        "# a[i+10] += m[10] * mu\n\t"
        "mulxq	80(%[m]), %%rax, %%r8\n\t"
        "movq	88(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 80(%[a])\n\t"
        "# a[i+11] += m[11] * mu\n\t"
        "mulxq	88(%[m]), %%rax, %%r8\n\t"
        "movq	96(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 88(%[a])\n\t"
        "# a[i+12] += m[12] * mu\n\t"
        "mulxq	96(%[m]), %%rax, %%r8\n\t"
        "movq	104(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 96(%[a])\n\t"
        "# a[i+13] += m[13] * mu\n\t"
        "mulxq	104(%[m]), %%rax, %%r8\n\t"
        "movq	112(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 104(%[a])\n\t"
        "# a[i+14] += m[14] * mu\n\t"
        "mulxq	112(%[m]), %%rax, %%r8\n\t"
        "movq	120(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 112(%[a])\n\t"
        "# a[i+15] += m[15] * mu\n\t"
        "mulxq	120(%[m]), %%rax, %%r8\n\t"
        "movq	128(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 120(%[a])\n\t"
        "adcxq	%[ca], %%r10\n\t"
        "movq	%%r9, %[ca]\n\t"
        "adoxq	%%r9, %[ca]\n\t"
        "adcxq	%%r9, %[ca]\n\t"
        "movq	%%r10, 128(%[a])\n\t"
        "# i += 1\n\t"
        "addq	$8, %[a]\n\t"
        "addq	$1, %%rcx\n\t"
        "cmpq	$16, %%rcx\n\t"
        "jl	L_mont_loop_avx2_16\n\t"
        "movq	%%r12, 0(%[a])\n\t"
        : [ca] "+r" (ca), [a] "+r" (a)
        : [m] "r" (m), [mp] "r" (mp)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10", "r11", "r12"
    );

    /* a now points at the upper 16 digits. Write (upper digits - (m & mask))
     * to the original location, with the mask all ones when ca is 1.
     * sp_2048_cond_sub_16 is defined elsewhere in the file; presumably it is
     * the 16-digit counterpart of sp_2048_cond_sub_32. */
    sp_2048_cond_sub_16(a - 16, a, m, (sp_digit)0 - ca);
}

/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * The product is written to r by sp_2048_mul_avx2_16 and then reduced in
 * place by sp_2048_mont_reduce_avx2_16. The reduction accesses r[0..31], so
 * r must have room for 32 digits.
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier, passed through to the reduction.
 */
static void sp_2048_mont_mul_avx2_16(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit* m, sp_digit mp)
{
    sp_2048_mul_avx2_16(r, a, b);
    sp_2048_mont_reduce_avx2_16(r, m, mp);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * The square is written to r by sp_2048_sqr_avx2_16 and then reduced in
 * place by sp_2048_mont_reduce_avx2_16. The reduction accesses r[0..31], so
 * r must have room for 32 digits.
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier, passed through to the reduction.
 */
static void sp_2048_mont_sqr_avx2_16(sp_digit* r, sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    sp_2048_sqr_avx2_16(r, a);
    sp_2048_mont_reduce_avx2_16(r, m, mp);
}

/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Uses a fixed 5-bit window. A table holding the Montgomery forms of
 * a^0 .. a^31 is built, then the exponent is consumed five bits at a time
 * from the most significant end, with five squarings and one table multiply
 * per window. The first window is always the top five bits of digit
 * (bits - 1) / 64. Bits left over at the bottom of the exponent (fewer than
 * five) are processed as one final, shorter window.
 *
 * r        A single precision number that is the result of the operation.
 *          It is used as the working buffer, so it must have room for
 *          32 digits; the result is in the low 16 digits.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When non-zero, a is first reduced modulo m. When zero, the
 *          16 digits of a are used as given.
 * returns 0 on success, MEMORY_E on dynamic memory allocation failure, or
 * the error returned by sp_2048_mod_16.
 */
static int sp_2048_mod_exp_avx2_16(sp_digit* r, sp_digit* a, sp_digit* e,
        int bits, sp_digit* m, int reduceA)
{
#ifndef WOLFSSL_SMALL_STACK
    sp_digit t[32][32];
#else
    sp_digit* t[32];
    sp_digit* td;
#endif
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int i;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    /* One allocation holds all 32 table entries of 32 digits each. */
    td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 32, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        for (i=0; i<32; i++)
            t[i] = td + i * 32;
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_2048_mont_setup(m, &mp);
        /* t[0] is the table entry selected by an all-zero window. */
        sp_2048_mont_norm_16(norm, m);

        /* Build the 32-digit value (a << 1024) in t[1] - low half zero, high
         * half a (or a mod m) - and reduce it modulo m. */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 16);
        if (reduceA) {
            err = sp_2048_mod_16(t[1] + 16, a, m);
            if (err == MP_OKAY)
                err = sp_2048_mod_16(t[1], t[1], m);
        }
        else {
            XMEMCPY(t[1] + 16, a, sizeof(sp_digit) * 16);
            err = sp_2048_mod_16(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* t[k] = t[1]^k: even entries by squaring t[k/2], odd entries by
         * multiplying the two neighbouring halves. */
        sp_2048_mont_sqr_avx2_16(t[ 2], t[ 1], m, mp);
        sp_2048_mont_mul_avx2_16(t[ 3], t[ 2], t[ 1], m, mp);
        sp_2048_mont_sqr_avx2_16(t[ 4], t[ 2], m, mp);
        sp_2048_mont_mul_avx2_16(t[ 5], t[ 3], t[ 2], m, mp);
        sp_2048_mont_sqr_avx2_16(t[ 6], t[ 3], m, mp);
        sp_2048_mont_mul_avx2_16(t[ 7], t[ 4], t[ 3], m, mp);
        sp_2048_mont_sqr_avx2_16(t[ 8], t[ 4], m, mp);
        sp_2048_mont_mul_avx2_16(t[ 9], t[ 5], t[ 4], m, mp);
        sp_2048_mont_sqr_avx2_16(t[10], t[ 5], m, mp);
        sp_2048_mont_mul_avx2_16(t[11], t[ 6], t[ 5], m, mp);
        sp_2048_mont_sqr_avx2_16(t[12], t[ 6], m, mp);
        sp_2048_mont_mul_avx2_16(t[13], t[ 7], t[ 6], m, mp);
        sp_2048_mont_sqr_avx2_16(t[14], t[ 7], m, mp);
        sp_2048_mont_mul_avx2_16(t[15], t[ 8], t[ 7], m, mp);
        sp_2048_mont_sqr_avx2_16(t[16], t[ 8], m, mp);
        sp_2048_mont_mul_avx2_16(t[17], t[ 9], t[ 8], m, mp);
        sp_2048_mont_sqr_avx2_16(t[18], t[ 9], m, mp);
        sp_2048_mont_mul_avx2_16(t[19], t[10], t[ 9], m, mp);
        sp_2048_mont_sqr_avx2_16(t[20], t[10], m, mp);
        sp_2048_mont_mul_avx2_16(t[21], t[11], t[10], m, mp);
        sp_2048_mont_sqr_avx2_16(t[22], t[11], m, mp);
        sp_2048_mont_mul_avx2_16(t[23], t[12], t[11], m, mp);
        sp_2048_mont_sqr_avx2_16(t[24], t[12], m, mp);
        sp_2048_mont_mul_avx2_16(t[25], t[13], t[12], m, mp);
        sp_2048_mont_sqr_avx2_16(t[26], t[13], m, mp);
        sp_2048_mont_mul_avx2_16(t[27], t[14], t[13], m, mp);
        sp_2048_mont_sqr_avx2_16(t[28], t[14], m, mp);
        sp_2048_mont_mul_avx2_16(t[29], t[15], t[14], m, mp);
        sp_2048_mont_sqr_avx2_16(t[30], t[15], m, mp);
        sp_2048_mont_mul_avx2_16(t[31], t[16], t[15], m, mp);

        /* n holds the not yet consumed bits of the current exponent digit,
         * left aligned; c is how many of them remain. */
        i = (bits - 1) / 64;
        n = e[i--];
        y = n >> 59;
        n <<= 5;
        c = 59;
        XMEMCPY(r, t[y], sizeof(sp_digit) * 16);
        for (; i>=0 || c>=5; ) {
            if (c == 0) {
                /* Current digit used up: window is the top of the next. */
                n = e[i--];
                y = n >> 59;
                n <<= 5;
                c = 59;
            }
            else if (c < 5) {
                /* Window straddles two digits: take the c bits left in this
                 * digit and the remaining 5 - c from the top of the next. */
                y = n >> 59;
                n = e[i--];
                c = 5 - c;
                y |= n >> (64 - c);
                n <<= c;
                c = 64 - c;
            }
            else {
                y = (n >> 59) & 0x1f;
                n <<= 5;
                c -= 5;
            }

            sp_2048_mont_sqr_avx2_16(r, r, m, mp);
            sp_2048_mont_sqr_avx2_16(r, r, m, mp);
            sp_2048_mont_sqr_avx2_16(r, r, m, mp);
            sp_2048_mont_sqr_avx2_16(r, r, m, mp);
            sp_2048_mont_sqr_avx2_16(r, r, m, mp);

            sp_2048_mont_mul_avx2_16(r, r, t[y], m, mp);
        }

        /* Final partial window. The loop exits with c (0..4) bits left and
         * these are the low c bits of e[0]. Using c here, rather than a
         * fixed 4 bits, keeps the result correct for any exponent length;
         * for a 1024-bit exponent c is 4. When c is 0 this multiplies by
         * t[0]. */
        y = (int)(e[0] & ((1 << c) - 1));
        for (; c > 0; c--)
            sp_2048_mont_sqr_avx2_16(r, r, m, mp);
        sp_2048_mont_mul_avx2_16(r, r, t[y], m, mp);

        /* Convert out of Montgomery form: reduce r with a zero upper half. */
        XMEMSET(&r[16], 0, sizeof(sp_digit) * 16);
        sp_2048_mont_reduce_avx2_16(r, m, mp);

        /* Subtract m once more when r >= m, selected by mask not branch. */
        mask = 0 - (sp_2048_cmp_16(r, m) >= 0);
        sp_2048_cond_sub_16(r, r, m, mask);
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

#endif /* !SP_RSA_PRIVATE_EXP_D && WOLFSSL_HAVE_SP_RSA */

/* r = 2^n mod m where n is the number of bits to reduce by.
 * Given m must be 2048 bits, just need to subtract.
 *
 * r is zeroed and m is subtracted from it in place, so r ends up holding
 * (0 - m) modulo 2^2048, which is 2^2048 - m.
 *
 * r  A single precision number (32 digits are written).
 * m  A single precision number.
 */
static void sp_2048_mont_norm_32(sp_digit* r, sp_digit* m)
{
    XMEMSET(r, 0, sizeof(sp_digit) * 32);

    /* r = 2^n mod m */
    sp_2048_sub_in_place_32(r, m);
}

/* Conditionally subtract b from a using the mask m.
 * m is -1 to subtract and 0 when not subtracting.
 *
 * b is first ANDed with the mask into a local buffer, and that buffer is
 * then always subtracted from a. The instruction sequence contains no
 * branch, so the same instructions run for either mask value.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 * returns 0 when the subtraction did not borrow and all ones (-1) when it
 * did.
 */
static sp_digit sp_2048_cond_sub_32(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit m)
{
    /* Masked copy of b: either b itself or all zeros. */
    sp_digit t[32];
    /* Receives 0 - borrow from the final sbb. */
    sp_digit c = 0;

    /* First half: t[k] = b[k] & m, two digits at a time.
     * Second half: r[k] = a[k] - t[k] with the borrow chained through sbb;
     * rax and rcx alternate so each store is issued after the next
     * subtraction (mov does not change the carry flag). */
    __asm__ __volatile__ (
        "movq	0(%[b]), %%rax\n\t"
        "movq	8(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 0(%[t])\n\t"
        "movq	%%rcx, 8(%[t])\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "movq	24(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 16(%[t])\n\t"
        "movq	%%rcx, 24(%[t])\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "movq	40(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 32(%[t])\n\t"
        "movq	%%rcx, 40(%[t])\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "movq	56(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 48(%[t])\n\t"
        "movq	%%rcx, 56(%[t])\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "movq	72(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 64(%[t])\n\t"
        "movq	%%rcx, 72(%[t])\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "movq	88(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 80(%[t])\n\t"
        "movq	%%rcx, 88(%[t])\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "movq	104(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 96(%[t])\n\t"
        "movq	%%rcx, 104(%[t])\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "movq	120(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 112(%[t])\n\t"
        "movq	%%rcx, 120(%[t])\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "movq	136(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 128(%[t])\n\t"
        "movq	%%rcx, 136(%[t])\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "movq	152(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 144(%[t])\n\t"
        "movq	%%rcx, 152(%[t])\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "movq	168(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 160(%[t])\n\t"
        "movq	%%rcx, 168(%[t])\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "movq	184(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 176(%[t])\n\t"
        "movq	%%rcx, 184(%[t])\n\t"
        "movq	192(%[b]), %%rax\n\t"
        "movq	200(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 192(%[t])\n\t"
        "movq	%%rcx, 200(%[t])\n\t"
        "movq	208(%[b]), %%rax\n\t"
        "movq	216(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 208(%[t])\n\t"
        "movq	%%rcx, 216(%[t])\n\t"
        "movq	224(%[b]), %%rax\n\t"
        "movq	232(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 224(%[t])\n\t"
        "movq	%%rcx, 232(%[t])\n\t"
        "movq	240(%[b]), %%rax\n\t"
        "movq	248(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 240(%[t])\n\t"
        "movq	%%rcx, 248(%[t])\n\t"
        "movq	(%[a]), %%rax\n\t"
        "movq	(%[t]), %%rdx\n\t"
        "subq	%%rdx,%%rax\n\t"
        "movq	8(%[a]), %%rcx\n\t"
        "movq	8(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 0(%[r])\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "movq	16(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 8(%[r])\n\t"
        "movq	24(%[a]), %%rcx\n\t"
        "movq	24(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 16(%[r])\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "movq	32(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 24(%[r])\n\t"
        "movq	40(%[a]), %%rcx\n\t"
        "movq	40(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 32(%[r])\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "movq	48(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 40(%[r])\n\t"
        "movq	56(%[a]), %%rcx\n\t"
        "movq	56(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 48(%[r])\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "movq	64(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 56(%[r])\n\t"
        "movq	72(%[a]), %%rcx\n\t"
        "movq	72(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 64(%[r])\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "movq	80(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 72(%[r])\n\t"
        "movq	88(%[a]), %%rcx\n\t"
        "movq	88(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 80(%[r])\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "movq	96(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 88(%[r])\n\t"
        "movq	104(%[a]), %%rcx\n\t"
        "movq	104(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 96(%[r])\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "movq	112(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 104(%[r])\n\t"
        "movq	120(%[a]), %%rcx\n\t"
        "movq	120(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 112(%[r])\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "movq	128(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 120(%[r])\n\t"
        "movq	136(%[a]), %%rcx\n\t"
        "movq	136(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 128(%[r])\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "movq	144(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 136(%[r])\n\t"
        "movq	152(%[a]), %%rcx\n\t"
        "movq	152(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 144(%[r])\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "movq	160(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 152(%[r])\n\t"
        "movq	168(%[a]), %%rcx\n\t"
        "movq	168(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 160(%[r])\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "movq	176(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 168(%[r])\n\t"
        "movq	184(%[a]), %%rcx\n\t"
        "movq	184(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 176(%[r])\n\t"
        "movq	192(%[a]), %%rax\n\t"
        "movq	192(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 184(%[r])\n\t"
        "movq	200(%[a]), %%rcx\n\t"
        "movq	200(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 192(%[r])\n\t"
        "movq	208(%[a]), %%rax\n\t"
        "movq	208(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 200(%[r])\n\t"
        "movq	216(%[a]), %%rcx\n\t"
        "movq	216(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 208(%[r])\n\t"
        "movq	224(%[a]), %%rax\n\t"
        "movq	224(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 216(%[r])\n\t"
        "movq	232(%[a]), %%rcx\n\t"
        "movq	232(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 224(%[r])\n\t"
        "movq	240(%[a]), %%rax\n\t"
        "movq	240(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 232(%[r])\n\t"
        "movq	248(%[a]), %%rcx\n\t"
        "movq	248(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 240(%[r])\n\t"
        "movq	%%rcx, 248(%[r])\n\t"
        "sbbq	$0, %[c]\n\t"
        : [c] "+r" (c)
        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m), [t] "r" (t)
        : "memory", "rax", "rcx", "rdx"
    );

    return c;
}

/* Reduce the number back to 2048 bits using Montgomery reduction.
 *
 * Each of the 32 loop iterations computes mu = a[i] * mp (low 64 bits) and
 * adds mu * m into a[i..i+32]. The carry out of the top digit is kept in ca
 * and added in on the next iteration. The loop advances the local pointer a
 * by one digit per iteration, so afterwards it points 32 digits past where
 * it started. The final sp_2048_cond_sub_32 call writes (upper 32 digits -
 * (m & mask)) back to the original location.
 *
 * a   A single precision number to reduce in place. Digits 0..63 are
 *     accessed.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    /* Carry out of the top digit, passed from one iteration to the next. */
    sp_digit ca = 0;

    /* Register use in the loop below:
     *   rcx       loop counter as a byte offset (0, 8, .. 248)
     *   r10       mu
     *   rax, rdx  low and high halves of each mu * m[k] product (mul)
     *   r8, r9    alternate as the carry word into the next digit
     *   r11       digit currently being updated (a[i+3] and above)
     *   r12, r13  a[i] and a[i+1], kept in registers across iterations and
     *             only stored once, after the loop
     */
    __asm__ __volatile__ (
        "# i = 0\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "movq	0(%[a]), %%r12\n\t"
        "movq	8(%[a]), %%r13\n\t"
        "\nL_mont_loop_32:\n\t"
        "# mu = a[i] * mp\n\t"
        "movq	%%r12, %%r10\n\t"
        "imulq	%[mp], %%r10\n\t"
        "# a[i+0] += m[0] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	0(%[m])\n\t"
        "addq	%%rax, %%r12\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "# a[i+1] += m[1] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	8(%[m])\n\t"
        "movq	%%r13, %%r12\n\t"
        "addq	%%rax, %%r12\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r12\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+2] += m[2] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	16(%[m])\n\t"
        "movq	16(%[a]), %%r13\n\t"
        "addq	%%rax, %%r13\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r13\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+3] += m[3] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	24(%[m])\n\t"
        "movq	24(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 24(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+4] += m[4] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	32(%[m])\n\t"
        "movq	32(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 32(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+5] += m[5] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	40(%[m])\n\t"
        "movq	40(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 40(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+6] += m[6] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	48(%[m])\n\t"
        "movq	48(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 48(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+7] += m[7] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	56(%[m])\n\t"
        "movq	56(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 56(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+8] += m[8] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	64(%[m])\n\t"
        "movq	64(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 64(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+9] += m[9] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	72(%[m])\n\t"
        "movq	72(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 72(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+10] += m[10] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	80(%[m])\n\t"
        "movq	80(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 80(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+11] += m[11] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	88(%[m])\n\t"
        "movq	88(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 88(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+12] += m[12] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	96(%[m])\n\t"
        "movq	96(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 96(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+13] += m[13] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	104(%[m])\n\t"
        "movq	104(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 104(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+14] += m[14] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	112(%[m])\n\t"
        "movq	112(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 112(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+15] += m[15] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	120(%[m])\n\t"
        "movq	120(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 120(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+16] += m[16] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	128(%[m])\n\t"
        "movq	128(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 128(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+17] += m[17] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	136(%[m])\n\t"
        "movq	136(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 136(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+18] += m[18] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	144(%[m])\n\t"
        "movq	144(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 144(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+19] += m[19] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	152(%[m])\n\t"
        "movq	152(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 152(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+20] += m[20] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	160(%[m])\n\t"
        "movq	160(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 160(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+21] += m[21] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	168(%[m])\n\t"
        "movq	168(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 168(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+22] += m[22] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	176(%[m])\n\t"
        "movq	176(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 176(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+23] += m[23] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	184(%[m])\n\t"
        "movq	184(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 184(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+24] += m[24] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	192(%[m])\n\t"
        "movq	192(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 192(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+25] += m[25] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	200(%[m])\n\t"
        "movq	200(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 200(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+26] += m[26] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	208(%[m])\n\t"
        "movq	208(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 208(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+27] += m[27] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	216(%[m])\n\t"
        "movq	216(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 216(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+28] += m[28] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	224(%[m])\n\t"
        "movq	224(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 224(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+29] += m[29] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	232(%[m])\n\t"
        "movq	232(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 232(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+30] += m[30] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	240(%[m])\n\t"
        "movq	240(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 240(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+31] += m[31] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "mulq	248(%[m])\n\t"
        "movq	248(%[a]), %%r11\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%[ca], %%rdx\n\t"
        "movq	$0, %[ca]\n\t"
        "adcq	$0, %[ca]\n\t"
        "addq	%%r9, %%r11\n\t"
        "movq	%%r11, 248(%[a])\n\t"
        "adcq	%%rdx, 256(%[a])\n\t"
        "adcq	$0, %[ca]\n\t"
        "# i += 1\n\t"
        "addq	$8, %[a]\n\t"
        "addq	$8, %%rcx\n\t"
        "cmpq	$256, %%rcx\n\t"
        "jl	L_mont_loop_32\n\t"
        "movq	%%r12, 0(%[a])\n\t"
        "movq	%%r13, 8(%[a])\n\t"
        : [ca] "+r" (ca), [a] "+r" (a)
        : [m] "r" (m), [mp] "r" (mp)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10", "r11",
          "r12", "r13"
    );

    /* a now points at the upper 32 digits; the mask is all ones when the
     * reduction carried out of the top digit (ca is 1) and zero otherwise. */
    sp_2048_cond_sub_32(a - 32, a, m, (sp_digit)0 - ca);
}

/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * The product is written to r by sp_2048_mul_32 and then reduced in place
 * by sp_2048_mont_reduce_32. The reduction accesses r[0..63], so r must
 * have room for 64 digits.
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier, passed through to the reduction.
 */
static void sp_2048_mont_mul_32(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit* m, sp_digit mp)
{
    sp_2048_mul_32(r, a, b);
    sp_2048_mont_reduce_32(r, m, mp);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * The square is written to r by sp_2048_sqr_32 and then reduced in place by
 * sp_2048_mont_reduce_32. The reduction accesses r[0..63], so r must have
 * room for 64 digits.
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier, passed through to the reduction.
 */
static void sp_2048_mont_sqr_32(sp_digit* r, sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    sp_2048_sqr_32(r, a);
    sp_2048_mont_reduce_32(r, m, mp);
}

/* Mul a by digit b into r. (r = a * b)
 *
 * The multiplication is fully unrolled: 32 64-bit words of a are read
 * (byte offsets 0..248) and 33 64-bit words of r are written (byte offsets
 * 0..256), the last one being the final carry word.
 *
 * r  A single precision integer. Receives 33 words.
 * a  A single precision integer. 32 words are read.
 * b  A single precision digit.
 */
static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
    /* rbx, rcx and r8 rotate as the three-word accumulator: one holds the
     * word about to be stored, the next collects the high half of the
     * current product, and the third is cleared to catch the carry. */
    __asm__ __volatile__ (
        "# A[0] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	(%[a])\n\t"
        "movq	%%rax, %%rbx\n\t"
        "movq	%%rdx, %%rcx\n\t"
        "movq	%%rbx, 0(%[r])\n\t"
        "# A[1] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 8(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[2] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 16(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[3] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 24(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[4] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 32(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[5] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 40(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[6] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 48(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[7] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 56(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[8] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 64(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[9] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 72(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[10] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 80(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[11] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 88(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[12] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 96(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[13] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 104(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[14] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 112(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[15] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 120(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[16] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 128(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[17] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 136(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[18] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 144(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[19] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 152(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[20] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 160(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[21] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 168(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[22] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 176(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[23] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 184(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[24] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	192(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 192(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[25] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	200(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 200(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[26] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	208(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 208(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[27] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	216(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 216(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[28] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	224(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 224(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[29] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	232(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 232(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[30] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	240(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 240(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[31] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "mulq	248(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "movq	%%rcx, 248(%[r])\n\t"
        "movq	%%r8, 256(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
        /* "memory" is needed because r and a are accessed through pointers
         * held in register operands. */
        : "memory", "rax", "rdx", "rbx", "rcx", "r8"
    );
}

/* Mul a by digit b into r. (r = a * b)
 *
 * Variant built on the mulx, adcx and adox instructions; callers in this
 * file only select it after checking the BMI2 and ADX cpuid flags.
 * 32 64-bit words of a are read (byte offsets 0..248) and 33 64-bit words of
 * r are written (byte offsets 0..256), the last one being the final carry
 * word.
 *
 * r  A single precision integer. Receives 33 words.
 * a  A single precision integer. 32 words are read.
 * b  A single precision digit.
 */
static void sp_2048_mul_d_avx2_32(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
    /* rdx holds b as the implicit mulx multiplicand and r10 is kept at zero.
     * r8 and r9 alternate as the word being completed and the next word;
     * adcx adds the low product half and adox adds the high product half. */
    __asm__ __volatile__ (
        "# A[0] * B\n\t"
        "movq	%[b], %%rdx\n\t"
        "xorq	%%r10, %%r10\n\t"
        "mulxq	(%[a]), %%r8, %%r9\n\t"
        "movq	%%r8, 0(%[r])\n\t"
        "# A[1] * B\n\t"
        "mulxq	8(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 8(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[2] * B\n\t"
        "mulxq	16(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 16(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[3] * B\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 24(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[4] * B\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 32(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[5] * B\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 40(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[6] * B\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 48(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[7] * B\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 56(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[8] * B\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 64(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[9] * B\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 72(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[10] * B\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 80(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[11] * B\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 88(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[12] * B\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 96(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[13] * B\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 104(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[14] * B\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 112(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[15] * B\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 120(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[16] * B\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 128(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[17] * B\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 136(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[18] * B\n\t"
        "mulxq	144(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 144(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[19] * B\n\t"
        "mulxq	152(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 152(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[20] * B\n\t"
        "mulxq	160(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 160(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[21] * B\n\t"
        "mulxq	168(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 168(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[22] * B\n\t"
        "mulxq	176(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 176(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[23] * B\n\t"
        "mulxq	184(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 184(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[24] * B\n\t"
        "mulxq	192(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 192(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[25] * B\n\t"
        "mulxq	200(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 200(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[26] * B\n\t"
        "mulxq	208(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 208(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[27] * B\n\t"
        "mulxq	216(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 216(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[28] * B\n\t"
        "mulxq	224(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 224(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[29] * B\n\t"
        "mulxq	232(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 232(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[30] * B\n\t"
        "mulxq	240(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 240(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[31] * B\n\t"
        "mulxq	248(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "adcxq	%%r10, %%r8\n\t"
        "movq	%%r9, 248(%[r])\n\t"
        "movq	%%r8, 256(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
        /* "memory" is needed because r and a are accessed through pointers
         * held in register operands. */
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10"
    );
}

/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
 *
 * Implemented with a single divq instruction: rdx:rax is loaded with d1:d0
 * and the quotient is taken from rax. The remainder left in rdx is discarded.
 * NOTE(review): divq raises a divide error when the quotient does not fit in
 * one word, so callers presumably guarantee d1 < div - confirm at call sites.
 *
 * d1   The high order half of the number to divide.
 * d0   The low order half of the number to divide.
 * div  The divisor.
 * returns the quotient of the division.
 */
static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div)
{
    sp_digit quot;

    __asm__ __volatile__ (
        "movq	%[hi], %%rdx\n\t"
        "movq	%[lo], %%rax\n\t"
        "divq	%[dv]\n\t"
        "movq	%%rax, %[q]\n\t"
        : [q] "=r" (quot)
        : [hi] "r" (d1), [lo] "r" (d0), [dv] "r" (div)
        : "rax", "rdx"
    );

    return quot;
}

/* AND the mask m into every one of the 32 words of a and store in r.
 *
 * Words are processed in ascending index order.
 *
 * r  A single precision integer that receives the masked words.
 * a  A single precision integer to be masked.
 * m  Mask to AND against each digit.
 */
static void sp_2048_mask_32(sp_digit* r, sp_digit* a, sp_digit m)
{
#ifdef WOLFSSL_SP_SMALL
    int idx = 0;

    /* Compact form: one word per pass. */
    while (idx < 32) {
        r[idx] = a[idx] & m;
        idx++;
    }
#else
    sp_digit* dst = r;
    sp_digit* src = a;
    sp_digit* stop = a + 32;

    /* Unrolled form: eight words per pass. */
    while (src != stop) {
        dst[0] = src[0] & m;
        dst[1] = src[1] & m;
        dst[2] = src[2] & m;
        dst[3] = src[3] & m;
        dst[4] = src[4] & m;
        dst[5] = src[5] & m;
        dst[6] = src[6] & m;
        dst[7] = src[7] & m;
        dst += 8;
        src += 8;
    }
#endif
}

/* Compare a with b in constant time.
 *
 * All 32 words are always visited, from the most significant (byte offset
 * 248) down to the least significant (offset 0), using only conditional
 * moves. rdx is an "all words equal so far" mask: it starts as all ones and
 * is cleared at the first differing word, after which both operands are
 * masked to zero so later words cannot change the result. r starts at -1 and
 * the final xor with rdx turns it into 0 when every word was equal.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static int64_t sp_2048_cmp_32(sp_digit* a, sp_digit* b)
{
    sp_digit r = -1;
    sp_digit one = 1;

    __asm__ __volatile__ (
        "xorq	%%rcx, %%rcx\n\t"
        "movq	$-1, %%rdx\n\t"
        "movq	248(%[a]), %%rbx\n\t"
        "movq	248(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	240(%[a]), %%rbx\n\t"
        "movq	240(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	232(%[a]), %%rbx\n\t"
        "movq	232(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	224(%[a]), %%rbx\n\t"
        "movq	224(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	216(%[a]), %%rbx\n\t"
        "movq	216(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	208(%[a]), %%rbx\n\t"
        "movq	208(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	200(%[a]), %%rbx\n\t"
        "movq	200(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	192(%[a]), %%rbx\n\t"
        "movq	192(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	184(%[a]), %%rbx\n\t"
        "movq	184(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	176(%[a]), %%rbx\n\t"
        "movq	176(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	168(%[a]), %%rbx\n\t"
        "movq	168(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	160(%[a]), %%rbx\n\t"
        "movq	160(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	152(%[a]), %%rbx\n\t"
        "movq	152(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	144(%[a]), %%rbx\n\t"
        "movq	144(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	136(%[a]), %%rbx\n\t"
        "movq	136(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	128(%[a]), %%rbx\n\t"
        "movq	128(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	120(%[a]), %%rbx\n\t"
        "movq	120(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	112(%[a]), %%rbx\n\t"
        "movq	112(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	104(%[a]), %%rbx\n\t"
        "movq	104(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	96(%[a]), %%rbx\n\t"
        "movq	96(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	88(%[a]), %%rbx\n\t"
        "movq	88(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	80(%[a]), %%rbx\n\t"
        "movq	80(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	72(%[a]), %%rbx\n\t"
        "movq	72(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	64(%[a]), %%rbx\n\t"
        "movq	64(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	56(%[a]), %%rbx\n\t"
        "movq	56(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	48(%[a]), %%rbx\n\t"
        "movq	48(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	40(%[a]), %%rbx\n\t"
        "movq	40(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	32(%[a]), %%rbx\n\t"
        "movq	32(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	24(%[a]), %%rbx\n\t"
        "movq	24(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	16(%[a]), %%rbx\n\t"
        "movq	16(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	8(%[a]), %%rbx\n\t"
        "movq	8(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	0(%[a]), %%rbx\n\t"
        "movq	0(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "xorq	%%rdx, %[r]\n\t"
        : [r] "+r" (r)
        : [a] "r" (a), [b] "r" (b), [one] "r" (one)
        /* The asm reads *a and *b through register operands, so the compiler
         * must be told via "memory" that pending stores to those buffers
         * have to be completed first. "cc" records that the flags are
         * modified. */
        : "memory", "cc", "rax", "rdx", "rcx", "rbx", "r8"
    );

    return r;
}

/* Divide d in a and put remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * The correction steps inside the loop are applied through masks rather
 * than branches, so the same sequence of operations runs for every input.
 *
 * a  Number to be divided. 64 digits are read.
 * d  Number to divide with. 32 digits.
 * m  Multiplier result. Unused.
 * r  Remainder from the division. 32 digits; may be the same buffer as a
 *    because a is copied before any output is written.
 * returns MP_OKAY indicating success.
 */
static INLINE int sp_2048_div_32(sp_digit* a, sp_digit* d, sp_digit* m,
        sp_digit* r)
{
    sp_digit t1[64], t2[33];
    sp_digit div, r1;
    int i;
#ifdef HAVE_INTEL_AVX2
    word32 cpuid_flags = cpuid_get_flags();
#endif

    (void)m;

    /* Quotient digits are estimated from the top digit of the divisor. */
    div = d[31];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 32);
    for (i=31; i>=0; i--) {
        /* Estimate the next quotient digit from the top two digits. */
        r1 = div_2048_word_32(t1[32 + i], t1[32 + i - 1], div);

        /* t2 = d * r1 (33 digits). */
#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            sp_2048_mul_d_avx2_32(t2, d, r1);
        else
#endif
            sp_2048_mul_d_32(t2, d, r1);
        /* Subtract the estimate; the top digit collects the borrow. */
        t1[32 + i] += sp_2048_sub_in_place_32(&t1[i], t2);
        t1[32 + i] -= t2[32];
        /* Add back d masked by the top digit, at most twice. When the top
         * digit is zero the mask is zero and nothing changes. */
        sp_2048_mask_32(t2, d, t1[32 + i]);
        t1[32 + i] += sp_2048_add_32(&t1[i], &t1[i], t2);
        sp_2048_mask_32(t2, d, t1[32 + i]);
        t1[32 + i] += sp_2048_add_32(&t1[i], &t1[i], t2);
    }

    /* Final correction: subtract the divisor once more when the remainder is
     * still not below it. The subtrahend must be d itself; t2 only holds
     * scratch data from the last loop iteration. */
    r1 = sp_2048_cmp_32(t1, d) >= 0;
    sp_2048_cond_sub_32(r, t1, d, (sp_digit)0 - r1);

    return MP_OKAY;
}

/* Reduce a modulo m into r. (r = a mod m)
 *
 * Thin wrapper over sp_2048_div_32 that passes NULL for the unused
 * multiplier output and keeps only the remainder.
 *
 * r  A single precision number that is the reduced result.
 * a  A single precision number that is to be reduced.
 * m  A single precision number that is the modulus to reduce with.
 * returns MP_OKAY indicating success.
 */
static INLINE int sp_2048_mod_32(sp_digit* r, sp_digit* a, sp_digit* m)
{
    return sp_2048_div_32(a, m, NULL, r);
}

/* Divide d in a and put remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * Unlike sp_2048_div_32, the add-back corrections here are guarded by
 * branches on intermediate values, so the work done depends on the data.
 *
 * a  Number to be divided. 64 digits are read.
 * d  Number to divide with. 32 digits.
 * m  Multiplier result. Unused.
 * r  Remainder from the division. 32 digits; may be the same buffer as a
 *    because a is copied before any output is written.
 * returns MP_OKAY indicating success.
 */
static INLINE int sp_2048_div_32_cond(sp_digit* a, sp_digit* d, sp_digit* m,
        sp_digit* r)
{
    sp_digit t1[64], t2[33];
    sp_digit div, r1;
    int i;
#ifdef HAVE_INTEL_AVX2
    word32 cpuid_flags = cpuid_get_flags();
#endif

    (void)m;

    /* Quotient digits are estimated from the top digit of the divisor. */
    div = d[31];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 32);
    for (i=31; i>=0; i--) {
        /* Estimate the next quotient digit from the top two digits. */
        r1 = div_2048_word_32(t1[32 + i], t1[32 + i - 1], div);

        /* t2 = d * r1 (33 digits). */
#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            sp_2048_mul_d_avx2_32(t2, d, r1);
        else
#endif
            sp_2048_mul_d_32(t2, d, r1);
        /* Subtract the estimate; the top digit collects the borrow. */
        t1[32 + i] += sp_2048_sub_in_place_32(&t1[i], t2);
        t1[32 + i] -= t2[32];
        /* Add d back, at most twice, while the top digit is non-zero. */
        if (t1[32 + i] != 0) {
            t1[32 + i] += sp_2048_add_32(&t1[i], &t1[i], d);
            if (t1[32 + i] != 0)
                t1[32 + i] += sp_2048_add_32(&t1[i], &t1[i], d);
        }
    }

    /* Final correction: subtract the divisor once more when the remainder is
     * still not below it. The subtrahend must be d itself; t2 still holds
     * d * r1 from the last loop iteration. */
    r1 = sp_2048_cmp_32(t1, d) >= 0;
    sp_2048_cond_sub_32(r, t1, d, (sp_digit)0 - r1);

    return MP_OKAY;
}

/* Reduce a modulo m into r. (r = a mod m)
 *
 * Thin wrapper over sp_2048_div_32_cond that passes NULL for the unused
 * multiplier output and keeps only the remainder.
 *
 * r  A single precision number that is the reduced result.
 * a  A single precision number that is to be reduced.
 * m  A single precision number that is the modulus to reduce with.
 * returns MP_OKAY indicating success.
 */
static INLINE int sp_2048_mod_32_cond(sp_digit* r, sp_digit* a, sp_digit* m)
{
    return sp_2048_div_32_cond(a, m, NULL, r);
}

#if defined(SP_RSA_PRIVATE_EXP_D) || defined(WOLFSSL_HAVE_SP_DH)
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Fixed 5-bit window exponentiation in Montgomery form: a table of the 32
 * powers a^0..a^31 is built, then the exponent is consumed five bits at a
 * time from the most significant word down. Any bits left over at the bottom
 * (fewer than five) are processed as one final, shorter window.
 *
 * r        A single precision number that is the result of the operation.
 *          Used as a 64 digit working buffer.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When non-zero, a is first reduced modulo m (a is then read as a
 *          64 digit number); when zero, 32 digits of a are used as is.
 * returns 0 on success and MEMORY_E on dynamic memory allocation failure.
 */
static int sp_2048_mod_exp_32(sp_digit* r, sp_digit* a, sp_digit* e,
        int bits, sp_digit* m, int reduceA)
{
#ifndef WOLFSSL_SMALL_STACK
    sp_digit t[32][64];
#else
    sp_digit* t[32];
    sp_digit* td;
#endif
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int i;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 64, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        for (i=0; i<32; i++)
            t[i] = td + i * 64;
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_2048_mont_setup(m, &mp);
        /* t[0] doubles as the table entry for a window value of zero. */
        sp_2048_mont_norm_32(norm, m);

        /* t[1] = a * 2^2048 mod m: place a in the high half of a zeroed
         * 64 digit number and reduce. */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 32);
        if (reduceA) {
            err = sp_2048_mod_32(t[1] + 32, a, m);
            if (err == MP_OKAY)
                err = sp_2048_mod_32(t[1], t[1], m);
        }
        else {
            XMEMCPY(t[1] + 32, a, sizeof(sp_digit) * 32);
            err = sp_2048_mod_32(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Fill in the remaining table entries: t[k] = a^k. */
        sp_2048_mont_sqr_32(t[ 2], t[ 1], m, mp);
        sp_2048_mont_mul_32(t[ 3], t[ 2], t[ 1], m, mp);
        sp_2048_mont_sqr_32(t[ 4], t[ 2], m, mp);
        sp_2048_mont_mul_32(t[ 5], t[ 3], t[ 2], m, mp);
        sp_2048_mont_sqr_32(t[ 6], t[ 3], m, mp);
        sp_2048_mont_mul_32(t[ 7], t[ 4], t[ 3], m, mp);
        sp_2048_mont_sqr_32(t[ 8], t[ 4], m, mp);
        sp_2048_mont_mul_32(t[ 9], t[ 5], t[ 4], m, mp);
        sp_2048_mont_sqr_32(t[10], t[ 5], m, mp);
        sp_2048_mont_mul_32(t[11], t[ 6], t[ 5], m, mp);
        sp_2048_mont_sqr_32(t[12], t[ 6], m, mp);
        sp_2048_mont_mul_32(t[13], t[ 7], t[ 6], m, mp);
        sp_2048_mont_sqr_32(t[14], t[ 7], m, mp);
        sp_2048_mont_mul_32(t[15], t[ 8], t[ 7], m, mp);
        sp_2048_mont_sqr_32(t[16], t[ 8], m, mp);
        sp_2048_mont_mul_32(t[17], t[ 9], t[ 8], m, mp);
        sp_2048_mont_sqr_32(t[18], t[ 9], m, mp);
        sp_2048_mont_mul_32(t[19], t[10], t[ 9], m, mp);
        sp_2048_mont_sqr_32(t[20], t[10], m, mp);
        sp_2048_mont_mul_32(t[21], t[11], t[10], m, mp);
        sp_2048_mont_sqr_32(t[22], t[11], m, mp);
        sp_2048_mont_mul_32(t[23], t[12], t[11], m, mp);
        sp_2048_mont_sqr_32(t[24], t[12], m, mp);
        sp_2048_mont_mul_32(t[25], t[13], t[12], m, mp);
        sp_2048_mont_sqr_32(t[26], t[13], m, mp);
        sp_2048_mont_mul_32(t[27], t[14], t[13], m, mp);
        sp_2048_mont_sqr_32(t[28], t[14], m, mp);
        sp_2048_mont_mul_32(t[29], t[15], t[14], m, mp);
        sp_2048_mont_sqr_32(t[30], t[15], m, mp);
        sp_2048_mont_mul_32(t[31], t[16], t[15], m, mp);

        /* n holds the unprocessed bits of the current exponent word, left
         * aligned; c counts how many of them remain. */
        i = (bits - 1) / 64;
        n = e[i--];
        y = n >> 59;
        n <<= 5;
        c = 59;
        XMEMCPY(r, t[y], sizeof(sp_digit) * 32);
        for (; i>=0 || c>=5; ) {
            if (c == 0) {
                /* Current word exhausted: start on the next one. */
                n = e[i--];
                y = n >> 59;
                n <<= 5;
                c = 59;
            }
            else if (c < 5) {
                /* Window straddles two exponent words. */
                y = n >> 59;
                n = e[i--];
                c = 5 - c;
                y |= n >> (64 - c);
                n <<= c;
                c = 64 - c;
            }
            else {
                y = (n >> 59) & 0x1f;
                n <<= 5;
                c -= 5;
            }

            sp_2048_mont_sqr_32(r, r, m, mp);
            sp_2048_mont_sqr_32(r, r, m, mp);
            sp_2048_mont_sqr_32(r, r, m, mp);
            sp_2048_mont_sqr_32(r, r, m, mp);
            sp_2048_mont_sqr_32(r, r, m, mp);

            sp_2048_mont_mul_32(r, r, t[y], m, mp);
        }

        /* The loop stops with i < 0 and 0 <= c < 5 bits still unprocessed in
         * the top of n. How many depends on the number of exponent words
         * (three for a full 32 word exponent), so use c rather than a fixed
         * width: square once per remaining bit, then multiply. */
        if (c > 0) {
            y = (int)((n >> (64 - c)) & (((sp_digit)1 << c) - 1));
            for (; c > 0; c--)
                sp_2048_mont_sqr_32(r, r, m, mp);
            sp_2048_mont_mul_32(r, r, t[y], m, mp);
        }

        /* Convert out of Montgomery form and make sure the result is < m. */
        XMEMSET(&r[32], 0, sizeof(sp_digit) * 32);
        sp_2048_mont_reduce_32(r, m, mp);

        mask = 0 - (sp_2048_cmp_32(r, m) >= 0);
        sp_2048_cond_sub_32(r, r, m, mask);
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#endif /* SP_RSA_PRIVATE_EXP_D || WOLFSSL_HAVE_SP_DH */

/* Reduce the number back to 2048 bits using Montgomery reduction.
 *
 * Runs 32 rounds. In round i the multiplier mu = a[i] * mp (low 64 bits) is
 * formed and mu * m is added into a[i..i+31] using MULX with two independent
 * carry chains (ADCX for the low product words, ADOX for the high ones). The
 * carry out of a[i+32] is kept in ca and fed into the next round.
 * After the last round the modulus is conditionally subtracted from the upper
 * 32 digits and the result is written to the lower 32 digits.
 *
 * a   A single precision number to reduce in place. 64 digits are accessed;
 *     the reduced value ends up in the first 32 digits.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
/* NOTE(review): the asm uses a fixed label name; SP_NOINLINE presumably keeps
 * the body from being emitted more than once - confirm before removing it. */
SP_NOINLINE static void sp_2048_mont_reduce_avx2_32(sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    /* Carry out of the top word of a round, added back in the next round. */
    sp_digit ca = 0;

    /* r12 carries the current a[i] between rounds; a[i] and a[i+1] are never
     * stored back inside the loop, only words a[i+2] and above are. */
    __asm__ __volatile__ (
        "# i = 0\n\t"
        "movq	0(%[a]), %%r12\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "\nL_mont_loop_avx2_32:\n\t"
        "xorq	%%r9, %%r9\n\t"
        "movq	%%r12, %%r10\n\t"
        "# mu = a[i] * mp\n\t"
        "movq	%%r12, %%rdx\n\t"
        "mulxq	%[mp], %%rdx, %%r8\n\t"
        "# a[i+0] += m[0] * mu\n\t"
        "mulxq	0(%[m]), %%rax, %%r8\n\t"
        "movq	8(%[a]), %%r12\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r12\n\t"
        "# a[i+1] += m[1] * mu\n\t"
        "mulxq	8(%[m]), %%rax, %%r8\n\t"
        "movq	16(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "# a[i+2] += m[2] * mu\n\t"
        "mulxq	16(%[m]), %%rax, %%r8\n\t"
        "movq	24(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 16(%[a])\n\t"
        "# a[i+3] += m[3] * mu\n\t"
        "mulxq	24(%[m]), %%rax, %%r8\n\t"
        "movq	32(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 24(%[a])\n\t"
        "# a[i+4] += m[4] * mu\n\t"
        "mulxq	32(%[m]), %%rax, %%r8\n\t"
        "movq	40(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 32(%[a])\n\t"
        "# a[i+5] += m[5] * mu\n\t"
        "mulxq	40(%[m]), %%rax, %%r8\n\t"
        "movq	48(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 40(%[a])\n\t"
        "# a[i+6] += m[6] * mu\n\t"
        "mulxq	48(%[m]), %%rax, %%r8\n\t"
        "movq	56(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 48(%[a])\n\t"
        "# a[i+7] += m[7] * mu\n\t"
        "mulxq	56(%[m]), %%rax, %%r8\n\t"
        "movq	64(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 56(%[a])\n\t"
        "# a[i+8] += m[8] * mu\n\t"
        "mulxq	64(%[m]), %%rax, %%r8\n\t"
        "movq	72(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 64(%[a])\n\t"
        "# a[i+9] += m[9] * mu\n\t"
        "mulxq	72(%[m]), %%rax, %%r8\n\t"
        "movq	80(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 72(%[a])\n\t"
        "# a[i+10] += m[10] * mu\n\t"
        "mulxq	80(%[m]), %%rax, %%r8\n\t"
        "movq	88(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 80(%[a])\n\t"
        "# a[i+11] += m[11] * mu\n\t"
        "mulxq	88(%[m]), %%rax, %%r8\n\t"
        "movq	96(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 88(%[a])\n\t"
        "# a[i+12] += m[12] * mu\n\t"
        "mulxq	96(%[m]), %%rax, %%r8\n\t"
        "movq	104(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 96(%[a])\n\t"
        "# a[i+13] += m[13] * mu\n\t"
        "mulxq	104(%[m]), %%rax, %%r8\n\t"
        "movq	112(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 104(%[a])\n\t"
        "# a[i+14] += m[14] * mu\n\t"
        "mulxq	112(%[m]), %%rax, %%r8\n\t"
        "movq	120(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 112(%[a])\n\t"
        "# a[i+15] += m[15] * mu\n\t"
        "mulxq	120(%[m]), %%rax, %%r8\n\t"
        "movq	128(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 120(%[a])\n\t"
        "# a[i+16] += m[16] * mu\n\t"
        "mulxq	128(%[m]), %%rax, %%r8\n\t"
        "movq	136(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 128(%[a])\n\t"
        "# a[i+17] += m[17] * mu\n\t"
        "mulxq	136(%[m]), %%rax, %%r8\n\t"
        "movq	144(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 136(%[a])\n\t"
        "# a[i+18] += m[18] * mu\n\t"
        "mulxq	144(%[m]), %%rax, %%r8\n\t"
        "movq	152(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 144(%[a])\n\t"
        "# a[i+19] += m[19] * mu\n\t"
        "mulxq	152(%[m]), %%rax, %%r8\n\t"
        "movq	160(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 152(%[a])\n\t"
        "# a[i+20] += m[20] * mu\n\t"
        "mulxq	160(%[m]), %%rax, %%r8\n\t"
        "movq	168(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 160(%[a])\n\t"
        "# a[i+21] += m[21] * mu\n\t"
        "mulxq	168(%[m]), %%rax, %%r8\n\t"
        "movq	176(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 168(%[a])\n\t"
        "# a[i+22] += m[22] * mu\n\t"
        "mulxq	176(%[m]), %%rax, %%r8\n\t"
        "movq	184(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 176(%[a])\n\t"
        "# a[i+23] += m[23] * mu\n\t"
        "mulxq	184(%[m]), %%rax, %%r8\n\t"
        "movq	192(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 184(%[a])\n\t"
        "# a[i+24] += m[24] * mu\n\t"
        "mulxq	192(%[m]), %%rax, %%r8\n\t"
        "movq	200(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 192(%[a])\n\t"
        "# a[i+25] += m[25] * mu\n\t"
        "mulxq	200(%[m]), %%rax, %%r8\n\t"
        "movq	208(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 200(%[a])\n\t"
        "# a[i+26] += m[26] * mu\n\t"
        "mulxq	208(%[m]), %%rax, %%r8\n\t"
        "movq	216(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 208(%[a])\n\t"
        "# a[i+27] += m[27] * mu\n\t"
        "mulxq	216(%[m]), %%rax, %%r8\n\t"
        "movq	224(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 216(%[a])\n\t"
        "# a[i+28] += m[28] * mu\n\t"
        "mulxq	224(%[m]), %%rax, %%r8\n\t"
        "movq	232(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 224(%[a])\n\t"
        "# a[i+29] += m[29] * mu\n\t"
        "mulxq	232(%[m]), %%rax, %%r8\n\t"
        "movq	240(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 232(%[a])\n\t"
        "# a[i+30] += m[30] * mu\n\t"
        "mulxq	240(%[m]), %%rax, %%r8\n\t"
        "movq	248(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 240(%[a])\n\t"
        "# a[i+31] += m[31] * mu\n\t"
        "mulxq	248(%[m]), %%rax, %%r8\n\t"
        "movq	256(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 248(%[a])\n\t"
        "adcxq	%[ca], %%r10\n\t"
        "movq	%%r9, %[ca]\n\t"
        "adoxq	%%r9, %[ca]\n\t"
        "adcxq	%%r9, %[ca]\n\t"
        "movq	%%r10, 256(%[a])\n\t"
        "# i += 1\n\t"
        "addq	$8, %[a]\n\t"
        "addq	$1, %%rcx\n\t"
        "cmpq	$32, %%rcx\n\t"
        "jl	L_mont_loop_avx2_32\n\t"
        "movq	%%r12, 0(%[a])\n\t"
        : [ca] "+r" (ca), [a] "+r" (a)
        : [m] "r" (m), [mp] "r" (mp)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10", "r11", "r12"
    );

    /* The asm advanced a by one digit per round, so a now addresses digit 32
     * of the caller's buffer. Subtract m from that upper half when the final
     * carry is set (mask = 0 - ca) and store the result in the lower half. */
    sp_2048_cond_sub_32(a - 32, a, m, (sp_digit)0 - ca);
}

/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * The full product is formed first and then Montgomery reduced in place, so
 * r is used as a 64 digit work area; the result is left in its first 32
 * digits.
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_2048_mont_mul_avx2_32(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit* m, sp_digit mp)
{
    sp_2048_mul_avx2_32(r, a, b);
    sp_2048_mont_reduce_avx2_32(r, m, mp);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * The full square is formed first and then Montgomery reduced in place, so
 * r is used as a 64 digit work area; the result is left in its first 32
 * digits.
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_2048_mont_sqr_avx2_32(sp_digit* r, sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    sp_2048_sqr_avx2_32(r, a);
    sp_2048_mont_reduce_avx2_32(r, m, mp);
}

#if defined(SP_RSA_PRIVATE_EXP_D) || defined(WOLFSSL_HAVE_SP_DH)
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Fixed 5-bit window exponentiation in Montgomery form. A table of the 32
 * powers a^0..a^31 is built, then the exponent is consumed five bits at a
 * time starting from the top of its most significant 64-bit word. Whatever
 * is left over at the bottom (0 to 4 bits, depending on how many words the
 * exponent occupies) is handled as a final, shorter window.
 *
 * r        A single precision number that is the result of the operation.
 *          Used as a 64 digit work area; the result is in the first 32.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When non-zero, a is reduced modulo m before being converted to
 *          Montgomery form; otherwise a is used as given.
 * returns 0 on success, MEMORY_E on dynamic memory allocation failure, or
 * the error returned by the modular reduction.
 */
static int sp_2048_mod_exp_avx2_32(sp_digit* r, sp_digit* a, sp_digit* e,
        int bits, sp_digit* m, int reduceA)
{
#ifndef WOLFSSL_SMALL_STACK
    sp_digit t[32][64];
#else
    sp_digit* t[32];
    sp_digit* td;
#endif
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int i;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 64, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        for (i=0; i<32; i++)
            t[i] = td + i * 64;
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_2048_mont_setup(m, &mp);
        /* t[0] doubles as the table entry for a zero window. */
        sp_2048_mont_norm_32(norm, m);

        /* t[1] = a * 2^2048 mod m: a is placed in the upper half of a
         * zeroed 64 digit value which is then reduced. */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 32);
        if (reduceA) {
            err = sp_2048_mod_32(t[1] + 32, a, m);
            if (err == MP_OKAY)
                err = sp_2048_mod_32(t[1], t[1], m);
        }
        else {
            XMEMCPY(t[1] + 32, a, sizeof(sp_digit) * 32);
            err = sp_2048_mod_32(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* t[k] = a^k: even entries by squaring, odd ones by multiplying. */
        sp_2048_mont_sqr_avx2_32(t[ 2], t[ 1], m, mp);
        sp_2048_mont_mul_avx2_32(t[ 3], t[ 2], t[ 1], m, mp);
        sp_2048_mont_sqr_avx2_32(t[ 4], t[ 2], m, mp);
        sp_2048_mont_mul_avx2_32(t[ 5], t[ 3], t[ 2], m, mp);
        sp_2048_mont_sqr_avx2_32(t[ 6], t[ 3], m, mp);
        sp_2048_mont_mul_avx2_32(t[ 7], t[ 4], t[ 3], m, mp);
        sp_2048_mont_sqr_avx2_32(t[ 8], t[ 4], m, mp);
        sp_2048_mont_mul_avx2_32(t[ 9], t[ 5], t[ 4], m, mp);
        sp_2048_mont_sqr_avx2_32(t[10], t[ 5], m, mp);
        sp_2048_mont_mul_avx2_32(t[11], t[ 6], t[ 5], m, mp);
        sp_2048_mont_sqr_avx2_32(t[12], t[ 6], m, mp);
        sp_2048_mont_mul_avx2_32(t[13], t[ 7], t[ 6], m, mp);
        sp_2048_mont_sqr_avx2_32(t[14], t[ 7], m, mp);
        sp_2048_mont_mul_avx2_32(t[15], t[ 8], t[ 7], m, mp);
        sp_2048_mont_sqr_avx2_32(t[16], t[ 8], m, mp);
        sp_2048_mont_mul_avx2_32(t[17], t[ 9], t[ 8], m, mp);
        sp_2048_mont_sqr_avx2_32(t[18], t[ 9], m, mp);
        sp_2048_mont_mul_avx2_32(t[19], t[10], t[ 9], m, mp);
        sp_2048_mont_sqr_avx2_32(t[20], t[10], m, mp);
        sp_2048_mont_mul_avx2_32(t[21], t[11], t[10], m, mp);
        sp_2048_mont_sqr_avx2_32(t[22], t[11], m, mp);
        sp_2048_mont_mul_avx2_32(t[23], t[12], t[11], m, mp);
        sp_2048_mont_sqr_avx2_32(t[24], t[12], m, mp);
        sp_2048_mont_mul_avx2_32(t[25], t[13], t[12], m, mp);
        sp_2048_mont_sqr_avx2_32(t[26], t[13], m, mp);
        sp_2048_mont_mul_avx2_32(t[27], t[14], t[13], m, mp);
        sp_2048_mont_sqr_avx2_32(t[28], t[14], m, mp);
        sp_2048_mont_mul_avx2_32(t[29], t[15], t[14], m, mp);
        sp_2048_mont_sqr_avx2_32(t[30], t[15], m, mp);
        sp_2048_mont_mul_avx2_32(t[31], t[16], t[15], m, mp);

        /* n holds the not yet consumed exponent bits, left aligned;
         * c is how many of them are still valid. */
        i = (bits - 1) / 64;
        n = e[i--];
        y = n >> 59;
        n <<= 5;
        c = 59;
        XMEMCPY(r, t[y], sizeof(sp_digit) * 32);
        for (; i>=0 || c>=5; ) {
            if (c == 0) {
                /* Current word used up exactly: start on the next one. */
                n = e[i--];
                y = n >> 59;
                n <<= 5;
                c = 59;
            }
            else if (c < 5) {
                /* Window straddles two words: top up from the next word. */
                y = n >> 59;
                n = e[i--];
                c = 5 - c;
                y |= n >> (64 - c);
                n <<= c;
                c = 64 - c;
            }
            else {
                y = (n >> 59) & 0x1f;
                n <<= 5;
                c -= 5;
            }

            sp_2048_mont_sqr_avx2_32(r, r, m, mp);
            sp_2048_mont_sqr_avx2_32(r, r, m, mp);
            sp_2048_mont_sqr_avx2_32(r, r, m, mp);
            sp_2048_mont_sqr_avx2_32(r, r, m, mp);
            sp_2048_mont_sqr_avx2_32(r, r, m, mp);

            sp_2048_mont_mul_avx2_32(r, r, t[y], m, mp);
        }

        /* The loop stops with every word loaded and c (0..4) bits left.
         * Those are the lowest c bits of e[0]; the count depends on the
         * number of exponent words, so it must not be assumed to be 3. */
        y = (int)(e[0] & ((((sp_digit)1) << c) - 1));
        for (; c > 0; c--)
            sp_2048_mont_sqr_avx2_32(r, r, m, mp);
        sp_2048_mont_mul_avx2_32(r, r, t[y], m, mp);

        /* Convert out of Montgomery form: reduce r with a zero upper half. */
        XMEMSET(&r[32], 0, sizeof(sp_digit) * 32);
        sp_2048_mont_reduce_avx2_32(r, m, mp);

        /* Final conditional subtraction when r >= m. */
        mask = 0 - (sp_2048_cmp_32(r, m) >= 0);
        sp_2048_cond_sub_32(r, r, m, mask);
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#endif /* SP_RSA_PRIVATE_EXP_D || WOLFSSL_HAVE_SP_DH */

#ifdef WOLFSSL_HAVE_SP_RSA
/* RSA public key operation.
 *
 * Computes in^em mod mm. An exponent of 3 is handled with one square and one
 * multiply, each followed by a reduction. Any other exponent (up to 64 bits)
 * uses left-to-right binary square-and-multiply in Montgomery form.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base.
 * em      Public exponent.
 * mm      Modulus.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 256 bytes long.
 * outLen  Number of bytes in result.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long, MP_EXPTMOD_E when the exponent is zero and MEMORY_E
 * when dynamic memory allocation fails.
 */
int sp_RsaPublic_2048(const byte* in, word32 inLen, mp_int* em, mp_int* mm,
    byte* out, word32* outLen)
{
#if !defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SMALL_STACK)
    sp_digit ad[64], md[32], rd[64];
#else
    sp_digit* d = NULL;
#endif
    sp_digit* a;
    sp_digit *ah;
    sp_digit* m;
    sp_digit* r;
    sp_digit e[1];
    int err = MP_OKAY;
#ifdef HAVE_INTEL_AVX2
    word32 cpuid_flags = cpuid_get_flags();
#endif

    if (*outLen < 256)
        err = MP_TO_E;
    if (err == MP_OKAY && (mp_count_bits(em) > 64 || inLen > 256 ||
                                                     mp_count_bits(mm) != 2048))
        err = MP_READ_E;

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 5, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Layout: a (64 digits) | r (64 digits) | m (32 digits). */
        a = d;
        r = a + 32 * 2;
        m = r + 32 * 2;
        ah = a + 32;
    }
#else
    a = ad;
    m = md;
    r = rd;
    ah = a + 32;
#endif

    if (err == MP_OKAY) {
        /* The base goes in the upper half of a so that, with the lower half
         * zeroed, a reads as in * 2^2048. */
        sp_2048_from_bin(ah, 32, in, inLen);
#if DIGIT_BIT >= 64
        e[0] = (em->used > 0) ? (sp_digit)em->dp[0] : 0;
#else
        {
            int k;

            /* Exponents of up to 64 bits are accepted above, so gather as
             * many mp digits as are needed to cover 64 bits. */
            e[0] = 0;
            for (k = 0; k < em->used && k * DIGIT_BIT < 64; k++)
                e[0] |= ((sp_digit)em->dp[k]) << (k * DIGIT_BIT);
        }
#endif
        if (e[0] == 0)
            err = MP_EXPTMOD_E;
    }
    if (err == MP_OKAY) {
        sp_2048_from_mp(m, 32, mm);

        if (e[0] == 0x3) {
            /* r = in^2 mod m, then r = in * r mod m. */
#ifdef HAVE_INTEL_AVX2
            if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) {
                if (err == MP_OKAY) {
                    sp_2048_sqr_avx2_32(r, ah);
                    err = sp_2048_mod_32_cond(r, r, m);
                }
                if (err == MP_OKAY) {
                    sp_2048_mul_avx2_32(r, ah, r);
                    err = sp_2048_mod_32_cond(r, r, m);
                }
            }
            else
#endif
            {
                if (err == MP_OKAY) {
                    sp_2048_sqr_32(r, ah);
                    err = sp_2048_mod_32_cond(r, r, m);
                }
                if (err == MP_OKAY) {
                    sp_2048_mul_32(r, ah, r);
                    err = sp_2048_mod_32_cond(r, r, m);
                }
            }
        }
        else {
            int i;
            sp_digit mp;

            sp_2048_mont_setup(m, &mp);

            /* Convert to Montgomery form. */
            XMEMSET(a, 0, sizeof(sp_digit) * 32);
            err = sp_2048_mod_32_cond(a, a, m);

            if (err == MP_OKAY) {
                /* Find the most significant set bit of the exponent. */
                for (i=63; i>=0; i--)
                    if (e[0] >> i)
                        break;

                /* Top bit is accounted for by starting with r = a. */
                XMEMCPY(r, a, sizeof(sp_digit) * 32);
#ifdef HAVE_INTEL_AVX2
                if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) {
                    for (i--; i>=0; i--) {
                        sp_2048_mont_sqr_avx2_32(r, r, m, mp);
                        if (((e[0] >> i) & 1) == 1)
                            sp_2048_mont_mul_avx2_32(r, r, a, m, mp);
                    }
                    /* Convert out of Montgomery form. */
                    XMEMSET(&r[32], 0, sizeof(sp_digit) * 32);
                    sp_2048_mont_reduce_avx2_32(r, m, mp);
                }
                else
#endif
                {
                    for (i--; i>=0; i--) {
                        sp_2048_mont_sqr_32(r, r, m, mp);
                        if (((e[0] >> i) & 1) == 1)
                            sp_2048_mont_mul_32(r, r, a, m, mp);
                    }
                    /* Convert out of Montgomery form. */
                    XMEMSET(&r[32], 0, sizeof(sp_digit) * 32);
                    sp_2048_mont_reduce_32(r, m, mp);
                }

                /* Compare r with m from the top digit down and subtract m
                 * once when r >= m. */
                for (i = 31; i > 0; i--) {
                    if (r[i] != m[i])
                        break;
                }
                if (r[i] >= m[i])
                    sp_2048_sub_in_place_32(r, m);
            }
        }
    }

    if (err == MP_OKAY) {
        sp_2048_to_bin(r, out);
        *outLen = 256;
    }

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (d != NULL)
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

/* RSA private key operation.
 *
 * Uses the Chinese Remainder Theorem form of the private key:
 *   tmpa = in^dp mod p, tmpb = in^dq mod q,
 *   h    = qi * (tmpa - tmpb) mod p,
 *   out  = tmpb + q * h.
 * The private exponent dm itself is not used.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base.
 * dm      Private exponent.
 * pm      First prime.
 * qm      Second prime.
 * dpm     First prime's CRT exponent.
 * dqm     Second prime's CRT exponent.
 * qim     Inverse of second prime mod p.
 * mm      Modulus.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 256 bytes long.
 * outLen  Number of bytes in result.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long and MEMORY_E when dynamic memory allocation fails.
 */
int sp_RsaPrivate_2048(const byte* in, word32 inLen, mp_int* dm,
    mp_int* pm, mp_int* qm, mp_int* dpm, mp_int* dqm, mp_int* qim, mp_int* mm,
    byte* out, word32* outLen)
{
#if !defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SMALL_STACK)
    sp_digit ad[32 * 2];
    sp_digit pd[16], qd[16], dpd[16];
    sp_digit tmpad[32], tmpbd[32];
#else
    sp_digit* t = NULL;
#endif
    sp_digit* a;
    sp_digit* p;
    sp_digit* q;
    sp_digit* dp;
    sp_digit* dq;
    sp_digit* qi;
    sp_digit* tmp;
    sp_digit* tmpa;
    sp_digit* tmpb;
    sp_digit* r;
    sp_digit c;
    int err = MP_OKAY;
#ifdef HAVE_INTEL_AVX2
    word32 cpuid_flags = cpuid_get_flags();
#endif

    (void)dm;

    if (*outLen < 256)
        err = MP_TO_E;
    if (err == MP_OKAY && (inLen > 256 || mp_count_bits(mm) != 2048))
        err = MP_READ_E;

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (err == MP_OKAY) {
        t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 11, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (t == NULL)
            err = MEMORY_E;
    }
    if (err == MP_OKAY) {
        /* Layout in digits: a 64 | p 16 | q 16 | dp/dq/qi 16 (shared) |
         * tmpa 32 | tmpb 32. tmp and r reuse the space of a once the base
         * is no longer needed. */
        a = t;
        p = a + 32 * 2;
        q = p + 16;
        qi = dq = dp = q + 16;
        tmpa = qi + 16;
        tmpb = tmpa + 32;

        tmp = t;
        r = tmp + 32;
    }
#else
    /* dp, dq and qi share one buffer as they are needed one after another;
     * tmp and r reuse ad. */
    r = a = ad;
    p = pd;
    q = qd;
    qi = dq = dp = dpd;
    tmpa = tmpad;
    tmpb = tmpbd;
    tmp = a + 32;
#endif

    if (err == MP_OKAY) {
        sp_2048_from_bin(a, 32, in, inLen);
        sp_2048_from_mp(p, 16, pm);
        sp_2048_from_mp(q, 16, qm);
        sp_2048_from_mp(dp, 16, dpm);

        /* tmpa = in^dp mod p */
#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            err = sp_2048_mod_exp_avx2_16(tmpa, a, dp, 1024, p, 1);
        else
#endif
            err = sp_2048_mod_exp_16(tmpa, a, dp, 1024, p, 1);
    }
    if (err == MP_OKAY) {
        /* tmpb = in^dq mod q */
        sp_2048_from_mp(dq, 16, dqm);
#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            err = sp_2048_mod_exp_avx2_16(tmpb, a, dq, 1024, q, 1);
       else
#endif
            err = sp_2048_mod_exp_16(tmpb, a, dq, 1024, q, 1);
    }

    if (err == MP_OKAY) {
        /* tmpa = tmpa - tmpb, adding p back when the subtraction borrowed.
         * c is used as the mask selecting p or zero.
         * NOTE(review): a single conditional add of p only brings the
         * difference back into range when tmpb < tmpa + p; confirm that
         * keys with q > p cannot reach this code. */
        c = sp_2048_sub_in_place_16(tmpa, tmpb);
        sp_2048_mask_16(tmp, p, c);
        sp_2048_add_16(tmpa, tmpa, tmp);

        /* tmpa = (tmpa * qi) mod p */
        sp_2048_from_mp(qi, 16, qim);
#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            sp_2048_mul_avx2_16(tmpa, tmpa, qi);
        else
#endif
            sp_2048_mul_16(tmpa, tmpa, qi);
        err = sp_2048_mod_16(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        /* r = tmpb + q * tmpa */
#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            sp_2048_mul_avx2_16(tmpa, q, tmpa);
        else
#endif
            sp_2048_mul_16(tmpa, q, tmpa);
        XMEMSET(&tmpb[16], 0, sizeof(sp_digit) * 16);
        sp_2048_add_32(r, tmpb, tmpa);

        sp_2048_to_bin(r, out);
        *outLen = 256;
    }

    /* Wipe everything derived from the private key.
     * NOTE(review): a plain XMEMSET of locals right before return may be
     * dropped by an optimizing compiler; consider a non-elidable wipe. */
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (t != NULL) {
        XMEMSET(t, 0, sizeof(sp_digit) * 16 * 11);
        XFREE(t, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }
#else
    /* ad must be cleared too: tmp aliases its upper half and was given the
     * masked copy of p, and its lower half holds the result. */
    XMEMSET(ad, 0, sizeof(ad));
    XMEMSET(tmpad, 0, sizeof(tmpad));
    XMEMSET(tmpbd, 0, sizeof(tmpbd));
    XMEMSET(pd, 0, sizeof(pd));
    XMEMSET(qd, 0, sizeof(qd));
    XMEMSET(dpd, 0, sizeof(dpd));
#endif

    return err;
}
#endif /* WOLFSSL_HAVE_SP_RSA */
#ifdef WOLFSSL_HAVE_SP_DH
/* Convert an array of sp_digit to an mp_int.
 *
 * The 32 64-bit digits of a are repacked into DIGIT_BIT sized digits of r,
 * least significant first, and r is clamped afterwards.
 *
 * a  A single precision integer.
 * r  A multi-precision integer.
 * returns the result of growing r: MP_OKAY on success.
 */
static int sp_2048_to_mp(sp_digit* a, mp_int* r)
{
    int err;

    err = mp_grow(r, (2048 + DIGIT_BIT - 1) / DIGIT_BIT);
    if (err == MP_OKAY) {
#if DIGIT_BIT == 64
        XMEMCPY(r->dp, a, sizeof(sp_digit) * 32);
        r->used = 32;
        mp_clamp(r);
#elif DIGIT_BIT < 64
        int j;

        /* Build each output digit from the one or two input words it spans.
         * Working per output digit keeps every shift below 64 and never
         * writes beyond the number of digits r was grown to, including when
         * DIGIT_BIT divides 64 exactly. */
        for (j = 0; j < (2048 + DIGIT_BIT - 1) / DIGIT_BIT; j++) {
            int bit = j * DIGIT_BIT;   /* position of this digit's bit 0 */
            int w = bit / 64;          /* input word holding that bit */
            int s = bit % 64;          /* offset of that bit in the word */
            sp_digit v = a[w] >> s;

            /* s is non-zero whenever the digit crosses into the next word. */
            if (s + DIGIT_BIT > 64 && w + 1 < 32)
                v |= a[w + 1] << (64 - s);
            r->dp[j] = (mp_digit)(v & ((((sp_digit)1) << DIGIT_BIT) - 1));
        }
        r->used = (2048 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#else
        int i, j = 0, s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 32; i++) {
            r->dp[j] |= ((mp_digit)a[i]) << s;
            if (s + 64 >= DIGIT_BIT) {
                s = DIGIT_BIT - s;
                r->dp[++j] = a[i] >> s;
                s = 64 - s;
            }
            else
                s += 64;
        }
        r->used = (2048 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#endif
    }

    return err;
}

/* Perform the modular exponentiation for Diffie-Hellman.
 *
 * Converts the operands to single precision form, runs the 2048-bit modular
 * exponentiation (AVX2 flavour when the CPU reports BMI2 and ADX) and
 * converts the result back. The local copy of the exponent is cleared
 * before returning.
 *
 * base  Base. MP integer.
 * exp   Exponent. MP integer.
 * mod   Modulus. MP integer.
 * res   Result. MP integer.
 * returns 0 on success, MP_READ_E if there are too many bytes in an array
 * and MEMORY_E if memory allocation fails.
 */
int sp_ModExp_2048(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res)
{
    int err = MP_OKAY;
    sp_digit b[64], e[32], m[32];
    sp_digit* r = b;
#ifdef HAVE_INTEL_AVX2
    word32 cpuid_flags = cpuid_get_flags();
#endif
    int expBits = mp_count_bits(exp);

    /* Base and exponent must fit in 2048 bits; the modulus must be exactly
     * 2048 bits long. */
    if (mp_count_bits(base) <= 2048 && expBits <= 2048 &&
                                               mp_count_bits(mod) == 2048) {
        sp_2048_from_mp(b, 32, base);
        sp_2048_from_mp(e, 32, exp);
        sp_2048_from_mp(m, 32, mod);

#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) {
            err = sp_2048_mod_exp_avx2_32(r, b, e, expBits, m, 0);
        }
        else
#endif
        {
            err = sp_2048_mod_exp_32(r, b, e, expBits, m, 0);
        }

        if (err == MP_OKAY) {
            err = sp_2048_to_mp(r, res);
        }
    }
    else {
        err = MP_READ_E;
    }

    XMEMSET(e, 0, sizeof(e));

    return err;
}

/* Perform the modular exponentiation for Diffie-Hellman.
 *
 * The exponent is supplied as big-endian bytes and the result is written as
 * big-endian bytes with leading zero bytes removed. The local copy of the
 * exponent is cleared before returning.
 *
 * base     Base.
 * exp      Array of bytes that is the exponent.
 * expLen   Length of data, in bytes, in exponent.
 * mod      Modulus.
 * out      Buffer to hold big-endian bytes of exponentiation result.
 *          Must be at least 256 bytes long.
 * outLen   Length, in bytes, of exponentiation result.
 * returns 0 on success, MP_READ_E if there are too many bytes in an array
 * and MEMORY_E if memory allocation fails.
 */
int sp_DhExp_2048(mp_int* base, const byte* exp, word32 expLen,
    mp_int* mod, byte* out, word32* outLen)
{
    int err = MP_OKAY;
    sp_digit b[64], e[32], m[32];
    sp_digit* r = b;
#ifdef HAVE_INTEL_AVX2
    word32 cpuid_flags = cpuid_get_flags();
#endif

    /* Base must fit in 2048 bits, the exponent in 256 bytes and the modulus
     * must be exactly 2048 bits long. */
    if (mp_count_bits(base) <= 2048 && expLen <= 256 &&
                                               mp_count_bits(mod) == 2048) {
        sp_2048_from_mp(b, 32, base);
        sp_2048_from_bin(e, 32, exp, expLen);
        sp_2048_from_mp(m, 32, mod);

#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) {
            err = sp_2048_mod_exp_avx2_32(r, b, e, expLen * 8, m, 0);
        }
        else
#endif
        {
            err = sp_2048_mod_exp_32(r, b, e, expLen * 8, m, 0);
        }

        if (err == MP_OKAY) {
            word32 skip = 0;

            sp_2048_to_bin(r, out);

            /* Drop leading zero bytes and shift the value to the front. */
            while (skip < 256 && out[skip] == 0)
                skip++;
            *outLen = 256 - skip;
            XMEMMOVE(out, out + skip, *outLen);
        }
    }
    else {
        err = MP_READ_E;
    }

    XMEMSET(e, 0, sizeof(e));

    return err;
}
#endif /* WOLFSSL_HAVE_SP_DH */

#endif /* WOLFSSL_SP_NO_2048 */
#endif /* SP_WORD_SIZE == 64 */

#endif
#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
#if SP_WORD_SIZE == 64
#ifndef WOLFSSL_SP_NO_3072
/* Read big endian unsigned byte array into r.
 *
 * Bytes are consumed from the end of the array (least significant first) and
 * packed eight to a digit. Input that does not fit in max digits is dropped
 * from the most significant end; unused digits are set to zero.
 *
 * r    A single precision integer.
 * max  Number of digits available in r.
 * a    Byte array.
 * n    Number of bytes in array to read.
 */
static void sp_3072_from_bin(sp_digit* r, int max, const byte* a, int n)
{
    int idx = 0;    /* digit currently being filled */
    int shift = 0;  /* bit position of the next byte within that digit */
    int k;

    r[0] = 0;
    for (k = n - 1; k >= 0; k--) {
        r[idx] |= ((sp_digit)a[k]) << shift;
        shift += 8;
        if (shift == 64) {
            /* Digit is full: move on, unless there is no room left. */
            if (idx + 1 >= max)
                break;
            r[++idx] = 0;
            shift = 0;
        }
    }

    /* Clear the digits that received no input. */
    for (idx++; idx < max; idx++)
        r[idx] = 0;
}

/* Convert an mp_int to an array of sp_digit.
 *
 * The digits of a are repacked into 64-bit digits, least significant first.
 * At most max digits are written; any part of a that does not fit is
 * dropped and unused digits of r are set to zero.
 *
 * r    A single precision integer.
 * max  Number of digits available in r.
 * a    A multi-precision integer.
 */
static void sp_3072_from_mp(sp_digit* r, int max, mp_int* a)
{
#if DIGIT_BIT == 64
    int j;
    int n = a->used;

    /* Never copy more digits than r can hold. */
    if (n > max)
        n = max;
    if (n > 0)
        XMEMCPY(r, a->dp, sizeof(sp_digit) * n);

    for (j = n; j < max; j++)
        r[j] = 0;
#elif DIGIT_BIT > 64
    int i, j = 0, s = 0;

    r[0] = 0;
    for (i = 0; i < a->used && j < max; i++) {
        r[j] |= a->dp[i] << s;
        r[j] &= 0xffffffffffffffffl;
        s = 64 - s;
        if (j + 1 >= max)
            break;
        r[++j] = a->dp[i] >> s;
        while (s + 64 <= DIGIT_BIT) {
            s += 64;
            r[j] &= 0xffffffffffffffffl;
            if (j + 1 >= max)
                break;
            if (s < DIGIT_BIT)
                r[++j] = a->dp[i] >> s;
            else
                r[++j] = 0;
        }
        s = DIGIT_BIT - s;
    }

    for (j++; j < max; j++)
        r[j] = 0;
#else
    int i, j = 0, s = 0;

    r[0] = 0;
    for (i = 0; i < a->used && j < max; i++) {
        r[j] |= ((sp_digit)a->dp[i]) << s;
        if (s + DIGIT_BIT >= 64) {
            r[j] &= 0xffffffffffffffffl;
            if (j + 1 >= max)
                break;
            s = 64 - s;
            if (s == DIGIT_BIT) {
                /* The mp digit ended exactly on the word boundary: nothing
                 * carries over, and shifting it by its full width would be
                 * undefined. */
                r[++j] = 0;
                s = 0;
            }
            else {
                r[++j] = a->dp[i] >> s;
                s = DIGIT_BIT - s;
            }
        }
        else
            s += DIGIT_BIT;
    }

    for (j++; j < max; j++)
        r[j] = 0;
#endif
}

/* Write r as big endian to byte array.
 * Fixed length number of bytes written: 384
 *
 * The 48 digits of r are emitted least significant digit last, eight bytes
 * per digit.
 *
 * r  A single precision integer.
 * a  Byte array.
 */
static void sp_3072_to_bin(sp_digit* r, byte* a)
{
    int pos = 3072 / 8 - 1;  /* index of the least significant output byte */
    int word;
    int octet;

    for (word = 0; word < 48; word++) {
        sp_digit v = r[word];

        /* Low byte of the digit goes to the highest remaining index. */
        for (octet = 0; octet < 8; octet++) {
            a[pos--] = (byte)(v >> (8 * octet));
        }
    }
}

/* Multiply a and b into r. (r = a * b)
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 */
static void sp_3072_mul_24(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    sp_digit tmp[24];

    __asm__ __volatile__ (
        "#  A[0] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "movq	%%rax, (%[tmp])\n\t"
        "movq	%%rdx, %%rcx\n\t"
        "#  A[0] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 8(%[tmp])\n\t"
        "#  A[0] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 16(%[tmp])\n\t"
        "#  A[0] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 24(%[tmp])\n\t"
        "#  A[0] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 32(%[tmp])\n\t"
        "#  A[0] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 40(%[tmp])\n\t"
        "#  A[0] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 48(%[tmp])\n\t"
        "#  A[0] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 56(%[tmp])\n\t"
        "#  A[0] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 64(%[tmp])\n\t"
        "#  A[0] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 72(%[tmp])\n\t"
        "#  A[0] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 80(%[tmp])\n\t"
        "#  A[0] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 88(%[tmp])\n\t"
        "#  A[0] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 96(%[tmp])\n\t"
        "#  A[0] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 104(%[tmp])\n\t"
        "#  A[0] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 112(%[tmp])\n\t"
        "#  A[0] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 120(%[tmp])\n\t"
        "#  A[0] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[16] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 128(%[tmp])\n\t"
        "#  A[0] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[16] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[17] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 136(%[tmp])\n\t"
        "#  A[0] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[16] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[17] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[18] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 144(%[tmp])\n\t"
        "#  A[0] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[16] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[17] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[18] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[19] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 152(%[tmp])\n\t"
        "#  A[0] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[16] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[17] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[18] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[19] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[20] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 160(%[tmp])\n\t"
        "#  A[0] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[16] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[17] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[18] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[19] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[20] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[21] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 168(%[tmp])\n\t"
        "#  A[0] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[16] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[17] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[18] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[19] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[20] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[21] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[22] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 176(%[tmp])\n\t"
        "#  A[0] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[16] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[17] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[18] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[19] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[20] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[21] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[22] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[23] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 184(%[tmp])\n\t"
        "#  A[1] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[16] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[17] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[18] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[19] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[20] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[21] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[22] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[23] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 192(%[r])\n\t"
        "#  A[2] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[16] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[17] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[18] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[19] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[20] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[21] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[22] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[23] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 200(%[r])\n\t"
        "#  A[3] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[16] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[17] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[18] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[19] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[20] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[21] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[22] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[23] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 208(%[r])\n\t"
        "#  A[4] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[16] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[17] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[18] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[19] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[20] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[21] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[22] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[23] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 216(%[r])\n\t"
        "#  A[5] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[16] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[17] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[18] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[19] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[20] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[21] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[22] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[23] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 224(%[r])\n\t"
        "#  A[6] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[16] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[17] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[18] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[19] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[20] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[21] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[22] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[23] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 232(%[r])\n\t"
        "#  A[7] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[16] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[17] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[18] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[19] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[20] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[21] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[22] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[23] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 240(%[r])\n\t"
        "#  A[8] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[16] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[17] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[18] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[19] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[20] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[21] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[22] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[23] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 248(%[r])\n\t"
        "#  A[9] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[16] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[17] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[18] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[19] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[20] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[21] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[22] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[23] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 256(%[r])\n\t"
        "#  A[10] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[16] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[17] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[18] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[19] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[20] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[21] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[22] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[23] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 264(%[r])\n\t"
        "#  A[11] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[16] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[17] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[18] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[19] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[20] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[21] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[22] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[23] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 272(%[r])\n\t"
        "#  A[12] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[16] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[17] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[18] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[19] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[20] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[21] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[22] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[23] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 280(%[r])\n\t"
        "#  A[13] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[16] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[17] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[18] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[19] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[20] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[21] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[22] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[23] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 288(%[r])\n\t"
        "#  A[14] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[16] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[17] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[18] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[19] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[20] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[21] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[22] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[23] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 296(%[r])\n\t"
        "#  A[15] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[16] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[17] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[18] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[19] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[20] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[21] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[22] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[23] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 304(%[r])\n\t"
        "#  A[16] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[17] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[18] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[19] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[20] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[21] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[22] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[23] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 312(%[r])\n\t"
        "#  A[17] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[18] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[19] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[20] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[21] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[22] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[23] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 320(%[r])\n\t"
        "#  A[18] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[19] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[20] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[21] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[22] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[23] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 328(%[r])\n\t"
        "#  A[19] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[20] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[21] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[22] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[23] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 336(%[r])\n\t"
        "#  A[20] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[21] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[22] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[23] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 344(%[r])\n\t"
        "#  A[21] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[22] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[23] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 352(%[r])\n\t"
        "#  A[22] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[23] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 360(%[r])\n\t"
        "#  A[23] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "movq	%%rcx, 368(%[r])\n\t"
        "movq	%%r8, 376(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [tmp] "r" (tmp)
        : "memory", "rax", "rdx", "rbx", "rcx", "r8"
    );

    XMEMCPY(r, tmp, sizeof(tmp));
}

/* Square a and put result in r. (r = a * a)
 *
 * r  A single precision integer that receives the result of the squaring.
 * a  A single precision integer to be squared.
 */
static void sp_3072_sqr_24(sp_digit* r, const sp_digit* a)
{
    sp_digit tmp[24];

    __asm__ __volatile__ (
        "#  A[0] * A[0]\n\t"
        "movq	0(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "movq	%%rax, (%[tmp])\n\t"
        "movq	%%rdx, %%r8\n\t"
        "#  A[0] * A[1]\n\t"
        "movq	8(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 8(%[tmp])\n\t"
        "#  A[0] * A[2]\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * A[1]\n\t"
        "movq	8(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%r9, 16(%[tmp])\n\t"
        "#  A[0] * A[3]\n\t"
        "movq	24(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "#  A[1] * A[2]\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "movq	%%rcx, 24(%[tmp])\n\t"
        "#  A[0] * A[4]\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * A[3]\n\t"
        "movq	24(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * A[2]\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 32(%[tmp])\n\t"
        "#  A[0] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[4]\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[3]\n\t"
        "movq	24(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 40(%[tmp])\n\t"
        "#  A[0] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[4]\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[3]\n\t"
        "movq	24(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 48(%[tmp])\n\t"
        "#  A[0] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[4]\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 56(%[tmp])\n\t"
        "#  A[0] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[4]\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 64(%[tmp])\n\t"
        "#  A[0] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 72(%[tmp])\n\t"
        "#  A[0] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 80(%[tmp])\n\t"
        "#  A[0] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 88(%[tmp])\n\t"
        "#  A[0] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 96(%[tmp])\n\t"
        "#  A[0] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 104(%[tmp])\n\t"
        "#  A[0] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 112(%[tmp])\n\t"
        "#  A[0] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 120(%[tmp])\n\t"
        "#  A[0] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 128(%[tmp])\n\t"
        "#  A[0] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 136(%[tmp])\n\t"
        "#  A[0] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 144(%[tmp])\n\t"
        "#  A[0] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 152(%[tmp])\n\t"
        "#  A[0] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 160(%[tmp])\n\t"
        "#  A[0] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 168(%[tmp])\n\t"
        "#  A[0] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 176(%[tmp])\n\t"
        "#  A[0] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 184(%[tmp])\n\t"
        "#  A[1] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[2] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 192(%[r])\n\t"
        "#  A[2] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[3] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 200(%[r])\n\t"
        "#  A[3] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[4] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[13] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 208(%[r])\n\t"
        "#  A[4] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[5] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[13] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 216(%[r])\n\t"
        "#  A[5] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[6] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[13] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[14] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 224(%[r])\n\t"
        "#  A[6] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[7] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[13] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[14] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 232(%[r])\n\t"
        "#  A[7] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[8] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[13] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[14] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[15] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 240(%[r])\n\t"
        "#  A[8] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[9] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[13] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[14] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[15] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 248(%[r])\n\t"
        "#  A[9] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[10] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[13] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[14] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[15] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[16] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 256(%[r])\n\t"
        "#  A[10] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[11] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[13] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[14] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[15] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[16] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 264(%[r])\n\t"
        "#  A[11] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[12] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[13] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[14] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[15] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[16] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[17] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 272(%[r])\n\t"
        "#  A[12] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[13] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[14] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[15] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[16] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[17] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 280(%[r])\n\t"
        "#  A[13] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[14] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[15] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[16] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[17] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[18] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 288(%[r])\n\t"
        "#  A[14] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[15] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[16] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[17] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[18] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 296(%[r])\n\t"
        "#  A[15] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[16] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[17] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[18] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[19] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 304(%[r])\n\t"
        "#  A[16] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[17] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[18] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[19] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 312(%[r])\n\t"
        "#  A[17] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[18] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[19] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[20] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 320(%[r])\n\t"
        "#  A[18] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[19] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[20] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 328(%[r])\n\t"
        "#  A[19] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "#  A[20] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "#  A[21] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "movq	%%rcx, 336(%[r])\n\t"
        "#  A[20] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[21] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 344(%[r])\n\t"
        "#  A[21] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[22] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%r9, 352(%[r])\n\t"
        "#  A[22] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "movq	%%rcx, 360(%[r])\n\t"
        "#  A[23] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "movq	%%r8, 368(%[r])\n\t"
        "movq	%%r9, 376(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [tmp] "r" (tmp)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10", "r11", "r12"
    );

    XMEMCPY(r, tmp, sizeof(tmp));
}

/* Multiply a and b into r. (r = a * b)
 *
 * r   Result of multiplication.
 * a   First number to multiply.
 * b   Second number to multiply.
 */
static void sp_3072_mul_avx2_24(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit tmp[2*24];

    __asm__ __volatile__ (
        "movq	0(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "# A[0] * B[0]\n\t"
        "mulx	0(%[b]), %%r10, %%r11\n\t"
        "# A[0] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "# A[0] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "# A[0] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "movq	%%r10, 0(%[t])\n\t"
        "movq	%%r11, 8(%[t])\n\t"
        "movq	%%r12, 16(%[t])\n\t"
        "movq	%%r13, 24(%[t])\n\t"
        "# A[0] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "# A[0] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "# A[0] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "# A[0] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "movq	%%r14, 32(%[t])\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "# A[0] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "# A[0] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "# A[0] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "# A[0] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "# A[0] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "# A[0] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "# A[0] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "# A[0] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "# A[0] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "# A[0] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "# A[0] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "# A[0] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "# A[0] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "# A[0] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "# A[0] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "# A[0] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adcxq	%%r15, %%r10\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	8(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	8(%[t]), %%r11\n\t"
        "movq	16(%[t]), %%r12\n\t"
        "movq	24(%[t]), %%r13\n\t"
        "movq	32(%[t]), %%r14\n\t"
        "movq	40(%[t]), %%rax\n\t"
        "# A[1] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[1] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[1] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[1] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 8(%[t])\n\t"
        "movq	%%r12, 16(%[t])\n\t"
        "movq	%%r13, 24(%[t])\n\t"
        "movq	%%r14, 32(%[t])\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "# A[1] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[1] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[1] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[1] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "# A[1] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[1] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[1] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[1] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "# A[1] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[1] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[1] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[1] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "# A[1] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[1] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[1] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[1] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "# A[1] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[1] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[1] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[1] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	16(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	16(%[t]), %%r12\n\t"
        "movq	24(%[t]), %%r13\n\t"
        "movq	32(%[t]), %%r14\n\t"
        "movq	40(%[t]), %%rax\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "# A[2] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[2] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[2] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[2] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 16(%[t])\n\t"
        "movq	%%r13, 24(%[t])\n\t"
        "movq	%%r14, 32(%[t])\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "# A[2] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[2] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[2] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[2] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "# A[2] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[2] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[2] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[2] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "# A[2] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[2] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[2] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[2] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "# A[2] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[2] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[2] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[2] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "# A[2] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[2] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[2] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[2] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "adcxq	%%rcx, %%r12\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	24(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	24(%[t]), %%r13\n\t"
        "movq	32(%[t]), %%r14\n\t"
        "movq	40(%[t]), %%rax\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "# A[3] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[3] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[3] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[3] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 24(%[t])\n\t"
        "movq	%%r14, 32(%[t])\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "# A[3] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[3] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[3] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[3] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "# A[3] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[3] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[3] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[3] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "# A[3] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[3] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[3] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[3] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "# A[3] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[3] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[3] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[3] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "# A[3] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[3] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[3] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[3] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "adcxq	%%rcx, %%r13\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	32(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	32(%[t]), %%r14\n\t"
        "movq	40(%[t]), %%rax\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "# A[4] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[4] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[4] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[4] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 32(%[t])\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "# A[4] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[4] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[4] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[4] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "# A[4] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[4] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[4] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[4] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "# A[4] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[4] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[4] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[4] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "# A[4] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[4] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[4] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[4] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "# A[4] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[4] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[4] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[4] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "adcxq	%%rcx, %%r14\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	40(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	40(%[t]), %%rax\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "# A[5] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[5] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[5] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[5] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "# A[5] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[5] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[5] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[5] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "# A[5] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[5] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[5] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[5] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "# A[5] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[5] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[5] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[5] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "# A[5] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[5] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[5] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[5] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "# A[5] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[5] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[5] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[5] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "adcxq	%%rcx, %%rax\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	48(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "# A[6] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[6] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[6] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[6] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "# A[6] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[6] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[6] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[6] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "# A[6] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[6] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[6] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[6] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "# A[6] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[6] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[6] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[6] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "# A[6] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[6] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[6] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[6] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "# A[6] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[6] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[6] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[6] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "adcxq	%%rcx, %%r10\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	56(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "# A[7] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[7] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[7] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[7] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "# A[7] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[7] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[7] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[7] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "# A[7] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[7] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[7] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[7] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "# A[7] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[7] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[7] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[7] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "# A[7] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[7] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[7] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[7] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "# A[7] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[7] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[7] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[7] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	64(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "# A[8] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[8] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[8] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[8] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "# A[8] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[8] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[8] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[8] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "# A[8] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[8] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[8] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[8] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "# A[8] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[8] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[8] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[8] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "# A[8] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[8] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[8] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[8] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "# A[8] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[8] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[8] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[8] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "adcxq	%%rcx, %%r12\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	72(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "# A[9] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[9] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[9] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[9] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "# A[9] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[9] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[9] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[9] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "# A[9] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[9] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[9] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[9] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "# A[9] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[9] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[9] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[9] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "# A[9] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[9] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[9] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[9] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "# A[9] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[9] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[9] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[9] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "adcxq	%%rcx, %%r13\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	80(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "# A[10] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[10] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[10] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[10] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "# A[10] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[10] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[10] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[10] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "# A[10] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[10] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[10] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[10] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "# A[10] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[10] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[10] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[10] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "# A[10] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[10] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[10] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[10] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "# A[10] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[10] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[10] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[10] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "adcxq	%%rcx, %%r14\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	88(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "# A[11] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[11] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[11] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[11] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "# A[11] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[11] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[11] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[11] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "# A[11] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[11] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[11] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[11] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "# A[11] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[11] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[11] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[11] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "# A[11] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[11] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[11] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[11] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "# A[11] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[11] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[11] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[11] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "adcxq	%%rcx, %%rax\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	96(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "# A[12] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[12] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[12] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[12] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "# A[12] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[12] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[12] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[12] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "# A[12] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[12] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[12] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[12] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "# A[12] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[12] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[12] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[12] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "# A[12] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[12] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[12] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[12] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "# A[12] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[12] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[12] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[12] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "adcxq	%%rcx, %%r10\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	104(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "# A[13] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[13] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[13] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[13] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "# A[13] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[13] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[13] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[13] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "# A[13] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[13] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[13] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[13] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "# A[13] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[13] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[13] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[13] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "# A[13] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[13] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[13] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[13] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "movq	288(%[t]), %%r10\n\t"
        "# A[13] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[13] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[13] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[13] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	%%r11, 296(%[t])\n\t"
        "movq	112(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "# A[14] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[14] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[14] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[14] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "# A[14] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[14] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[14] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[14] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "# A[14] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[14] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[14] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[14] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "# A[14] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[14] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[14] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[14] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "# A[14] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[14] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[14] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[14] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "movq	288(%[t]), %%r10\n\t"
        "movq	296(%[t]), %%r11\n\t"
        "# A[14] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[14] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[14] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[14] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "adcxq	%%rcx, %%r12\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	%%r11, 296(%[t])\n\t"
        "movq	%%r12, 304(%[t])\n\t"
        "movq	120(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "# A[15] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[15] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[15] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[15] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "# A[15] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[15] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[15] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[15] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "# A[15] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[15] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[15] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[15] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "# A[15] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[15] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[15] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[15] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "# A[15] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[15] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[15] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[15] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	288(%[t]), %%r10\n\t"
        "movq	296(%[t]), %%r11\n\t"
        "movq	304(%[t]), %%r12\n\t"
        "# A[15] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[15] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[15] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[15] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "adcxq	%%rcx, %%r13\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	%%r11, 296(%[t])\n\t"
        "movq	%%r12, 304(%[t])\n\t"
        "movq	%%r13, 312(%[t])\n\t"
        "movq	128(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "# A[16] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[16] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[16] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[16] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "# A[16] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[16] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[16] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[16] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "# A[16] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[16] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[16] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[16] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "# A[16] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[16] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[16] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[16] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "movq	288(%[t]), %%r10\n\t"
        "# A[16] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[16] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[16] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[16] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	296(%[t]), %%r11\n\t"
        "movq	304(%[t]), %%r12\n\t"
        "movq	312(%[t]), %%r13\n\t"
        "# A[16] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[16] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[16] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[16] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "adcxq	%%rcx, %%r14\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	%%r11, 296(%[t])\n\t"
        "movq	%%r12, 304(%[t])\n\t"
        "movq	%%r13, 312(%[t])\n\t"
        "movq	%%r14, 320(%[t])\n\t"
        "movq	136(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "# A[17] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[17] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[17] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[17] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "# A[17] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[17] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[17] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[17] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "# A[17] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[17] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[17] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[17] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "# A[17] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[17] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[17] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[17] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "movq	288(%[t]), %%r10\n\t"
        "movq	296(%[t]), %%r11\n\t"
        "# A[17] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[17] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[17] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[17] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	304(%[t]), %%r12\n\t"
        "movq	312(%[t]), %%r13\n\t"
        "movq	320(%[t]), %%r14\n\t"
        "# A[17] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[17] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[17] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[17] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "adcxq	%%rcx, %%rax\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r11, 296(%[t])\n\t"
        "movq	%%r12, 304(%[t])\n\t"
        "movq	%%r13, 312(%[t])\n\t"
        "movq	%%r14, 320(%[t])\n\t"
        "movq	%%rax, 328(%[t])\n\t"
        "movq	144(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "# A[18] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[18] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[18] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[18] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "# A[18] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[18] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[18] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[18] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "# A[18] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[18] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[18] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[18] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "# A[18] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[18] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[18] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[18] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "movq	288(%[t]), %%r10\n\t"
        "movq	296(%[t]), %%r11\n\t"
        "movq	304(%[t]), %%r12\n\t"
        "# A[18] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[18] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[18] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[18] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	%%r11, 296(%[t])\n\t"
        "movq	312(%[t]), %%r13\n\t"
        "movq	320(%[t]), %%r14\n\t"
        "movq	328(%[t]), %%rax\n\t"
        "# A[18] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[18] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[18] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[18] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "adcxq	%%rcx, %%r10\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r12, 304(%[t])\n\t"
        "movq	%%r13, 312(%[t])\n\t"
        "movq	%%r14, 320(%[t])\n\t"
        "movq	%%rax, 328(%[t])\n\t"
        "movq	%%r10, 336(%[t])\n\t"
        "movq	152(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "# A[19] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[19] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[19] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[19] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "# A[19] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[19] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[19] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[19] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "# A[19] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[19] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[19] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[19] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "# A[19] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[19] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[19] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[19] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	288(%[t]), %%r10\n\t"
        "movq	296(%[t]), %%r11\n\t"
        "movq	304(%[t]), %%r12\n\t"
        "movq	312(%[t]), %%r13\n\t"
        "# A[19] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[19] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[19] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[19] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	%%r11, 296(%[t])\n\t"
        "movq	%%r12, 304(%[t])\n\t"
        "movq	320(%[t]), %%r14\n\t"
        "movq	328(%[t]), %%rax\n\t"
        "movq	336(%[t]), %%r10\n\t"
        "# A[19] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[19] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[19] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[19] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r13, 312(%[t])\n\t"
        "movq	%%r14, 320(%[t])\n\t"
        "movq	%%rax, 328(%[t])\n\t"
        "movq	%%r10, 336(%[t])\n\t"
        "movq	%%r11, 344(%[t])\n\t"
        "movq	160(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "# A[20] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[20] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[20] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[20] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "# A[20] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[20] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[20] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[20] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "# A[20] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[20] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[20] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[20] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "movq	288(%[t]), %%r10\n\t"
        "# A[20] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[20] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[20] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[20] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	296(%[t]), %%r11\n\t"
        "movq	304(%[t]), %%r12\n\t"
        "movq	312(%[t]), %%r13\n\t"
        "movq	320(%[t]), %%r14\n\t"
        "# A[20] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[20] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[20] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[20] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	%%r11, 296(%[t])\n\t"
        "movq	%%r12, 304(%[t])\n\t"
        "movq	%%r13, 312(%[t])\n\t"
        "movq	328(%[t]), %%rax\n\t"
        "movq	336(%[t]), %%r10\n\t"
        "movq	344(%[t]), %%r11\n\t"
        "# A[20] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[20] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[20] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[20] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "adcxq	%%rcx, %%r12\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r14, 320(%[t])\n\t"
        "movq	%%rax, 328(%[t])\n\t"
        "movq	%%r10, 336(%[t])\n\t"
        "movq	%%r11, 344(%[t])\n\t"
        "movq	%%r12, 352(%[t])\n\t"
        "movq	168(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "# A[21] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[21] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[21] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[21] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "# A[21] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[21] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[21] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[21] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "# A[21] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[21] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[21] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[21] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "movq	288(%[t]), %%r10\n\t"
        "movq	296(%[t]), %%r11\n\t"
        "# A[21] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[21] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[21] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[21] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	304(%[t]), %%r12\n\t"
        "movq	312(%[t]), %%r13\n\t"
        "movq	320(%[t]), %%r14\n\t"
        "movq	328(%[t]), %%rax\n\t"
        "# A[21] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[21] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[21] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[21] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 296(%[t])\n\t"
        "movq	%%r12, 304(%[t])\n\t"
        "movq	%%r13, 312(%[t])\n\t"
        "movq	%%r14, 320(%[t])\n\t"
        "movq	336(%[t]), %%r10\n\t"
        "movq	344(%[t]), %%r11\n\t"
        "movq	352(%[t]), %%r12\n\t"
        "# A[21] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[21] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[21] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[21] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "adcxq	%%rcx, %%r13\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%rax, 328(%[t])\n\t"
        "movq	%%r10, 336(%[t])\n\t"
        "movq	%%r11, 344(%[t])\n\t"
        "movq	%%r12, 352(%[t])\n\t"
        "movq	%%r13, 360(%[t])\n\t"
        "movq	176(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "# A[22] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[22] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[22] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[22] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "# A[22] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[22] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[22] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[22] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "# A[22] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[22] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[22] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[22] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "movq	288(%[t]), %%r10\n\t"
        "movq	296(%[t]), %%r11\n\t"
        "movq	304(%[t]), %%r12\n\t"
        "# A[22] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[22] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[22] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[22] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	%%r11, 296(%[t])\n\t"
        "movq	312(%[t]), %%r13\n\t"
        "movq	320(%[t]), %%r14\n\t"
        "movq	328(%[t]), %%rax\n\t"
        "movq	336(%[t]), %%r10\n\t"
        "# A[22] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[22] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[22] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[22] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 304(%[t])\n\t"
        "movq	%%r13, 312(%[t])\n\t"
        "movq	%%r14, 320(%[t])\n\t"
        "movq	%%rax, 328(%[t])\n\t"
        "movq	344(%[t]), %%r11\n\t"
        "movq	352(%[t]), %%r12\n\t"
        "movq	360(%[t]), %%r13\n\t"
        "# A[22] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[22] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[22] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[22] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "adcxq	%%rcx, %%r14\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r10, 336(%[t])\n\t"
        "movq	%%r11, 344(%[t])\n\t"
        "movq	%%r12, 352(%[t])\n\t"
        "movq	%%r13, 360(%[t])\n\t"
        "movq	%%r14, 368(%[t])\n\t"
        "movq	184(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "# A[23] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[23] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[23] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[23] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "# A[23] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[23] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[23] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[23] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "# A[23] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[23] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[23] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[23] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	288(%[t]), %%r10\n\t"
        "movq	296(%[t]), %%r11\n\t"
        "movq	304(%[t]), %%r12\n\t"
        "movq	312(%[t]), %%r13\n\t"
        "# A[23] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[23] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[23] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[23] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	%%r11, 296(%[t])\n\t"
        "movq	%%r12, 304(%[t])\n\t"
        "movq	320(%[t]), %%r14\n\t"
        "movq	328(%[t]), %%rax\n\t"
        "movq	336(%[t]), %%r10\n\t"
        "movq	344(%[t]), %%r11\n\t"
        "# A[23] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[23] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[23] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[23] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 312(%[t])\n\t"
        "movq	%%r14, 320(%[t])\n\t"
        "movq	%%rax, 328(%[t])\n\t"
        "movq	%%r10, 336(%[t])\n\t"
        "movq	352(%[t]), %%r12\n\t"
        "movq	360(%[t]), %%r13\n\t"
        "movq	368(%[t]), %%r14\n\t"
        "# A[23] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[23] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[23] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[23] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "adcxq	%%rcx, %%rax\n\t"
        "movq	%%r11, 344(%[t])\n\t"
        "movq	%%r12, 352(%[t])\n\t"
        "movq	%%r13, 360(%[t])\n\t"
        "movq	%%r14, 368(%[t])\n\t"
        "movq	%%rax, 376(%[t])\n\t"
        :
        : [a] "r" (a), [b] "r" (b), [t] "r" (tmp)
        : "memory", "rax", "rdx", "rcx",
          "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
    );

    XMEMCPY(r, tmp, sizeof(tmp));
}

/* Square a and put result in r. (r = a * a)
 *
 * r  A single precision integer.
 * a  A single precision integer.
 */
static void sp_3072_sqr_avx2_24(sp_digit* r, const sp_digit* a)
{
    sp_digit tmp[48];

    __asm__ __volatile__ (
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 1\n\t"
        "xorq	%%r10, %%r10\n\t"
        "xorq	%%r11, %%r11\n\t"
        "xorq	%%r12, %%r12\n\t"
        "xorq	%%r13, %%r13\n\t"
        "xorq	%%r14, %%r14\n\t"
        "xorq	%%r15, %%r15\n\t"
        "# A[1] x A[0]\n\t"
        "movq	0(%[a]), %%rdx\n\t"
        "mulxq	8(%[a]), %%r10, %%r11\n\t"
        "# A[2] x A[0]\n\t"
        "mulxq	16(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[3] x A[0]\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[4] x A[0]\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[5] x A[0]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r10, 8(%[tmp])\n\t"
        "movq	%%r11, 16(%[tmp])\n\t"
        "movq	%%r12, 24(%[tmp])\n\t"
        "movq	%%r13, 32(%[tmp])\n\t"
        "movq	%%r14, 40(%[tmp])\n\t"
        "movq	%%r8, %%r10\n\t"
        "movq	%%r8, %%r11\n\t"
        "movq	%%r8, %%r12\n\t"
        "movq	%%r8, %%r13\n\t"
        "movq	%%r8, %%r14\n\t"
        "# A[6] x A[0]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[7] x A[0]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[8] x A[0]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[9] x A[0]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[10] x A[0]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r15, 48(%[tmp])\n\t"
        "movq	%%r10, 56(%[tmp])\n\t"
        "movq	%%r11, 64(%[tmp])\n\t"
        "movq	%%r12, 72(%[tmp])\n\t"
        "movq	%%r13, 80(%[tmp])\n\t"
        "movq	%%r8, %%r15\n\t"
        "movq	%%r8, %%r10\n\t"
        "movq	%%r8, %%r11\n\t"
        "movq	%%r8, %%r12\n\t"
        "movq	%%r8, %%r13\n\t"
        "# A[11] x A[0]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[12] x A[0]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[13] x A[0]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[14] x A[0]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[15] x A[0]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r14, 88(%[tmp])\n\t"
        "movq	%%r15, 96(%[tmp])\n\t"
        "movq	%%r10, 104(%[tmp])\n\t"
        "movq	%%r11, 112(%[tmp])\n\t"
        "movq	%%r12, 120(%[tmp])\n\t"
        "movq	%%r8, %%r14\n\t"
        "movq	%%r8, %%r15\n\t"
        "movq	%%r8, %%r10\n\t"
        "movq	%%r8, %%r11\n\t"
        "movq	%%r8, %%r12\n\t"
        "# A[16] x A[0]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[17] x A[0]\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[18] x A[0]\n\t"
        "mulxq	144(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[19] x A[0]\n\t"
        "mulxq	152(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[20] x A[0]\n\t"
        "mulxq	160(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r13, 128(%[tmp])\n\t"
        "movq	%%r14, 136(%[tmp])\n\t"
        "movq	%%r15, 144(%[tmp])\n\t"
        "movq	%%r10, 152(%[tmp])\n\t"
        "movq	%%r11, 160(%[tmp])\n\t"
        "movq	%%r8, %%r13\n\t"
        "movq	%%r8, %%r14\n\t"
        "movq	%%r8, %%r15\n\t"
        "# A[21] x A[0]\n\t"
        "mulxq	168(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[22] x A[0]\n\t"
        "mulxq	176(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[23] x A[0]\n\t"
        "mulxq	184(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r12, 168(%[tmp])\n\t"
        "movq	%%r13, 176(%[tmp])\n\t"
        "movq	%%r14, 184(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r8, %%r15\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r15, 192(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 2\n\t"
        "movq	24(%[tmp]), %%r15\n\t"
        "movq	32(%[tmp]), %%r10\n\t"
        "movq	40(%[tmp]), %%r11\n\t"
        "movq	48(%[tmp]), %%r12\n\t"
        "movq	56(%[tmp]), %%r13\n\t"
        "movq	64(%[tmp]), %%r14\n\t"
        "# A[2] x A[1]\n\t"
        "movq	8(%[a]), %%rdx\n\t"
        "mulxq	16(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[3] x A[1]\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[4] x A[1]\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[5] x A[1]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[6] x A[1]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r15, 24(%[tmp])\n\t"
        "movq	%%r10, 32(%[tmp])\n\t"
        "movq	%%r11, 40(%[tmp])\n\t"
        "movq	%%r12, 48(%[tmp])\n\t"
        "movq	%%r13, 56(%[tmp])\n\t"
        "movq	72(%[tmp]), %%r15\n\t"
        "movq	80(%[tmp]), %%r10\n\t"
        "movq	88(%[tmp]), %%r11\n\t"
        "movq	96(%[tmp]), %%r12\n\t"
        "movq	104(%[tmp]), %%r13\n\t"
        "# A[7] x A[1]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[8] x A[1]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[9] x A[1]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[10] x A[1]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[11] x A[1]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r14, 64(%[tmp])\n\t"
        "movq	%%r15, 72(%[tmp])\n\t"
        "movq	%%r10, 80(%[tmp])\n\t"
        "movq	%%r11, 88(%[tmp])\n\t"
        "movq	%%r12, 96(%[tmp])\n\t"
        "movq	112(%[tmp]), %%r14\n\t"
        "movq	120(%[tmp]), %%r15\n\t"
        "movq	128(%[tmp]), %%r10\n\t"
        "movq	136(%[tmp]), %%r11\n\t"
        "movq	144(%[tmp]), %%r12\n\t"
        "# A[12] x A[1]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[13] x A[1]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[14] x A[1]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[15] x A[1]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[16] x A[1]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r13, 104(%[tmp])\n\t"
        "movq	%%r14, 112(%[tmp])\n\t"
        "movq	%%r15, 120(%[tmp])\n\t"
        "movq	%%r10, 128(%[tmp])\n\t"
        "movq	%%r11, 136(%[tmp])\n\t"
        "movq	152(%[tmp]), %%r13\n\t"
        "movq	160(%[tmp]), %%r14\n\t"
        "movq	168(%[tmp]), %%r15\n\t"
        "movq	176(%[tmp]), %%r10\n\t"
        "movq	184(%[tmp]), %%r11\n\t"
        "# A[17] x A[1]\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[18] x A[1]\n\t"
        "mulxq	144(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[19] x A[1]\n\t"
        "mulxq	152(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[20] x A[1]\n\t"
        "mulxq	160(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[21] x A[1]\n\t"
        "mulxq	168(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "movq	%%r12, 144(%[tmp])\n\t"
        "movq	%%r13, 152(%[tmp])\n\t"
        "movq	%%r14, 160(%[tmp])\n\t"
        "movq	%%r15, 168(%[tmp])\n\t"
        "movq	%%r10, 176(%[tmp])\n\t"
        "movq	192(%[tmp]), %%r12\n\t"
        "movq	%%r8, %%r13\n\t"
        "movq	%%r8, %%r14\n\t"
        "# A[22] x A[1]\n\t"
        "mulxq	176(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[23] x A[1]\n\t"
        "mulxq	184(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[23] x A[2]\n\t"
        "movq	16(%[a]), %%rdx\n\t"
        "mulxq	184(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r11, 184(%[tmp])\n\t"
        "movq	%%r12, 192(%[tmp])\n\t"
        "movq	%%r13, 200(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r14\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r14, 208(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 3\n\t"
        "movq	40(%[tmp]), %%r14\n\t"
        "movq	48(%[tmp]), %%r15\n\t"
        "movq	56(%[tmp]), %%r10\n\t"
        "movq	64(%[tmp]), %%r11\n\t"
        "movq	72(%[tmp]), %%r12\n\t"
        "movq	80(%[tmp]), %%r13\n\t"
        "# A[3] x A[2]\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[4] x A[2]\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[5] x A[2]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[6] x A[2]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[7] x A[2]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r14, 40(%[tmp])\n\t"
        "movq	%%r15, 48(%[tmp])\n\t"
        "movq	%%r10, 56(%[tmp])\n\t"
        "movq	%%r11, 64(%[tmp])\n\t"
        "movq	%%r12, 72(%[tmp])\n\t"
        "movq	88(%[tmp]), %%r14\n\t"
        "movq	96(%[tmp]), %%r15\n\t"
        "movq	104(%[tmp]), %%r10\n\t"
        "movq	112(%[tmp]), %%r11\n\t"
        "movq	120(%[tmp]), %%r12\n\t"
        "# A[8] x A[2]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[9] x A[2]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[10] x A[2]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[11] x A[2]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[12] x A[2]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r13, 80(%[tmp])\n\t"
        "movq	%%r14, 88(%[tmp])\n\t"
        "movq	%%r15, 96(%[tmp])\n\t"
        "movq	%%r10, 104(%[tmp])\n\t"
        "movq	%%r11, 112(%[tmp])\n\t"
        "movq	128(%[tmp]), %%r13\n\t"
        "movq	136(%[tmp]), %%r14\n\t"
        "movq	144(%[tmp]), %%r15\n\t"
        "movq	152(%[tmp]), %%r10\n\t"
        "movq	160(%[tmp]), %%r11\n\t"
        "# A[13] x A[2]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[14] x A[2]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[15] x A[2]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[16] x A[2]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[17] x A[2]\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "movq	%%r12, 120(%[tmp])\n\t"
        "movq	%%r13, 128(%[tmp])\n\t"
        "movq	%%r14, 136(%[tmp])\n\t"
        "movq	%%r15, 144(%[tmp])\n\t"
        "movq	%%r10, 152(%[tmp])\n\t"
        "movq	168(%[tmp]), %%r12\n\t"
        "movq	176(%[tmp]), %%r13\n\t"
        "movq	184(%[tmp]), %%r14\n\t"
        "movq	192(%[tmp]), %%r15\n\t"
        "movq	200(%[tmp]), %%r10\n\t"
        "# A[18] x A[2]\n\t"
        "mulxq	144(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[19] x A[2]\n\t"
        "mulxq	152(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[20] x A[2]\n\t"
        "mulxq	160(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[21] x A[2]\n\t"
        "mulxq	168(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[22] x A[2]\n\t"
        "mulxq	176(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "movq	%%r11, 160(%[tmp])\n\t"
        "movq	%%r12, 168(%[tmp])\n\t"
        "movq	%%r13, 176(%[tmp])\n\t"
        "movq	%%r14, 184(%[tmp])\n\t"
        "movq	%%r15, 192(%[tmp])\n\t"
        "movq	208(%[tmp]), %%r11\n\t"
        "movq	%%r8, %%r12\n\t"
        "movq	%%r8, %%r13\n\t"
        "# A[22] x A[3]\n\t"
        "movq	176(%[a]), %%rdx\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[22] x A[4]\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[22] x A[5]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r10, 200(%[tmp])\n\t"
        "movq	%%r11, 208(%[tmp])\n\t"
        "movq	%%r12, 216(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r13\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r13, 224(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 4\n\t"
        "movq	56(%[tmp]), %%r13\n\t"
        "movq	64(%[tmp]), %%r14\n\t"
        "movq	72(%[tmp]), %%r15\n\t"
        "movq	80(%[tmp]), %%r10\n\t"
        "movq	88(%[tmp]), %%r11\n\t"
        "movq	96(%[tmp]), %%r12\n\t"
        "# A[4] x A[3]\n\t"
        "movq	24(%[a]), %%rdx\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[5] x A[3]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[6] x A[3]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[7] x A[3]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[8] x A[3]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r13, 56(%[tmp])\n\t"
        "movq	%%r14, 64(%[tmp])\n\t"
        "movq	%%r15, 72(%[tmp])\n\t"
        "movq	%%r10, 80(%[tmp])\n\t"
        "movq	%%r11, 88(%[tmp])\n\t"
        "movq	104(%[tmp]), %%r13\n\t"
        "movq	112(%[tmp]), %%r14\n\t"
        "movq	120(%[tmp]), %%r15\n\t"
        "movq	128(%[tmp]), %%r10\n\t"
        "movq	136(%[tmp]), %%r11\n\t"
        "# A[9] x A[3]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[10] x A[3]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[11] x A[3]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[12] x A[3]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[13] x A[3]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "movq	%%r12, 96(%[tmp])\n\t"
        "movq	%%r13, 104(%[tmp])\n\t"
        "movq	%%r14, 112(%[tmp])\n\t"
        "movq	%%r15, 120(%[tmp])\n\t"
        "movq	%%r10, 128(%[tmp])\n\t"
        "movq	144(%[tmp]), %%r12\n\t"
        "movq	152(%[tmp]), %%r13\n\t"
        "movq	160(%[tmp]), %%r14\n\t"
        "movq	168(%[tmp]), %%r15\n\t"
        "movq	176(%[tmp]), %%r10\n\t"
        "# A[14] x A[3]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[15] x A[3]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[16] x A[3]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[17] x A[3]\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[18] x A[3]\n\t"
        "mulxq	144(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "movq	%%r11, 136(%[tmp])\n\t"
        "movq	%%r12, 144(%[tmp])\n\t"
        "movq	%%r13, 152(%[tmp])\n\t"
        "movq	%%r14, 160(%[tmp])\n\t"
        "movq	%%r15, 168(%[tmp])\n\t"
        "movq	184(%[tmp]), %%r11\n\t"
        "movq	192(%[tmp]), %%r12\n\t"
        "movq	200(%[tmp]), %%r13\n\t"
        "movq	208(%[tmp]), %%r14\n\t"
        "movq	216(%[tmp]), %%r15\n\t"
        "# A[19] x A[3]\n\t"
        "mulxq	152(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[20] x A[3]\n\t"
        "mulxq	160(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[21] x A[3]\n\t"
        "mulxq	168(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[21] x A[4]\n\t"
        "movq	168(%[a]), %%rdx\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[21] x A[5]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r10, 176(%[tmp])\n\t"
        "movq	%%r11, 184(%[tmp])\n\t"
        "movq	%%r12, 192(%[tmp])\n\t"
        "movq	%%r13, 200(%[tmp])\n\t"
        "movq	%%r14, 208(%[tmp])\n\t"
        "movq	224(%[tmp]), %%r10\n\t"
        "movq	%%r8, %%r11\n\t"
        "movq	%%r8, %%r12\n\t"
        "# A[21] x A[6]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[21] x A[7]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[21] x A[8]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r15, 216(%[tmp])\n\t"
        "movq	%%r10, 224(%[tmp])\n\t"
        "movq	%%r11, 232(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r12\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r12, 240(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 5\n\t"
        "movq	72(%[tmp]), %%r12\n\t"
        "movq	80(%[tmp]), %%r13\n\t"
        "movq	88(%[tmp]), %%r14\n\t"
        "movq	96(%[tmp]), %%r15\n\t"
        "movq	104(%[tmp]), %%r10\n\t"
        "movq	112(%[tmp]), %%r11\n\t"
        "# A[5] x A[4]\n\t"
        "movq	32(%[a]), %%rdx\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[6] x A[4]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[7] x A[4]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[8] x A[4]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[9] x A[4]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "movq	%%r12, 72(%[tmp])\n\t"
        "movq	%%r13, 80(%[tmp])\n\t"
        "movq	%%r14, 88(%[tmp])\n\t"
        "movq	%%r15, 96(%[tmp])\n\t"
        "movq	%%r10, 104(%[tmp])\n\t"
        "movq	120(%[tmp]), %%r12\n\t"
        "movq	128(%[tmp]), %%r13\n\t"
        "movq	136(%[tmp]), %%r14\n\t"
        "movq	144(%[tmp]), %%r15\n\t"
        "movq	152(%[tmp]), %%r10\n\t"
        "# A[10] x A[4]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[11] x A[4]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[12] x A[4]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[13] x A[4]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[14] x A[4]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "movq	%%r11, 112(%[tmp])\n\t"
        "movq	%%r12, 120(%[tmp])\n\t"
        "movq	%%r13, 128(%[tmp])\n\t"
        "movq	%%r14, 136(%[tmp])\n\t"
        "movq	%%r15, 144(%[tmp])\n\t"
        "movq	160(%[tmp]), %%r11\n\t"
        "movq	168(%[tmp]), %%r12\n\t"
        "movq	176(%[tmp]), %%r13\n\t"
        "movq	184(%[tmp]), %%r14\n\t"
        "movq	192(%[tmp]), %%r15\n\t"
        "# A[15] x A[4]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[16] x A[4]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[17] x A[4]\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[18] x A[4]\n\t"
        "mulxq	144(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[19] x A[4]\n\t"
        "mulxq	152(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r10, 152(%[tmp])\n\t"
        "movq	%%r11, 160(%[tmp])\n\t"
        "movq	%%r12, 168(%[tmp])\n\t"
        "movq	%%r13, 176(%[tmp])\n\t"
        "movq	%%r14, 184(%[tmp])\n\t"
        "movq	200(%[tmp]), %%r10\n\t"
        "movq	208(%[tmp]), %%r11\n\t"
        "movq	216(%[tmp]), %%r12\n\t"
        "movq	224(%[tmp]), %%r13\n\t"
        "movq	232(%[tmp]), %%r14\n\t"
        "# A[20] x A[4]\n\t"
        "mulxq	160(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[20] x A[5]\n\t"
        "movq	160(%[a]), %%rdx\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[20] x A[6]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[20] x A[7]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[20] x A[8]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r15, 192(%[tmp])\n\t"
        "movq	%%r10, 200(%[tmp])\n\t"
        "movq	%%r11, 208(%[tmp])\n\t"
        "movq	%%r12, 216(%[tmp])\n\t"
        "movq	%%r13, 224(%[tmp])\n\t"
        "movq	240(%[tmp]), %%r15\n\t"
        "movq	%%r8, %%r10\n\t"
        "movq	%%r8, %%r11\n\t"
        "# A[20] x A[9]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[20] x A[10]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[20] x A[11]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "movq	%%r14, 232(%[tmp])\n\t"
        "movq	%%r15, 240(%[tmp])\n\t"
        "movq	%%r10, 248(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r11\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r11, 256(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 6\n\t"
        "movq	88(%[tmp]), %%r11\n\t"
        "movq	96(%[tmp]), %%r12\n\t"
        "movq	104(%[tmp]), %%r13\n\t"
        "movq	112(%[tmp]), %%r14\n\t"
        "movq	120(%[tmp]), %%r15\n\t"
        "movq	128(%[tmp]), %%r10\n\t"
        "# A[6] x A[5]\n\t"
        "movq	40(%[a]), %%rdx\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[7] x A[5]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[8] x A[5]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[9] x A[5]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[10] x A[5]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "movq	%%r11, 88(%[tmp])\n\t"
        "movq	%%r12, 96(%[tmp])\n\t"
        "movq	%%r13, 104(%[tmp])\n\t"
        "movq	%%r14, 112(%[tmp])\n\t"
        "movq	%%r15, 120(%[tmp])\n\t"
        "movq	136(%[tmp]), %%r11\n\t"
        "movq	144(%[tmp]), %%r12\n\t"
        "movq	152(%[tmp]), %%r13\n\t"
        "movq	160(%[tmp]), %%r14\n\t"
        "movq	168(%[tmp]), %%r15\n\t"
        "# A[11] x A[5]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[12] x A[5]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[13] x A[5]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[14] x A[5]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[15] x A[5]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r10, 128(%[tmp])\n\t"
        "movq	%%r11, 136(%[tmp])\n\t"
        "movq	%%r12, 144(%[tmp])\n\t"
        "movq	%%r13, 152(%[tmp])\n\t"
        "movq	%%r14, 160(%[tmp])\n\t"
        "movq	176(%[tmp]), %%r10\n\t"
        "movq	184(%[tmp]), %%r11\n\t"
        "movq	192(%[tmp]), %%r12\n\t"
        "movq	200(%[tmp]), %%r13\n\t"
        "movq	208(%[tmp]), %%r14\n\t"
        "# A[16] x A[5]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[17] x A[5]\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[18] x A[5]\n\t"
        "mulxq	144(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[19] x A[5]\n\t"
        "mulxq	152(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[19] x A[6]\n\t"
        "movq	152(%[a]), %%rdx\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r15, 168(%[tmp])\n\t"
        "movq	%%r10, 176(%[tmp])\n\t"
        "movq	%%r11, 184(%[tmp])\n\t"
        "movq	%%r12, 192(%[tmp])\n\t"
        "movq	%%r13, 200(%[tmp])\n\t"
        "movq	216(%[tmp]), %%r15\n\t"
        "movq	224(%[tmp]), %%r10\n\t"
        "movq	232(%[tmp]), %%r11\n\t"
        "movq	240(%[tmp]), %%r12\n\t"
        "movq	248(%[tmp]), %%r13\n\t"
        "# A[19] x A[7]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[19] x A[8]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[19] x A[9]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[19] x A[10]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[19] x A[11]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r14, 208(%[tmp])\n\t"
        "movq	%%r15, 216(%[tmp])\n\t"
        "movq	%%r10, 224(%[tmp])\n\t"
        "movq	%%r11, 232(%[tmp])\n\t"
        "movq	%%r12, 240(%[tmp])\n\t"
        "movq	256(%[tmp]), %%r14\n\t"
        "movq	%%r8, %%r15\n\t"
        "movq	%%r8, %%r10\n\t"
        "# A[19] x A[12]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[19] x A[13]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[19] x A[14]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "movq	%%r13, 248(%[tmp])\n\t"
        "movq	%%r14, 256(%[tmp])\n\t"
        "movq	%%r15, 264(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r10\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r10, 272(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 7\n\t"
        "movq	104(%[tmp]), %%r10\n\t"
        "movq	112(%[tmp]), %%r11\n\t"
        "movq	120(%[tmp]), %%r12\n\t"
        "movq	128(%[tmp]), %%r13\n\t"
        "movq	136(%[tmp]), %%r14\n\t"
        "movq	144(%[tmp]), %%r15\n\t"
        "# A[7] x A[6]\n\t"
        "movq	48(%[a]), %%rdx\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[8] x A[6]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[9] x A[6]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[10] x A[6]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[11] x A[6]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r10, 104(%[tmp])\n\t"
        "movq	%%r11, 112(%[tmp])\n\t"
        "movq	%%r12, 120(%[tmp])\n\t"
        "movq	%%r13, 128(%[tmp])\n\t"
        "movq	%%r14, 136(%[tmp])\n\t"
        "movq	152(%[tmp]), %%r10\n\t"
        "movq	160(%[tmp]), %%r11\n\t"
        "movq	168(%[tmp]), %%r12\n\t"
        "movq	176(%[tmp]), %%r13\n\t"
        "movq	184(%[tmp]), %%r14\n\t"
        "# A[12] x A[6]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[13] x A[6]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[14] x A[6]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[15] x A[6]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[16] x A[6]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r15, 144(%[tmp])\n\t"
        "movq	%%r10, 152(%[tmp])\n\t"
        "movq	%%r11, 160(%[tmp])\n\t"
        "movq	%%r12, 168(%[tmp])\n\t"
        "movq	%%r13, 176(%[tmp])\n\t"
        "movq	192(%[tmp]), %%r15\n\t"
        "movq	200(%[tmp]), %%r10\n\t"
        "movq	208(%[tmp]), %%r11\n\t"
        "movq	216(%[tmp]), %%r12\n\t"
        "movq	224(%[tmp]), %%r13\n\t"
        "# A[17] x A[6]\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[18] x A[6]\n\t"
        "mulxq	144(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[18] x A[7]\n\t"
        "movq	144(%[a]), %%rdx\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[18] x A[8]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[18] x A[9]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r14, 184(%[tmp])\n\t"
        "movq	%%r15, 192(%[tmp])\n\t"
        "movq	%%r10, 200(%[tmp])\n\t"
        "movq	%%r11, 208(%[tmp])\n\t"
        "movq	%%r12, 216(%[tmp])\n\t"
        "movq	232(%[tmp]), %%r14\n\t"
        "movq	240(%[tmp]), %%r15\n\t"
        "movq	248(%[tmp]), %%r10\n\t"
        "movq	256(%[tmp]), %%r11\n\t"
        "movq	264(%[tmp]), %%r12\n\t"
        "# A[18] x A[10]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[18] x A[11]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[18] x A[12]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[18] x A[13]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[18] x A[14]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r13, 224(%[tmp])\n\t"
        "movq	%%r14, 232(%[tmp])\n\t"
        "movq	%%r15, 240(%[tmp])\n\t"
        "movq	%%r10, 248(%[tmp])\n\t"
        "movq	%%r11, 256(%[tmp])\n\t"
        "movq	272(%[tmp]), %%r13\n\t"
        "movq	%%r8, %%r14\n\t"
        "movq	%%r8, %%r15\n\t"
        "# A[18] x A[15]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[18] x A[16]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[18] x A[17]\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r12, 264(%[tmp])\n\t"
        "movq	%%r13, 272(%[tmp])\n\t"
        "movq	%%r14, 280(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r15\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r15, 288(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 8\n\t"
        "movq	120(%[tmp]), %%r15\n\t"
        "movq	128(%[tmp]), %%r10\n\t"
        "movq	136(%[tmp]), %%r11\n\t"
        "movq	144(%[tmp]), %%r12\n\t"
        "movq	152(%[tmp]), %%r13\n\t"
        "movq	160(%[tmp]), %%r14\n\t"
        "# A[8] x A[7]\n\t"
        "movq	56(%[a]), %%rdx\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[9] x A[7]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[10] x A[7]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[11] x A[7]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[12] x A[7]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r15, 120(%[tmp])\n\t"
        "movq	%%r10, 128(%[tmp])\n\t"
        "movq	%%r11, 136(%[tmp])\n\t"
        "movq	%%r12, 144(%[tmp])\n\t"
        "movq	%%r13, 152(%[tmp])\n\t"
        "movq	168(%[tmp]), %%r15\n\t"
        "movq	176(%[tmp]), %%r10\n\t"
        "movq	184(%[tmp]), %%r11\n\t"
        "movq	192(%[tmp]), %%r12\n\t"
        "movq	200(%[tmp]), %%r13\n\t"
        "# A[13] x A[7]\n\t"
