From 72e7f011b29998d8a3e15eb5b381ef962af5fe5b Mon Sep 17 00:00:00 2001
From: Karen Arutyunov <karen@codesynthesis.com>
Date: Fri, 5 Apr 2019 10:30:58 +0300
Subject: Upgrade to 8.0.15

---
 mysql/extra/yassl/taocrypt/src/integer.cpp | 3894 ----------------------------
 1 file changed, 3894 deletions(-)
 delete mode 100644 mysql/extra/yassl/taocrypt/src/integer.cpp

(limited to 'mysql/extra/yassl/taocrypt/src/integer.cpp')
diff --git a/mysql/extra/yassl/taocrypt/src/integer.cpp b/mysql/extra/yassl/taocrypt/src/integer.cpp
deleted file mode 100644
index 478a13c..0000000
--- a/mysql/extra/yassl/taocrypt/src/integer.cpp
+++ /dev/null
@@ -1,3894 +0,0 @@
-/* Copyright (c) 2005, 2012, Oracle and/or its affiliates. All rights reserved.
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; version 2 of the License.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */
-
-/* based on Wei Dai's integer.cpp from CryptoPP */
-
-#include "runtime.hpp"
-#include "integer.hpp"
-#include "modarith.hpp"
-#include "asn.hpp"
-
-
-
-#ifdef __DECCXX
-    #include <c_asm.h>  // for asm overflow assembly
-#endif
-
-#if defined(_M_X64) || defined(_M_IA64)
-    #include <intrin.h> 
-#pragma intrinsic(_umul128)
-#endif
-
-
-#ifdef __GNUC__
-    #include <signal.h>
-    #include <setjmp.h>
-#endif
-
-
-#ifdef SSE2_INTRINSICS_AVAILABLE
-    #ifdef __GNUC__
-        #include <xmmintrin.h>
-        #ifdef TAOCRYPT_MEMALIGN_AVAILABLE
-            #include <malloc.h>
-        #else
-            #include <stdlib.h>
-        #endif
-    #else
-        #include <emmintrin.h>
-    #endif
-#elif defined(_MSC_VER) && defined(_M_IX86)
-/*    #pragma message("You do not seem to have the Visual C++ Processor Pack ")
-     #pragma message("installed, so use of SSE2 intrinsics will be disabled.")
-*/
-    #pragma message("installed, so use of SSE2 intrinsics will be disabled.")
-#elif defined(__GNUC__) && defined(__i386__)
-/*   #warning You do not have GCC 3.3 or later, or did not specify the -msse2 \
-             compiler option. Use of SSE2 intrinsics will be disabled.
-*/
-#endif
-
-
-namespace TaoCrypt {
-
-
-#ifdef SSE2_INTRINSICS_AVAILABLE
-
-template <class T>
-CPP_TYPENAME AlignedAllocator<T>::pointer AlignedAllocator<T>::allocate(
-                                           size_type n, const void *)
-{
-    if (n > this->max_size())
-        return 0;
-    if (n == 0)
-        return 0;
-    if (n >= 4)
-    {
-        void* p;
-    #ifdef TAOCRYPT_MM_MALLOC_AVAILABLE
-        p = _mm_malloc(sizeof(T)*n, 16);
-    #elif defined(TAOCRYPT_MEMALIGN_AVAILABLE)
-        p = memalign(16, sizeof(T)*n);
-    #elif defined(TAOCRYPT_MALLOC_ALIGNMENT_IS_16)
-        p = malloc(sizeof(T)*n);
-    #else
-        p = (byte *)malloc(sizeof(T)*n + 8);
-        // assume malloc alignment is at least 8
-    #endif
-
-    #ifdef TAOCRYPT_NO_ALIGNED_ALLOC
-        m_pBlock = p;
-        if (!IsAlignedOn(p, 16))
-        {
-            p = (byte *)p + 8;
-        }
-    #endif
-
-        return (T*)p;
-    }
-    return NEW_TC T[n];
-}
-
-
-template <class T>
-void AlignedAllocator<T>::deallocate(void* p, size_type n)
-{
-    memset(p, 0, n*sizeof(T));
-    if (n >= 4)
-    {
-        #ifdef TAOCRYPT_MM_MALLOC_AVAILABLE
-            _mm_free(p);
-        #elif defined(TAOCRYPT_NO_ALIGNED_ALLOC)
-            free(m_pBlock);
-            m_pBlock = 0;
-        #else
-            free(p);
-        #endif
-    }
-    else
-        tcArrayDelete((T *)p);
-}
-
-#endif  // SSE2
-
-
-// ********  start of integer needs
-
-// start 5.2.1 adds DWord and Word ********
-
-// ********************************************************
-
-class DWord {
-public:
-DWord() {}
-
-#ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
-    explicit DWord(word low)
-    {
-        whole_ = low;
-    }
-#else
-    explicit DWord(word low)
-    {
-        halfs_.low = low;
-        halfs_.high = 0;
-    }
-#endif
-
-    DWord(word low, word high)
-    {
-        halfs_.low = low;
-        halfs_.high = high;
-    }
-
-    static DWord Multiply(word a, word b)
-    {
-        DWord r;
-
-        #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
-            r.whole_ = (dword)a * b;
-
-        #elif defined(_M_X64) || defined(_M_IA64)
-            r.halfs_.low = _umul128(a, b, &r.halfs_.high);
-
-        #elif defined(__ia64__)
-            r.halfs_.low = a*b;
-            __asm__("xmpy.hu %0=%1,%2" : "=f" (r.halfs_.high)
-                : "f" (a), "f" (b));
-
-        #elif defined(_ARCH_PPC64)
-            r.halfs_.low = a*b;
-            __asm__("mulhdu %0,%1,%2" : "=r" (r.halfs_.high)
-                : "r" (a), "r" (b) : "cc");
-
-        #elif defined(__x86_64__)
-            __asm__("mulq %3" : "=d" (r.halfs_.high), "=a" (r.halfs_.low) :
-                "a" (a), "rm" (b) : "cc");
-
-        #elif defined(__mips64)
-            __asm__("dmultu %2,%3" : "=h" (r.halfs_.high), "=l" (r.halfs_.low)
-                : "r" (a), "r" (b));
-
-        #elif defined(_M_IX86)
-            // for testing
-            word64 t = (word64)a * b;
-            r.halfs_.high = ((word32 *)(&t))[1];
-            r.halfs_.low = (word32)t;
-        #else
-            #error can not implement DWord
-        #endif
-
-        return r;
-    }
-
-    static DWord MultiplyAndAdd(word a, word b, word c)
-    {
-        DWord r = Multiply(a, b);
-        return r += c;
-    }
-
-    DWord & operator+=(word a)
-    {
-        #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
-            whole_ = whole_ + a;
-        #else
-            halfs_.low += a;
-            halfs_.high += (halfs_.low < a);
-        #endif
-        return *this;
-    }
-
-    DWord operator+(word a)
-    {
-        DWord r;
-        #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
-            r.whole_ = whole_ + a;
-        #else
-            r.halfs_.low = halfs_.low + a;
-            r.halfs_.high = halfs_.high + (r.halfs_.low < a);
-        #endif
-        return r;
-    }
-
-    DWord operator-(DWord a)
-    {
-        DWord r;
-        #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
-            r.whole_ = whole_ - a.whole_;
-        #else
-            r.halfs_.low = halfs_.low - a.halfs_.low;
-            r.halfs_.high = halfs_.high - a.halfs_.high -
-                             (r.halfs_.low > halfs_.low);
-        #endif
-        return r;
-    }
-
-    DWord operator-(word a)
-    {
-        DWord r;
-        #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
-            r.whole_ = whole_ - a;
-        #else
-            r.halfs_.low = halfs_.low - a;
-            r.halfs_.high = halfs_.high - (r.halfs_.low > halfs_.low);
-        #endif
-        return r;
-    }
-
-    // returns quotient, which must fit in a word
-    word operator/(word divisor);
-
-    word operator%(word a);
-
-    bool operator!() const
-    {
-    #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
-        return !whole_;
-    #else
-        return !halfs_.high && !halfs_.low;
-    #endif
-    }
-
-    word GetLowHalf() const {return halfs_.low;}
-    word GetHighHalf() const {return halfs_.high;}
-    word GetHighHalfAsBorrow() const {return 0-halfs_.high;}
-
-private:
-    struct dword_struct
-        {
-        #ifdef LITTLE_ENDIAN_ORDER
-            word low;
-            word high;
-        #else
-            word high;
-            word low;
-        #endif
-    };
-
-    union
-    {
-    #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
-        dword whole_;
-    #endif
-        struct dword_struct halfs_;
-    };
-};
-
-
-class Word {
-public:
-    Word() {}
-
-    Word(word value)
-    {
-        whole_ = value;
-    }
-
-    Word(hword low, hword high)
-    {
-        whole_ = low | (word(high) << (WORD_BITS/2));
-    }
-
-    static Word Multiply(hword a, hword b)
-    {
-        Word r;
-        r.whole_ = (word)a * b;
-        return r;
-    }
-
-    Word operator-(Word a)
-    {
-        Word r;
-        r.whole_ = whole_ - a.whole_;
-        return r;
-    }
-
-    Word operator-(hword a)
-    {
-        Word r;
-        r.whole_ = whole_ - a;
-        return r;
-    }
-
-    // returns quotient, which must fit in a word
-    hword operator/(hword divisor)
-    {
-        return hword(whole_ / divisor);
-    }
-
-    bool operator!() const
-    {
-        return !whole_;
-    }
-
-    word GetWhole() const {return whole_;}
-    hword GetLowHalf() const {return hword(whole_);}
-    hword GetHighHalf() const {return hword(whole_>>(WORD_BITS/2));}
-    hword GetHighHalfAsBorrow() const {return 0-hword(whole_>>(WORD_BITS/2));}
-
-private:
-    word whole_;
-};
-
-
-// dummy is VC60 compiler bug workaround
-// do a 3 word by 2 word divide, returns quotient and leaves remainder in A
-template <class S, class D>
-S DivideThreeWordsByTwo(S* A, S B0, S B1, D* dummy_VC6_WorkAround = 0)
-{
-    // estimate the quotient: do a 2 S by 1 S divide
-    S Q;
-    if (S(B1+1) == 0)
-        Q = A[2];
-    else
-        Q = D(A[1], A[2]) / S(B1+1);
-
-    // now subtract Q*B from A
-    D p = D::Multiply(B0, Q);
-    D u = (D) A[0] - p.GetLowHalf();
-    A[0] = u.GetLowHalf();
-    u = (D) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() - 
-            D::Multiply(B1, Q);
-    A[1] = u.GetLowHalf();
-    A[2] += u.GetHighHalf();
-
-    // Q <= actual quotient, so fix it
-    while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
-    {
-        u = (D) A[0] - B0;
-        A[0] = u.GetLowHalf();
-        u = (D) A[1] - B1 - u.GetHighHalfAsBorrow();
-        A[1] = u.GetLowHalf();
-        A[2] += u.GetHighHalf();
-        Q++;
-    }
-
-    return Q;
-}
-
-
-// do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
-template <class S, class D>
-inline D DivideFourWordsByTwo(S *T, const D &Al, const D &Ah, const D &B)
-{
-    if (!B) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
-        return D(Ah.GetLowHalf(), Ah.GetHighHalf());
-    else
-    {
-        S Q[2];
-        T[0] = Al.GetLowHalf();
-        T[1] = Al.GetHighHalf(); 
-        T[2] = Ah.GetLowHalf();
-        T[3] = Ah.GetHighHalf();
-        Q[1] = DivideThreeWordsByTwo<S, D>(T+1, B.GetLowHalf(),
-                                                B.GetHighHalf());
-        Q[0] = DivideThreeWordsByTwo<S, D>(T, B.GetLowHalf(), B.GetHighHalf());
-        return D(Q[0], Q[1]);
-    }
-}
-
-
-// returns quotient, which must fit in a word
-inline word DWord::operator/(word a)
-{
-    #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
-        return word(whole_ / a);
-    #else
-        hword r[4];
-        return DivideFourWordsByTwo<hword, Word>(r, halfs_.low,
-                                                    halfs_.high, a).GetWhole();
-    #endif
-}
-
-inline word DWord::operator%(word a)
-{
-    #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
-        return word(whole_ % a);
-    #else
-        if (a < (word(1) << (WORD_BITS/2)))
-        {
-            hword h = hword(a);
-            word r = halfs_.high % h;
-            r = ((halfs_.low >> (WORD_BITS/2)) + (r << (WORD_BITS/2))) % h;
-            return hword((hword(halfs_.low) + (r << (WORD_BITS/2))) % h);
-        }
-        else
-        {
-            hword r[4];
-            DivideFourWordsByTwo<hword, Word>(r, halfs_.low, halfs_.high, a);
-            return Word(r[0], r[1]).GetWhole();
-        }
-    #endif
-}
-
-
-
-// end 5.2.1 DWord and Word adds
-
-
-
-
-
-static const unsigned int RoundupSizeTable[] = {2, 2, 2, 4, 4, 8, 8, 8, 8};
-
-static inline unsigned int RoundupSize(unsigned int n)
-{
-    if (n<=8)
-        return RoundupSizeTable[n];
-    else if (n<=16)
-        return 16;
-    else if (n<=32)
-        return 32;
-    else if (n<=64)
-        return 64;
-    else return 1U << BitPrecision(n-1);
-}
-
-
-static int Compare(const word *A, const word *B, unsigned int N)
-{
-    while (N--)
-        if (A[N] > B[N])
-            return 1;
-        else if (A[N] < B[N])
-            return -1;
-
-    return 0;
-}
-
-static word Increment(word *A, unsigned int N, word B=1)
-{
-    word t = A[0];
-    A[0] = t+B;
-    if (A[0] >= t)
-        return 0;
-    for (unsigned i=1; i<N; i++)
-        if (++A[i])
-            return 0;
-    return 1;
-}
-
-static word Decrement(word *A, unsigned int N, word B=1)
-{
-    word t = A[0];
-    A[0] = t-B;
-    if (A[0] <= t)
-        return 0;
-    for (unsigned i=1; i<N; i++)
-        if (A[i]--)
-            return 0;
-    return 1;
-}
-
-static void TwosComplement(word *A, unsigned int N)
-{
-    Decrement(A, N);
-    for (unsigned i=0; i<N; i++)
-        A[i] = ~A[i];
-}
-
-
-static word LinearMultiply(word *C, const word *A, word B, unsigned int N)
-{
-    word carry=0;
-    for(unsigned i=0; i<N; i++)
-    {
-        DWord p = DWord::MultiplyAndAdd(A[i], B, carry);
-        C[i] = p.GetLowHalf();
-        carry = p.GetHighHalf();
-    }
-    return carry;
-}
-
-
-static word AtomicInverseModPower2(word A)
-{
-    word R=A%8;
-
-    for (unsigned i=3; i<WORD_BITS; i*=2)
-        R = R*(2-R*A);
-
-    return R;
-}
-
-
-// ********************************************************
-
-class Portable
-{
-public:
-    static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B,
-                                   unsigned int N);
-    static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word*B,
-                                        unsigned int N);
-    static void TAOCRYPT_CDECL Multiply2(word *C, const word *A, const word *B);
-    static word TAOCRYPT_CDECL Multiply2Add(word *C,
-                                            const word *A, const word *B);
-    static void TAOCRYPT_CDECL Multiply4(word *C, const word *A, const word *B);
-    static void TAOCRYPT_CDECL Multiply8(word *C, const word *A, const word *B);
-    static unsigned int TAOCRYPT_CDECL MultiplyRecursionLimit() {return 8;}
-
-    static void TAOCRYPT_CDECL Multiply2Bottom(word *C, const word *A,
-                                               const word *B);
-    static void TAOCRYPT_CDECL Multiply4Bottom(word *C, const word *A,
-                                               const word *B);
-    static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A,
-                                               const word *B);
-    static unsigned int TAOCRYPT_CDECL MultiplyBottomRecursionLimit(){return 8;}
-
-    static void TAOCRYPT_CDECL Square2(word *R, const word *A);
-    static void TAOCRYPT_CDECL Square4(word *R, const word *A);
-    static unsigned int TAOCRYPT_CDECL SquareRecursionLimit() {return 4;}
-};
-
-word Portable::Add(word *C, const word *A, const word *B, unsigned int N)
-{
-    DWord u(0, 0);
-    for (unsigned int i = 0; i < N; i+=2)
-    {
-        u = DWord(A[i]) + B[i] + u.GetHighHalf();
-        C[i] = u.GetLowHalf();
-        u = DWord(A[i+1]) + B[i+1] + u.GetHighHalf();
-        C[i+1] = u.GetLowHalf();
-    }
-    return u.GetHighHalf();
-}
-
-word Portable::Subtract(word *C, const word *A, const word *B, unsigned int N)
-{
-    DWord u(0, 0);
-    for (unsigned int i = 0; i < N; i+=2)
-    {
-        u = (DWord) A[i] - B[i] - u.GetHighHalfAsBorrow();
-        C[i] = u.GetLowHalf();
-        u = (DWord) A[i+1] - B[i+1] - u.GetHighHalfAsBorrow();
-        C[i+1] = u.GetLowHalf();
-    }
-    return 0-u.GetHighHalf();
-}
-
-void Portable::Multiply2(word *C, const word *A, const word *B)
-{
-/*
-    word s;
-    dword d;
-
-    if (A1 >= A0)
-        if (B0 >= B1)
-        {
-            s = 0;
-            d = (dword)(A1-A0)*(B0-B1);
-        }
-        else
-        {
-            s = (A1-A0);
-            d = (dword)s*(word)(B0-B1);
-        }
-    else
-        if (B0 > B1)
-        {
-            s = (B0-B1);
-            d = (word)(A1-A0)*(dword)s;
-        }
-        else
-        {
-            s = 0;
-            d = (dword)(A0-A1)*(B1-B0);
-        }
-*/
-    // this segment is the branchless equivalent of above
-    word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
-    unsigned int ai = A[1] < A[0];
-    unsigned int bi = B[0] < B[1];
-    unsigned int di = ai & bi;
-    DWord d = DWord::Multiply(D[di], D[di+2]);
-    D[1] = D[3] = 0;
-    unsigned int si = ai + !bi;
-    word s = D[si];
-
-    DWord A0B0 = DWord::Multiply(A[0], B[0]);
-    C[0] = A0B0.GetLowHalf();
-
-    DWord A1B1 = DWord::Multiply(A[1], B[1]);
-    DWord t = (DWord) A0B0.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf()
-                       + A1B1.GetLowHalf();
-    C[1] = t.GetLowHalf();
-
-    t = A1B1 + t.GetHighHalf() + A0B0.GetHighHalf() + d.GetHighHalf()
-             + A1B1.GetHighHalf() - s;
-    C[2] = t.GetLowHalf();
-    C[3] = t.GetHighHalf();
-}
-
-void Portable::Multiply2Bottom(word *C, const word *A, const word *B)
-{
-    DWord t = DWord::Multiply(A[0], B[0]);
-    C[0] = t.GetLowHalf();
-    C[1] = t.GetHighHalf() + A[0]*B[1] + A[1]*B[0];
-}
-
-word Portable::Multiply2Add(word *C, const word *A, const word *B)
-{
-    word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
-    unsigned int ai = A[1] < A[0];
-    unsigned int bi = B[0] < B[1];
-    unsigned int di = ai & bi;
-    DWord d = DWord::Multiply(D[di], D[di+2]);
-    D[1] = D[3] = 0;
-    unsigned int si = ai + !bi;
-    word s = D[si];
-
-    DWord A0B0 = DWord::Multiply(A[0], B[0]);
-    DWord t = A0B0 + C[0];
-    C[0] = t.GetLowHalf();
-
-    DWord A1B1 = DWord::Multiply(A[1], B[1]);
-    t = (DWord) t.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() +
-        A1B1.GetLowHalf() + C[1];
-    C[1] = t.GetLowHalf();
-
-    t = (DWord) t.GetHighHalf() + A1B1.GetLowHalf() + A0B0.GetHighHalf() +
-        d.GetHighHalf() + A1B1.GetHighHalf() - s + C[2];
-    C[2] = t.GetLowHalf();
-
-    t = (DWord) t.GetHighHalf() + A1B1.GetHighHalf() + C[3];
-    C[3] = t.GetLowHalf();
-    return t.GetHighHalf();
-}
-
-
-#define MulAcc(x, y)                                \
-    p = DWord::MultiplyAndAdd(A[x], B[y], c);       \
-    c = p.GetLowHalf();                             \
-    p = (DWord) d + p.GetHighHalf();                \
-    d = p.GetLowHalf();                             \
-    e += p.GetHighHalf();
-
-#define SaveMulAcc(s, x, y)                         \
-    R[s] = c;                                       \
-    p = DWord::MultiplyAndAdd(A[x], B[y], d);       \
-    c = p.GetLowHalf();                             \
-    p = (DWord) e + p.GetHighHalf();                \
-    d = p.GetLowHalf();                             \
-    e = p.GetHighHalf();
-
-#define SquAcc(x, y)                                \
-    q = DWord::Multiply(A[x], A[y]);                \
-    p = q + c;                                      \
-    c = p.GetLowHalf();                             \
-    p = (DWord) d + p.GetHighHalf();                \
-    d = p.GetLowHalf();                             \
-    e += p.GetHighHalf();                           \
-    p = q + c;                                      \
-    c = p.GetLowHalf();                             \
-    p = (DWord) d + p.GetHighHalf();                \
-    d = p.GetLowHalf();                             \
-    e += p.GetHighHalf();
-
-#define SaveSquAcc(s, x, y)                         \
-    R[s] = c;                                       \
-    q = DWord::Multiply(A[x], A[y]);                \
-    p = q + d;                                      \
-    c = p.GetLowHalf();                             \
-    p = (DWord) e + p.GetHighHalf();                \
-    d = p.GetLowHalf();                             \
-    e = p.GetHighHalf();                            \
-    p = q + c;                                      \
-    c = p.GetLowHalf();                             \
-    p = (DWord) d + p.GetHighHalf();                \
-    d = p.GetLowHalf();                             \
-    e += p.GetHighHalf();
-
-
-void Portable::Multiply4(word *R, const word *A, const word *B)
-{
-    DWord p;
-    word c, d, e;
-
-    p = DWord::Multiply(A[0], B[0]);
-    R[0] = p.GetLowHalf();
-    c = p.GetHighHalf();
-    d = e = 0;
-
-    MulAcc(0, 1);
-    MulAcc(1, 0);
-
-    SaveMulAcc(1, 2, 0);
-    MulAcc(1, 1);
-    MulAcc(0, 2);
-
-    SaveMulAcc(2, 0, 3);
-    MulAcc(1, 2);
-    MulAcc(2, 1);
-    MulAcc(3, 0);
-
-    SaveMulAcc(3, 3, 1);
-    MulAcc(2, 2);
-    MulAcc(1, 3);
-
-    SaveMulAcc(4, 2, 3);
-    MulAcc(3, 2);
-
-    R[5] = c;
-    p = DWord::MultiplyAndAdd(A[3], B[3], d);
-    R[6] = p.GetLowHalf();
-    R[7] = e + p.GetHighHalf();
-}
-
-void Portable::Square2(word *R, const word *A)
-{
-    DWord p, q;
-    word c, d, e;
-
-    p = DWord::Multiply(A[0], A[0]);
-    R[0] = p.GetLowHalf();
-    c = p.GetHighHalf();
-    d = e = 0;
-
-    SquAcc(0, 1);
-
-    R[1] = c;
-    p = DWord::MultiplyAndAdd(A[1], A[1], d);
-    R[2] = p.GetLowHalf();
-    R[3] = e + p.GetHighHalf();
-}
-
-void Portable::Square4(word *R, const word *A)
-{
-#ifdef _MSC_VER
-    // VC60 workaround: MSVC 6.0 has an optimization bug that makes
-    // (dword)A*B where either A or B has been cast to a dword before
-    // very expensive. Revisit this function when this
-    // bug is fixed.
-    Multiply4(R, A, A);
-#else
-    const word *B = A;
-    DWord p, q;
-    word c, d, e;
-
-    p = DWord::Multiply(A[0], A[0]);
-    R[0] = p.GetLowHalf();
-    c = p.GetHighHalf();
-    d = e = 0;
-
-    SquAcc(0, 1);
-
-    SaveSquAcc(1, 2, 0);
-    MulAcc(1, 1);
-
-    SaveSquAcc(2, 0, 3);
-    SquAcc(1, 2);
-
-    SaveSquAcc(3, 3, 1);
-    MulAcc(2, 2);
-
-    SaveSquAcc(4, 2, 3);
-
-    R[5] = c;
-    p = DWord::MultiplyAndAdd(A[3], A[3], d);
-    R[6] = p.GetLowHalf();
-    R[7] = e + p.GetHighHalf();
-#endif
-}
-
-void Portable::Multiply8(word *R, const word *A, const word *B)
-{
-    DWord p;
-    word c, d, e;
-
-    p = DWord::Multiply(A[0], B[0]);
-    R[0] = p.GetLowHalf();
-    c = p.GetHighHalf();
-    d = e = 0;
-
-    MulAcc(0, 1);
-    MulAcc(1, 0);
-
-    SaveMulAcc(1, 2, 0);
-    MulAcc(1, 1);
-    MulAcc(0, 2);
-
-    SaveMulAcc(2, 0, 3);
-    MulAcc(1, 2);
-    MulAcc(2, 1);
-    MulAcc(3, 0);
-
-    SaveMulAcc(3, 0, 4);
-    MulAcc(1, 3);
-    MulAcc(2, 2);
-    MulAcc(3, 1);
-    MulAcc(4, 0);
-
-    SaveMulAcc(4, 0, 5);
-    MulAcc(1, 4);
-    MulAcc(2, 3);
-    MulAcc(3, 2);
-    MulAcc(4, 1);
-    MulAcc(5, 0);
-
-    SaveMulAcc(5, 0, 6);
-    MulAcc(1, 5);
-    MulAcc(2, 4);
-    MulAcc(3, 3);
-    MulAcc(4, 2);
-    MulAcc(5, 1);
-    MulAcc(6, 0);
-
-    SaveMulAcc(6, 0, 7);
-    MulAcc(1, 6);
-    MulAcc(2, 5);
-    MulAcc(3, 4);
-    MulAcc(4, 3);
-    MulAcc(5, 2);
-    MulAcc(6, 1);
-    MulAcc(7, 0);
-
-    SaveMulAcc(7, 1, 7);
-    MulAcc(2, 6);
-    MulAcc(3, 5);
-    MulAcc(4, 4);
-    MulAcc(5, 3);
-    MulAcc(6, 2);
-    MulAcc(7, 1);
-
-    SaveMulAcc(8, 2, 7);
-    MulAcc(3, 6);
-    MulAcc(4, 5);
-    MulAcc(5, 4);
-    MulAcc(6, 3);
-    MulAcc(7, 2);
-
-    SaveMulAcc(9, 3, 7);
-    MulAcc(4, 6);
-    MulAcc(5, 5);
-    MulAcc(6, 4);
-    MulAcc(7, 3);
-
-    SaveMulAcc(10, 4, 7);
-    MulAcc(5, 6);
-    MulAcc(6, 5);
-    MulAcc(7, 4);
-
-    SaveMulAcc(11, 5, 7);
-    MulAcc(6, 6);
-    MulAcc(7, 5);
-
-    SaveMulAcc(12, 6, 7);
-    MulAcc(7, 6);
-
-    R[13] = c;
-    p = DWord::MultiplyAndAdd(A[7], B[7], d);
-    R[14] = p.GetLowHalf();
-    R[15] = e + p.GetHighHalf();
-}
-
-void Portable::Multiply4Bottom(word *R, const word *A, const word *B)
-{
-    DWord p;
-    word c, d, e;
-
-    p = DWord::Multiply(A[0], B[0]);
-    R[0] = p.GetLowHalf();
-    c = p.GetHighHalf();
-    d = e = 0;
-
-    MulAcc(0, 1);
-    MulAcc(1, 0);
-
-    SaveMulAcc(1, 2, 0);
-    MulAcc(1, 1);
-    MulAcc(0, 2);
-
-    R[2] = c;
-    R[3] = d + A[0] * B[3] + A[1] * B[2] + A[2] * B[1] + A[3] * B[0];
-}
-
-void Portable::Multiply8Bottom(word *R, const word *A, const word *B)
-{
-    DWord p;
-    word c, d, e;
-
-    p = DWord::Multiply(A[0], B[0]);
-    R[0] = p.GetLowHalf();
-    c = p.GetHighHalf();
-    d = e = 0;
-
-    MulAcc(0, 1);
-    MulAcc(1, 0);
-
-    SaveMulAcc(1, 2, 0);
-    MulAcc(1, 1);
-    MulAcc(0, 2);
-
-    SaveMulAcc(2, 0, 3);
-    MulAcc(1, 2);
-    MulAcc(2, 1);
-    MulAcc(3, 0);
-
-    SaveMulAcc(3, 0, 4);
-    MulAcc(1, 3);
-    MulAcc(2, 2);
-    MulAcc(3, 1);
-    MulAcc(4, 0);
-
-    SaveMulAcc(4, 0, 5);
-    MulAcc(1, 4);
-    MulAcc(2, 3);
-    MulAcc(3, 2);
-    MulAcc(4, 1);
-    MulAcc(5, 0);
-
-    SaveMulAcc(5, 0, 6);
-    MulAcc(1, 5);
-    MulAcc(2, 4);
-    MulAcc(3, 3);
-    MulAcc(4, 2);
-    MulAcc(5, 1);
-    MulAcc(6, 0);
-
-    R[6] = c;
-    R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] +
-               A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0];
-}
-
-
-#undef MulAcc
-#undef SaveMulAcc
-#undef SquAcc
-#undef SaveSquAcc
-
-// optimized
-
-#ifdef TAOCRYPT_X86ASM_AVAILABLE
-
-// ************** x86 feature detection ***************
-
-
-#ifdef SSE2_INTRINSICS_AVAILABLE
-
-#ifndef _MSC_VER
-    static jmp_buf s_env;
-    static void SigIllHandler(int)
-    {
-        longjmp(s_env, 1);
-    }
-#endif
-
-static bool HasSSE2()
-{
-    if (!IsPentium())
-        return false;
-
-    word32 cpuid[4];
-    CpuId(1, cpuid);
-    if ((cpuid[3] & (1 << 26)) == 0)
-        return false;
-
-#ifdef _MSC_VER
-    __try
-    {
-        __asm xorpd xmm0, xmm0        // executing SSE2 instruction
-    }
-    __except (1)
-    {
-        return false;
-    }
-    return true;
-#else
-    typedef void (*SigHandler)(int);
-
-    SigHandler oldHandler = signal(SIGILL, SigIllHandler);
-    if (oldHandler == SIG_ERR)
-        return false;
-
-    bool result = true;
-    if (setjmp(s_env))
-        result = false;
-    else
-        __asm __volatile ("xorpd %xmm0, %xmm0");
-
-    signal(SIGILL, oldHandler);
-    return result;
-#endif
-}
-#endif // SSE2_INTRINSICS_AVAILABLE
-
-
-static bool IsP4()
-{
-    if (!IsPentium())
-        return false;
-
-    word32 cpuid[4];
-
-    CpuId(1, cpuid);
-    return ((cpuid[0] >> 8) & 0xf) == 0xf;
-}
-
-// ************** Pentium/P4 optimizations ***************
-
-class PentiumOptimized : public Portable
-{
-public:
-    static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B,
-                                   unsigned int N);
-    static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word *B,
-                                        unsigned int N);
-    static void TAOCRYPT_CDECL Multiply4(word *C, const word *A,
-                                         const word *B);
-    static void TAOCRYPT_CDECL Multiply8(word *C, const word *A,
-                                         const word *B);
-    static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A,
-                                               const word *B);
-};
-
-class P4Optimized
-{
-public:
-    static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B,
-                                   unsigned int N);
-    static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word *B,
-                                        unsigned int N);
-#ifdef SSE2_INTRINSICS_AVAILABLE
-    static void TAOCRYPT_CDECL Multiply4(word *C, const word *A,
-                                         const word *B);
-    static void TAOCRYPT_CDECL Multiply8(word *C, const word *A,
-                                         const word *B);
-    static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A,
-                                               const word *B);
-#endif
-};
-
-typedef word (TAOCRYPT_CDECL * PAddSub)(word *C, const word *A, const word *B,
-                                        unsigned int N);
-typedef void (TAOCRYPT_CDECL * PMul)(word *C, const word *A, const word *B);
-
-static PAddSub s_pAdd, s_pSub;
-#ifdef SSE2_INTRINSICS_AVAILABLE
-static PMul s_pMul4, s_pMul8, s_pMul8B;
-#endif
-
-static void SetPentiumFunctionPointers()
-{
-    if (!IsPentium())
-    {   
-        s_pAdd = &Portable::Add;
-        s_pSub = &Portable::Subtract;
-    }
-    else if (IsP4())
-    {
-        s_pAdd = &P4Optimized::Add;
-        s_pSub = &P4Optimized::Subtract;
-    }
-    else
-    {
-        s_pAdd = &PentiumOptimized::Add;
-        s_pSub = &PentiumOptimized::Subtract;
-    }
-
-#ifdef SSE2_INTRINSICS_AVAILABLE
-    if (!IsPentium()) 
-    {
-        s_pMul4 = &Portable::Multiply4;
-        s_pMul8 = &Portable::Multiply8;
-        s_pMul8B = &Portable::Multiply8Bottom;
-    }
-    else if (HasSSE2())
-    {
-        s_pMul4 = &P4Optimized::Multiply4;
-        s_pMul8 = &P4Optimized::Multiply8;
-        s_pMul8B = &P4Optimized::Multiply8Bottom;
-    }
-    else
-    {
-        s_pMul4 = &PentiumOptimized::Multiply4;
-        s_pMul8 = &PentiumOptimized::Multiply8;
-        s_pMul8B = &PentiumOptimized::Multiply8Bottom;
-    }
-#endif
-}
-
-static const char s_RunAtStartupSetPentiumFunctionPointers =
-    (SetPentiumFunctionPointers(), 0);
-
-
-class LowLevel : public PentiumOptimized
-{
-public:
-    inline static word Add(word *C, const word *A, const word *B,
-                           unsigned int N)
-        {return s_pAdd(C, A, B, N);}
-    inline static word Subtract(word *C, const word *A, const word *B,
-                                unsigned int N)
-        {return s_pSub(C, A, B, N);}
-    inline static void Square4(word *R, const word *A)
-        {Multiply4(R, A, A);}
-#ifdef SSE2_INTRINSICS_AVAILABLE
-    inline static void Multiply4(word *C, const word *A, const word *B)
-        {s_pMul4(C, A, B);}
-    inline static void Multiply8(word *C, const word *A, const word *B)
-        {s_pMul8(C, A, B);}
-    inline static void Multiply8Bottom(word *C, const word *A, const word *B)
-        {s_pMul8B(C, A, B);}
-#endif
-};
-
-// use some tricks to share assembly code between MSVC and GCC
-#ifdef _MSC_VER
-    #define TAOCRYPT_NAKED __declspec(naked)
-    #define AS1(x) __asm x
-    #define AS2(x, y) __asm x, y
-    #define AddPrologue \
-        __asm	push ebp \
-        __asm	push ebx \
-        __asm	push esi \
-        __asm	push edi \
-        __asm	mov		ecx, [esp+20] \
-        __asm	mov		edx, [esp+24] \
-        __asm	mov		ebx, [esp+28] \
-        __asm	mov		esi, [esp+32]
-    #define AddEpilogue \
-        __asm	pop edi \
-        __asm	pop esi \
-        __asm	pop ebx \
-        __asm	pop ebp \
-        __asm	ret
-    #define MulPrologue \
-        __asm	push ebp \
-        __asm	push ebx \
-        __asm	push esi \
-        __asm	push edi \
-        __asm	mov ecx, [esp+28] \
-        __asm	mov esi, [esp+24] \
-        __asm	push [esp+20]
-    #define MulEpilogue \
-        __asm	add esp, 4 \
-        __asm	pop edi \
-        __asm	pop esi \
-        __asm	pop ebx \
-        __asm	pop ebp \
-        __asm	ret
-#else
-    #define TAOCRYPT_NAKED
-    #define AS1(x) #x ";"
-    #define AS2(x, y) #x ", " #y ";"
-    #define AddPrologue \
-        word res; \
-        __asm__ __volatile__ \
-        ( \
-            "push %%ebx;"	/* save this manually, in case of -fPIC */ \
-            "mov %3, %%ebx;" \
-            ".intel_syntax noprefix;" \
-            "push ebp;"
-    #define AddEpilogue \
-            "pop ebp;" \
-            ".att_syntax prefix;" \
-            "pop %%ebx;" \
-            "mov %%eax, %0;" \
-                    : "=g" (res) \
-                    : "c" (C), "d" (A), "m" (B), "S" (N) \
-                    : "%edi", "memory", "cc" \
-        ); \
-        return res;
-
-    #define MulPrologue \
-        __asm__ __volatile__ \
-        ( \
-            "push %%ebx;"	/* save this manually, in case of -fPIC */ \
-            "push %%ebp;" \
-            "push %0;" \
-            ".intel_syntax noprefix;"
-    #define MulEpilogue \
-            "add esp, 4;" \
-            "pop ebp;" \
-            "pop ebx;" \
-            ".att_syntax prefix;" \
-            : \
-            : "rm" (Z), "S" (X), "c" (Y) \
-            : "%eax", "%edx", "%edi", "memory", "cc" \
-        );
-#endif
-
-TAOCRYPT_NAKED word PentiumOptimized::Add(word *C, const word *A,
-                                          const word *B, unsigned int N)
-{
-    AddPrologue
-
-    // now: ebx = B, ecx = C, edx = A, esi = N
-    AS2(    sub ecx, edx)           // hold the distance between C & A so we
-                                    // can add this to A to get C
-    AS2(    xor eax, eax)           // clear eax
-
-    AS2(    sub eax, esi)           // eax is a negative index from end of B
-    AS2(    lea ebx, [ebx+4*esi])   // ebx is end of B
-
-    AS2(    sar eax, 1)             // unit of eax is now dwords; this also
-                                    // clears the carry flag
-    AS1(    jz  loopendAdd)         // if no dwords then nothing to do
-
-    AS1(loopstartAdd:)
-    AS2(    mov    esi,[edx])           // load lower word of A
-    AS2(    mov    ebp,[edx+4])         // load higher word of A
-
-    AS2(    mov    edi,[ebx+8*eax])     // load lower word of B
-    AS2(    lea    edx,[edx+8])         // advance A and C
-
-    AS2(    adc    esi,edi)             // add lower words
-    AS2(    mov    edi,[ebx+8*eax+4])   // load higher word of B
-
-    AS2(    adc    ebp,edi)             // add higher words
-    AS1(    inc    eax)                 // advance B
-
-    AS2(    mov    [edx+ecx-8],esi)     // store lower word result
-    AS2(    mov    [edx+ecx-4],ebp)     // store higher word result
-
-    AS1(    jnz    loopstartAdd)   // loop until eax overflows and becomes zero
-
-    AS1(loopendAdd:)
-    AS2(    adc eax, 0)     // store carry into eax (return result register)
-
-    AddEpilogue
-}
-
-TAOCRYPT_NAKED word PentiumOptimized::Subtract(word *C, const word *A,
-                                               const word *B, unsigned int N)
-{
-    AddPrologue
-
-    // now: ebx = B, ecx = C, edx = A, esi = N
-    AS2(    sub ecx, edx)           // hold the distance between C & A so we
-                                    // can add this to A to get C
-    AS2(    xor eax, eax)           // clear eax
-
-    AS2(    sub eax, esi)           // eax is a negative index from end of B
-    AS2(    lea ebx, [ebx+4*esi])   // ebx is end of B
-
-    AS2(    sar eax, 1)             // unit of eax is now dwords; this also
-                                    // clears the carry flag
-    AS1(    jz  loopendSub)         // if no dwords then nothing to do
-
-    AS1(loopstartSub:)
-    AS2(    mov    esi,[edx])           // load lower word of A
-    AS2(    mov    ebp,[edx+4])         // load higher word of A
-
-    AS2(    mov    edi,[ebx+8*eax])     // load lower word of B
-    AS2(    lea    edx,[edx+8])         // advance A and C
-
-    AS2(    sbb    esi,edi)             // subtract lower words
-    AS2(    mov    edi,[ebx+8*eax+4])   // load higher word of B
-
-    AS2(    sbb    ebp,edi)             // subtract higher words
-    AS1(    inc    eax)                 // advance B
-
-    AS2(    mov    [edx+ecx-8],esi)     // store lower word result
-    AS2(    mov    [edx+ecx-4],ebp)     // store higher word result
-
-    AS1(    jnz    loopstartSub)   // loop until eax overflows and becomes zero
-
-    AS1(loopendSub:)
-    AS2(    adc eax, 0)     // store carry into eax (return result register)
-
-    AddEpilogue
-}
-
-// On Pentium 4, the adc and sbb instructions are very expensive, so avoid them.
-
-TAOCRYPT_NAKED word P4Optimized::Add(word *C, const word *A, const word *B,
-                                     unsigned int N)
-{
-    AddPrologue
-
-    // now: ebx = B, ecx = C, edx = A, esi = N
-    AS2(    xor     eax, eax)
-    AS1(    neg     esi)
-    AS1(    jz      loopendAddP4)       // if no dwords then nothing to do
-
-    AS2(    mov     edi, [edx])
-    AS2(    mov     ebp, [ebx])
-    AS1(    jmp     carry1AddP4)
-
-    AS1(loopstartAddP4:)
-    AS2(    mov     edi, [edx+8])
-    AS2(    add     ecx, 8)
-    AS2(    add     edx, 8)
-    AS2(    mov     ebp, [ebx])
-    AS2(    add     edi, eax)
-    AS1(    jc      carry1AddP4)
-    AS2(    xor     eax, eax)
-
-    AS1(carry1AddP4:)
-    AS2(    add     edi, ebp)
-    AS2(    mov     ebp, 1)
-    AS2(    mov     [ecx], edi)
-    AS2(    mov     edi, [edx+4])
-    AS2(    cmovc   eax, ebp)
-    AS2(    mov     ebp, [ebx+4])
-    AS2(    add     ebx, 8)
-    AS2(    add     edi, eax)
-    AS1(    jc      carry2AddP4)
-    AS2(    xor     eax, eax)
-
-    AS1(carry2AddP4:)
-    AS2(    add     edi, ebp)
-    AS2(    mov     ebp, 1)
-    AS2(    cmovc   eax, ebp)
-    AS2(    mov     [ecx+4], edi)
-    AS2(    add     esi, 2)
-    AS1(    jnz     loopstartAddP4)
-
-    AS1(loopendAddP4:)
-
-    AddEpilogue
-}
-
-TAOCRYPT_NAKED word P4Optimized::Subtract(word *C, const word *A,
-                                          const word *B, unsigned int N)
-{
-    AddPrologue
-
-    // now: ebx = B, ecx = C, edx = A, esi = N
-    AS2(    xor     eax, eax)
-    AS1(    neg     esi)
-    AS1(    jz      loopendSubP4)       // if no dwords then nothing to do
-
-    AS2(    mov     edi, [edx])
-    AS2(    mov     ebp, [ebx])
-    AS1(    jmp     carry1SubP4)
-
-    AS1(loopstartSubP4:)
-    AS2(    mov     edi, [edx+8])
-    AS2(    add     edx, 8)
-    AS2(    add     ecx, 8)
-    AS2(    mov     ebp, [ebx])
-    AS2(    sub     edi, eax)
-    AS1(    jc      carry1SubP4)
-    AS2(    xor     eax, eax)
-
-    AS1(carry1SubP4:)
-    AS2(    sub     edi, ebp)
-    AS2(    mov     ebp, 1)
-    AS2(    mov     [ecx], edi)
-    AS2(    mov     edi, [edx+4])
-    AS2(    cmovc   eax, ebp)
-    AS2(    mov     ebp, [ebx+4])
-    AS2(    add     ebx, 8)
-    AS2(    sub     edi, eax)
-    AS1(    jc      carry2SubP4)
-    AS2(    xor     eax, eax)
-
-    AS1(carry2SubP4:)
-    AS2(    sub     edi, ebp)
-    AS2(    mov     ebp, 1)
-    AS2(    cmovc   eax, ebp)
-    AS2(    mov     [ecx+4], edi)
-    AS2(    add     esi, 2)
-    AS1(    jnz     loopstartSubP4)
-
-    AS1(loopendSubP4:)
-
-    AddEpilogue
-}
-
-// multiply assembly code originally contributed by Leonard Janke
-
-#define MulStartup \
-    AS2(xor ebp, ebp) \
-    AS2(xor edi, edi) \
-    AS2(xor ebx, ebx) 
-
-#define MulShiftCarry \
-    AS2(mov ebp, edx) \
-    AS2(mov edi, ebx) \
-    AS2(xor ebx, ebx)
-
-#define MulAccumulateBottom(i,j) \
-    AS2(mov eax, [ecx+4*j]) \
-    AS2(imul eax, dword ptr [esi+4*i]) \
-    AS2(add ebp, eax)
-
-#define MulAccumulate(i,j) \
-    AS2(mov eax, [ecx+4*j]) \
-    AS1(mul dword ptr [esi+4*i]) \
-    AS2(add ebp, eax) \
-    AS2(adc edi, edx) \
-    AS2(adc bl, bh)
-
-#define MulStoreDigit(i)  \
-    AS2(mov edx, edi) \
-    AS2(mov edi, [esp]) \
-    AS2(mov [edi+4*i], ebp)
-
-#define MulLastDiagonal(digits) \
-    AS2(mov eax, [ecx+4*(digits-1)]) \
-    AS1(mul dword ptr [esi+4*(digits-1)]) \
-    AS2(add ebp, eax) \
-    AS2(adc edx, edi) \
-    AS2(mov edi, [esp]) \
-    AS2(mov [edi+4*(2*digits-2)], ebp) \
-    AS2(mov [edi+4*(2*digits-1)], edx)
-
-TAOCRYPT_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X,
-                                                const word* Y)
-{
-    MulPrologue
-    // now: [esp] = Z, esi = X, ecx = Y
-    MulStartup
-    MulAccumulate(0,0)
-    MulStoreDigit(0)
-    MulShiftCarry
-
-    MulAccumulate(1,0)
-    MulAccumulate(0,1)
-    MulStoreDigit(1)
-    MulShiftCarry
-
-    MulAccumulate(2,0)
-    MulAccumulate(1,1)
-    MulAccumulate(0,2)
-    MulStoreDigit(2)
-    MulShiftCarry
-
-    MulAccumulate(3,0)
-    MulAccumulate(2,1)
-    MulAccumulate(1,2)
-    MulAccumulate(0,3)
-    MulStoreDigit(3)
-    MulShiftCarry
-
-    MulAccumulate(3,1)
-    MulAccumulate(2,2)
-    MulAccumulate(1,3)
-    MulStoreDigit(4)
-    MulShiftCarry
-
-    MulAccumulate(3,2)
-    MulAccumulate(2,3)
-    MulStoreDigit(5)
-    MulShiftCarry
-
-    MulLastDiagonal(4)
-    MulEpilogue
-}
-
-TAOCRYPT_NAKED void PentiumOptimized::Multiply8(word* Z, const word* X,
-                                                const word* Y)
-{
-    MulPrologue
-    // now: [esp] = Z, esi = X, ecx = Y
-    MulStartup
-    MulAccumulate(0,0)
-    MulStoreDigit(0)
-    MulShiftCarry
-
-    MulAccumulate(1,0)
-    MulAccumulate(0,1)
-    MulStoreDigit(1)
-    MulShiftCarry
-
-    MulAccumulate(2,0)
-    MulAccumulate(1,1)
-    MulAccumulate(0,2)
-    MulStoreDigit(2)
-    MulShiftCarry
-
-    MulAccumulate(3,0)
-    MulAccumulate(2,1)
-    MulAccumulate(1,2)
-    MulAccumulate(0,3)
-    MulStoreDigit(3)
-    MulShiftCarry
-
-    MulAccumulate(4,0)
-    MulAccumulate(3,1)
-    MulAccumulate(2,2)
-    MulAccumulate(1,3)
-    MulAccumulate(0,4)
-    MulStoreDigit(4)
-    MulShiftCarry
-
-    MulAccumulate(5,0)
-    MulAccumulate(4,1)
-    MulAccumulate(3,2)
-    MulAccumulate(2,3)
-    MulAccumulate(1,4)
-    MulAccumulate(0,5)
-    MulStoreDigit(5)
-    MulShiftCarry
-
-    MulAccumulate(6,0)
-    MulAccumulate(5,1)
-    MulAccumulate(4,2)
-    MulAccumulate(3,3)
-    MulAccumulate(2,4)
-    MulAccumulate(1,5)
-    MulAccumulate(0,6)
-    MulStoreDigit(6)
-    MulShiftCarry
-
-    MulAccumulate(7,0)
-    MulAccumulate(6,1)
-    MulAccumulate(5,2)
-    MulAccumulate(4,3)
-    MulAccumulate(3,4)
-    MulAccumulate(2,5)
-    MulAccumulate(1,6)
-    MulAccumulate(0,7)
-    MulStoreDigit(7)
-    MulShiftCarry
-
-    MulAccumulate(7,1)
-    MulAccumulate(6,2)
-    MulAccumulate(5,3)
-    MulAccumulate(4,4)
-    MulAccumulate(3,5)
-    MulAccumulate(2,6)
-    MulAccumulate(1,7)
-    MulStoreDigit(8)
-    MulShiftCarry
-
-    MulAccumulate(7,2)
-    MulAccumulate(6,3)
-    MulAccumulate(5,4)
-    MulAccumulate(4,5)
-    MulAccumulate(3,6)
-    MulAccumulate(2,7)
-    MulStoreDigit(9)
-    MulShiftCarry
-
-    MulAccumulate(7,3)
-    MulAccumulate(6,4)
-    MulAccumulate(5,5)
-    MulAccumulate(4,6)
-    MulAccumulate(3,7)
-    MulStoreDigit(10)
-    MulShiftCarry
-
-    MulAccumulate(7,4)
-    MulAccumulate(6,5)
-    MulAccumulate(5,6)
-    MulAccumulate(4,7)
-    MulStoreDigit(11)
-    MulShiftCarry
-
-    MulAccumulate(7,5)
-    MulAccumulate(6,6)
-    MulAccumulate(5,7)
-    MulStoreDigit(12)
-    MulShiftCarry
-
-    MulAccumulate(7,6)
-    MulAccumulate(6,7)
-    MulStoreDigit(13)
-    MulShiftCarry
-
-    MulLastDiagonal(8)
-    MulEpilogue
-}
-
-TAOCRYPT_NAKED void PentiumOptimized::Multiply8Bottom(word* Z, const word* X,
-                                                      const word* Y)
-{
-    MulPrologue
-    // now: [esp] = Z, esi = X, ecx = Y
-    MulStartup
-    MulAccumulate(0,0)
-    MulStoreDigit(0)
-    MulShiftCarry
-
-    MulAccumulate(1,0)
-    MulAccumulate(0,1)
-    MulStoreDigit(1)
-    MulShiftCarry
-
-    MulAccumulate(2,0)
-    MulAccumulate(1,1)
-    MulAccumulate(0,2)
-    MulStoreDigit(2)
-    MulShiftCarry
-
-    MulAccumulate(3,0)
-    MulAccumulate(2,1)
-    MulAccumulate(1,2)
-    MulAccumulate(0,3)
-    MulStoreDigit(3)
-    MulShiftCarry
-
-    MulAccumulate(4,0)
-    MulAccumulate(3,1)
-    MulAccumulate(2,2)
-    MulAccumulate(1,3)
-    MulAccumulate(0,4)
-    MulStoreDigit(4)
-    MulShiftCarry
-
-    MulAccumulate(5,0)
-    MulAccumulate(4,1)
-    MulAccumulate(3,2)
-    MulAccumulate(2,3)
-    MulAccumulate(1,4)
-    MulAccumulate(0,5)
-    MulStoreDigit(5)
-    MulShiftCarry
-
-    MulAccumulate(6,0)
-    MulAccumulate(5,1)
-    MulAccumulate(4,2)
-    MulAccumulate(3,3)
-    MulAccumulate(2,4)
-    MulAccumulate(1,5)
-    MulAccumulate(0,6)
-    MulStoreDigit(6)
-    MulShiftCarry
-
-    MulAccumulateBottom(7,0)
-    MulAccumulateBottom(6,1)
-    MulAccumulateBottom(5,2)
-    MulAccumulateBottom(4,3)
-    MulAccumulateBottom(3,4)
-    MulAccumulateBottom(2,5)
-    MulAccumulateBottom(1,6)
-    MulAccumulateBottom(0,7)
-    MulStoreDigit(7)
-    MulEpilogue
-}
-
-#undef AS1
-#undef AS2
-
-#else	// not x86 - no processor specific code at this layer
-
-typedef Portable LowLevel;
-
-#endif
-
-#ifdef SSE2_INTRINSICS_AVAILABLE
-
-#ifdef __GNUC__
-#define TAOCRYPT_FASTCALL
-#else
-#define TAOCRYPT_FASTCALL __fastcall
-#endif
-
-static void TAOCRYPT_FASTCALL P4_Mul(__m128i *C, const __m128i *A,
-                                     const __m128i *B)
-{
-    __m128i a3210 = _mm_load_si128(A);
-    __m128i b3210 = _mm_load_si128(B);
-
-    __m128i sum;
-
-    __m128i z = _mm_setzero_si128();
-    __m128i a2b2_a0b0 = _mm_mul_epu32(a3210, b3210);
-    C[0] = a2b2_a0b0;
-
-    __m128i a3120 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(3, 1, 2, 0));
-    __m128i b3021 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 2, 1));
-    __m128i a1b0_a0b1 = _mm_mul_epu32(a3120, b3021);
-    __m128i a1b0 = _mm_unpackhi_epi32(a1b0_a0b1, z);
-    __m128i a0b1 = _mm_unpacklo_epi32(a1b0_a0b1, z);
-    C[1] = _mm_add_epi64(a1b0, a0b1);
-
-    __m128i a31 = _mm_srli_epi64(a3210, 32);
-    __m128i b31 = _mm_srli_epi64(b3210, 32);
-    __m128i a3b3_a1b1 = _mm_mul_epu32(a31, b31);
-    C[6] = a3b3_a1b1;
-
-    __m128i a1b1 = _mm_unpacklo_epi32(a3b3_a1b1, z);
-    __m128i b3012 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 1, 2));
-    __m128i a2b0_a0b2 = _mm_mul_epu32(a3210, b3012);
-    __m128i a0b2 = _mm_unpacklo_epi32(a2b0_a0b2, z);
-    __m128i a2b0 = _mm_unpackhi_epi32(a2b0_a0b2, z);
-    sum = _mm_add_epi64(a1b1, a0b2);
-    C[2] = _mm_add_epi64(sum, a2b0);
-
-    __m128i a2301 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(2, 3, 0, 1));
-    __m128i b2103 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(2, 1, 0, 3));
-    __m128i a3b0_a1b2 = _mm_mul_epu32(a2301, b3012);
-    __m128i a2b1_a0b3 = _mm_mul_epu32(a3210, b2103);
-    __m128i a3b0 = _mm_unpackhi_epi32(a3b0_a1b2, z);
-    __m128i a1b2 = _mm_unpacklo_epi32(a3b0_a1b2, z);
-    __m128i a2b1 = _mm_unpackhi_epi32(a2b1_a0b3, z);
-    __m128i a0b3 = _mm_unpacklo_epi32(a2b1_a0b3, z);
-    __m128i sum1 = _mm_add_epi64(a3b0, a1b2);
-    sum = _mm_add_epi64(a2b1, a0b3);
-    C[3] = _mm_add_epi64(sum, sum1);
-
-    __m128i	a3b1_a1b3 = _mm_mul_epu32(a2301, b2103);
-    __m128i a2b2 = _mm_unpackhi_epi32(a2b2_a0b0, z);
-    __m128i a3b1 = _mm_unpackhi_epi32(a3b1_a1b3, z);
-    __m128i a1b3 = _mm_unpacklo_epi32(a3b1_a1b3, z);
-    sum = _mm_add_epi64(a2b2, a3b1);
-    C[4] = _mm_add_epi64(sum, a1b3);
-
-    __m128i a1302 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(1, 3, 0, 2));
-    __m128i b1203 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(1, 2, 0, 3));
-    __m128i a3b2_a2b3 = _mm_mul_epu32(a1302, b1203);
-    __m128i a3b2 = _mm_unpackhi_epi32(a3b2_a2b3, z);
-    __m128i a2b3 = _mm_unpacklo_epi32(a3b2_a2b3, z);
-    C[5] = _mm_add_epi64(a3b2, a2b3);
-}
-
-void P4Optimized::Multiply4(word *C, const word *A, const word *B)
-{
-    __m128i temp[7];
-    const word *w = (word *)temp;
-    const __m64 *mw = (__m64 *)w;
-
-    P4_Mul(temp, (__m128i *)A, (__m128i *)B);
-
-    C[0] = w[0];
-
-    __m64 s1, s2;
-
-    __m64 w1 = _mm_cvtsi32_si64(w[1]);
-    __m64 w4 = mw[2];
-    __m64 w6 = mw[3];
-    __m64 w8 = mw[4];
-    __m64 w10 = mw[5];
-    __m64 w12 = mw[6];
-    __m64 w14 = mw[7];
-    __m64 w16 = mw[8];
-    __m64 w18 = mw[9];
-    __m64 w20 = mw[10];
-    __m64 w22 = mw[11];
-    __m64 w26 = _mm_cvtsi32_si64(w[26]);
-
-    s1 = _mm_add_si64(w1, w4);
-    C[1] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    s2 = _mm_add_si64(w6, w8);
-    s1 = _mm_add_si64(s1, s2);
-    C[2] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    s2 = _mm_add_si64(w10, w12);
-    s1 = _mm_add_si64(s1, s2);
-    C[3] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    s2 = _mm_add_si64(w14, w16);
-    s1 = _mm_add_si64(s1, s2);
-    C[4] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    s2 = _mm_add_si64(w18, w20);
-    s1 = _mm_add_si64(s1, s2);
-    C[5] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    s2 = _mm_add_si64(w22, w26);
-    s1 = _mm_add_si64(s1, s2);
-    C[6] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    C[7] = _mm_cvtsi64_si32(s1) + w[27];
-    _mm_empty();
-}
-
-void P4Optimized::Multiply8(word *C, const word *A, const word *B)
-{
-    __m128i temp[28];
-    const word *w = (word *)temp;
-    const __m64 *mw = (__m64 *)w;
-    const word *x = (word *)temp+7*4;
-    const __m64 *mx = (__m64 *)x;
-    const word *y = (word *)temp+7*4*2;
-    const __m64 *my = (__m64 *)y;
-    const word *z = (word *)temp+7*4*3;
-    const __m64 *mz = (__m64 *)z;
-
-    P4_Mul(temp, (__m128i *)A, (__m128i *)B);
-
-    P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
-
-    P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
-
-    P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1);
-
-    C[0] = w[0];
-
-    __m64 s1, s2, s3, s4;
-
-    __m64 w1 = _mm_cvtsi32_si64(w[1]);
-    __m64 w4 = mw[2];
-    __m64 w6 = mw[3];
-    __m64 w8 = mw[4];
-    __m64 w10 = mw[5];
-    __m64 w12 = mw[6];
-    __m64 w14 = mw[7];
-    __m64 w16 = mw[8];
-    __m64 w18 = mw[9];
-    __m64 w20 = mw[10];
-    __m64 w22 = mw[11];
-    __m64 w26 = _mm_cvtsi32_si64(w[26]);
-    __m64 w27 = _mm_cvtsi32_si64(w[27]);
-
-    __m64 x0 = _mm_cvtsi32_si64(x[0]);
-    __m64 x1 = _mm_cvtsi32_si64(x[1]);
-    __m64 x4 = mx[2];
-    __m64 x6 = mx[3];
-    __m64 x8 = mx[4];
-    __m64 x10 = mx[5];
-    __m64 x12 = mx[6];
-    __m64 x14 = mx[7];
-    __m64 x16 = mx[8];
-    __m64 x18 = mx[9];
-    __m64 x20 = mx[10];
-    __m64 x22 = mx[11];
-    __m64 x26 = _mm_cvtsi32_si64(x[26]);
-    __m64 x27 = _mm_cvtsi32_si64(x[27]);
-
-    __m64 y0 = _mm_cvtsi32_si64(y[0]);
-    __m64 y1 = _mm_cvtsi32_si64(y[1]);
-    __m64 y4 = my[2];
-    __m64 y6 = my[3];
-    __m64 y8 = my[4];
-    __m64 y10 = my[5];
-    __m64 y12 = my[6];
-    __m64 y14 = my[7];
-    __m64 y16 = my[8];
-    __m64 y18 = my[9];
-    __m64 y20 = my[10];
-    __m64 y22 = my[11];
-    __m64 y26 = _mm_cvtsi32_si64(y[26]);
-    __m64 y27 = _mm_cvtsi32_si64(y[27]);
-
-    __m64 z0 = _mm_cvtsi32_si64(z[0]);
-    __m64 z1 = _mm_cvtsi32_si64(z[1]);
-    __m64 z4 = mz[2];
-    __m64 z6 = mz[3];
-    __m64 z8 = mz[4];
-    __m64 z10 = mz[5];
-    __m64 z12 = mz[6];
-    __m64 z14 = mz[7];
-    __m64 z16 = mz[8];
-    __m64 z18 = mz[9];
-    __m64 z20 = mz[10];
-    __m64 z22 = mz[11];
-    __m64 z26 = _mm_cvtsi32_si64(z[26]);
-
-    s1 = _mm_add_si64(w1, w4);
-    C[1] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    s2 = _mm_add_si64(w6, w8);
-    s1 = _mm_add_si64(s1, s2);
-    C[2] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    s2 = _mm_add_si64(w10, w12);
-    s1 = _mm_add_si64(s1, s2);
-    C[3] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    s3 = _mm_add_si64(x0, y0);
-    s2 = _mm_add_si64(w14, w16);
-    s1 = _mm_add_si64(s1, s3);
-    s1 = _mm_add_si64(s1, s2);
-    C[4] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    s3 = _mm_add_si64(x1, y1);
-    s4 = _mm_add_si64(x4, y4);
-    s1 = _mm_add_si64(s1, w18);
-    s3 = _mm_add_si64(s3, s4);
-    s1 = _mm_add_si64(s1, w20);
-    s1 = _mm_add_si64(s1, s3);
-    C[5] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    s3 = _mm_add_si64(x6, y6);
-    s4 = _mm_add_si64(x8, y8);
-    s1 = _mm_add_si64(s1, w22);
-    s3 = _mm_add_si64(s3, s4);
-    s1 = _mm_add_si64(s1, w26);
-    s1 = _mm_add_si64(s1, s3);
-    C[6] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    s3 = _mm_add_si64(x10, y10);
-    s4 = _mm_add_si64(x12, y12);
-    s1 = _mm_add_si64(s1, w27);
-    s3 = _mm_add_si64(s3, s4);
-    s1 = _mm_add_si64(s1, s3);
-    C[7] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    s3 = _mm_add_si64(x14, y14);
-    s4 = _mm_add_si64(x16, y16);
-    s1 = _mm_add_si64(s1, z0);
-    s3 = _mm_add_si64(s3, s4);
-    s1 = _mm_add_si64(s1, s3);
-    C[8] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    s3 = _mm_add_si64(x18, y18);
-    s4 = _mm_add_si64(x20, y20);
-    s1 = _mm_add_si64(s1, z1);
-    s3 = _mm_add_si64(s3, s4);
-    s1 = _mm_add_si64(s1, z4);
-    s1 = _mm_add_si64(s1, s3);
-    C[9] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    s3 = _mm_add_si64(x22, y22);
-    s4 = _mm_add_si64(x26, y26);
-    s1 = _mm_add_si64(s1, z6);
-    s3 = _mm_add_si64(s3, s4);
-    s1 = _mm_add_si64(s1, z8);
-    s1 = _mm_add_si64(s1, s3);
-    C[10] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    s3 = _mm_add_si64(x27, y27);
-    s1 = _mm_add_si64(s1, z10);
-    s1 = _mm_add_si64(s1, z12);
-    s1 = _mm_add_si64(s1, s3);
-    C[11] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    s3 = _mm_add_si64(z14, z16);
-    s1 = _mm_add_si64(s1, s3);
-    C[12] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    s3 = _mm_add_si64(z18, z20);
-    s1 = _mm_add_si64(s1, s3);
-    C[13] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    s3 = _mm_add_si64(z22, z26);
-    s1 = _mm_add_si64(s1, s3);
-    C[14] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    C[15] = z[27] + _mm_cvtsi64_si32(s1);
-    _mm_empty();
-}
-
-void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B)
-{
-    __m128i temp[21];
-    const word *w = (word *)temp;
-    const __m64 *mw = (__m64 *)w;
-    const word *x = (word *)temp+7*4;
-    const __m64 *mx = (__m64 *)x;
-    const word *y = (word *)temp+7*4*2;
-    const __m64 *my = (__m64 *)y;
-
-    P4_Mul(temp, (__m128i *)A, (__m128i *)B);
-
-    P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
-
-    P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
-
-    C[0] = w[0];
-
-    __m64 s1, s2, s3, s4;
-
-    __m64 w1 = _mm_cvtsi32_si64(w[1]);
-    __m64 w4 = mw[2];
-    __m64 w6 = mw[3];
-    __m64 w8 = mw[4];
-    __m64 w10 = mw[5];
-    __m64 w12 = mw[6];
-    __m64 w14 = mw[7];
-    __m64 w16 = mw[8];
-    __m64 w18 = mw[9];
-    __m64 w20 = mw[10];
-    __m64 w22 = mw[11];
-    __m64 w26 = _mm_cvtsi32_si64(w[26]);
-
-    __m64 x0 = _mm_cvtsi32_si64(x[0]);
-    __m64 x1 = _mm_cvtsi32_si64(x[1]);
-    __m64 x4 = mx[2];
-    __m64 x6 = mx[3];
-    __m64 x8 = mx[4];
-
-    __m64 y0 = _mm_cvtsi32_si64(y[0]);
-    __m64 y1 = _mm_cvtsi32_si64(y[1]);
-    __m64 y4 = my[2];
-    __m64 y6 = my[3];
-    __m64 y8 = my[4];
-
-    s1 = _mm_add_si64(w1, w4);
-    C[1] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    s2 = _mm_add_si64(w6, w8);
-    s1 = _mm_add_si64(s1, s2);
-    C[2] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    s2 = _mm_add_si64(w10, w12);
-    s1 = _mm_add_si64(s1, s2);
-    C[3] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    s3 = _mm_add_si64(x0, y0);
-    s2 = _mm_add_si64(w14, w16);
-    s1 = _mm_add_si64(s1, s3);
-    s1 = _mm_add_si64(s1, s2);
-    C[4] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    s3 = _mm_add_si64(x1, y1);
-    s4 = _mm_add_si64(x4, y4);
-    s1 = _mm_add_si64(s1, w18);
-    s3 = _mm_add_si64(s3, s4);
-    s1 = _mm_add_si64(s1, w20);
-    s1 = _mm_add_si64(s1, s3);
-    C[5] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    s3 = _mm_add_si64(x6, y6);
-    s4 = _mm_add_si64(x8, y8);
-    s1 = _mm_add_si64(s1, w22);
-    s3 = _mm_add_si64(s3, s4);
-    s1 = _mm_add_si64(s1, w26);
-    s1 = _mm_add_si64(s1, s3);
-    C[6] = _mm_cvtsi64_si32(s1);
-    s1 = _mm_srli_si64(s1, 32);
-
-    C[7] = _mm_cvtsi64_si32(s1) + w[27] + x[10] + y[10] + x[12] + y[12];
-    _mm_empty();
-}
-
-#endif	// #ifdef SSE2_INTRINSICS_AVAILABLE
-
-// end optimized
-
-// ********************************************************
-
-#define A0      A
-#define A1      (A+N2)
-#define B0      B
-#define B1      (B+N2)
-
-#define T0      T
-#define T1      (T+N2)
-#define T2      (T+N)
-#define T3      (T+N+N2)
-
-#define R0      R
-#define R1      (R+N2)
-#define R2      (R+N)
-#define R3      (R+N+N2)
-
-//VC60 workaround: compiler bug triggered without the extra dummy parameters
-
-// R[2*N] - result = A*B
-// T[2*N] - temporary work space
-// A[N] --- multiplier
-// B[N] --- multiplicant
-
-
-void RecursiveMultiply(word *R, word *T, const word *A, const word *B,
-                       unsigned int N)
-{
-    if (LowLevel::MultiplyRecursionLimit() >= 8 && N==8)
-        LowLevel::Multiply8(R, A, B);
-    else if (LowLevel::MultiplyRecursionLimit() >= 4 && N==4)
-        LowLevel::Multiply4(R, A, B);
-    else if (N==2)
-        LowLevel::Multiply2(R, A, B);
-    else
-    {
-        const unsigned int N2 = N/2;
-        int carry;
-
-        int aComp = Compare(A0, A1, N2);
-        int bComp = Compare(B0, B1, N2);
-
-        switch (2*aComp + aComp + bComp)
-        {
-        case -4:
-            LowLevel::Subtract(R0, A1, A0, N2);
-            LowLevel::Subtract(R1, B0, B1, N2);
-            RecursiveMultiply(T0, T2, R0, R1, N2);
-            LowLevel::Subtract(T1, T1, R0, N2);
-            carry = -1;
-            break;
-        case -2:
-            LowLevel::Subtract(R0, A1, A0, N2);
-            LowLevel::Subtract(R1, B0, B1, N2);
-            RecursiveMultiply(T0, T2, R0, R1, N2);
-            carry = 0;
-            break;
-        case 2:
-            LowLevel::Subtract(R0, A0, A1, N2);
-            LowLevel::Subtract(R1, B1, B0, N2);
-            RecursiveMultiply(T0, T2, R0, R1, N2);
-            carry = 0;
-            break;
-        case 4:
-            LowLevel::Subtract(R0, A1, A0, N2);
-            LowLevel::Subtract(R1, B0, B1, N2);
-            RecursiveMultiply(T0, T2, R0, R1, N2);
-            LowLevel::Subtract(T1, T1, R1, N2);
-            carry = -1;
-            break;
-        default:
-            SetWords(T0, 0, N);
-            carry = 0;
-        }
-
-        RecursiveMultiply(R0, T2, A0, B0, N2);
-        RecursiveMultiply(R2, T2, A1, B1, N2);
-
-        // now T[01] holds (A1-A0)*(B0-B1),R[01] holds A0*B0, R[23] holds A1*B1
-
-        carry += LowLevel::Add(T0, T0, R0, N);
-        carry += LowLevel::Add(T0, T0, R2, N);
-        carry += LowLevel::Add(R1, R1, T0, N);
-
-        Increment(R3, N2, carry);
-    }
-}
-
-
-void RecursiveSquare(word *R, word *T, const word *A, unsigned int N)                     
-{
-    if (LowLevel::SquareRecursionLimit() >= 4 && N==4)
-        LowLevel::Square4(R, A);
-    else if (N==2)
-        LowLevel::Square2(R, A);
-    else
-    {
-        const unsigned int N2 = N/2;
-
-        RecursiveSquare(R0, T2, A0, N2);
-        RecursiveSquare(R2, T2, A1, N2);
-        RecursiveMultiply(T0, T2, A0, A1, N2);
-
-        word carry = LowLevel::Add(R1, R1, T0, N);
-        carry += LowLevel::Add(R1, R1, T0, N);
-        Increment(R3, N2, carry);
-    }
-}
-
-
-// R[N] - bottom half of A*B
-// T[N] - temporary work space
-// A[N] - multiplier
-// B[N] - multiplicant
-
-
-void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B,
-                             unsigned int N)
-{
-    if (LowLevel::MultiplyBottomRecursionLimit() >= 8 && N==8)
-        LowLevel::Multiply8Bottom(R, A, B);
-    else if (LowLevel::MultiplyBottomRecursionLimit() >= 4 && N==4)
-        LowLevel::Multiply4Bottom(R, A, B);
-    else if (N==2)
-        LowLevel::Multiply2Bottom(R, A, B);
-    else
-    {
-        const unsigned int N2 = N/2;
-
-        RecursiveMultiply(R, T, A0, B0, N2);
-        RecursiveMultiplyBottom(T0, T1, A1, B0, N2);
-        LowLevel::Add(R1, R1, T0, N2);
-        RecursiveMultiplyBottom(T0, T1, A0, B1, N2);
-        LowLevel::Add(R1, R1, T0, N2);
-    }
-}
-
-
-void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A,
-                          const word *B, unsigned int N)
-{
-    if (N==4)
-    {
-        LowLevel::Multiply4(T, A, B);
-        memcpy(R, T+4, 4*WORD_SIZE);
-    }
-    else if (N==2)
-    {
-        LowLevel::Multiply2(T, A, B);
-        memcpy(R, T+2, 2*WORD_SIZE);
-    }
-    else
-    {
-        const unsigned int N2 = N/2;
-        int carry;
-
-        int aComp = Compare(A0, A1, N2);
-        int bComp = Compare(B0, B1, N2);
-
-        switch (2*aComp + aComp + bComp)
-        {
-        case -4:
-            LowLevel::Subtract(R0, A1, A0, N2);
-            LowLevel::Subtract(R1, B0, B1, N2);
-            RecursiveMultiply(T0, T2, R0, R1, N2);
-            LowLevel::Subtract(T1, T1, R0, N2);
-            carry = -1;
-            break;
-        case -2:
-            LowLevel::Subtract(R0, A1, A0, N2);
-            LowLevel::Subtract(R1, B0, B1, N2);
-            RecursiveMultiply(T0, T2, R0, R1, N2);
-            carry = 0;
-            break;
-        case 2:
-            LowLevel::Subtract(R0, A0, A1, N2);
-            LowLevel::Subtract(R1, B1, B0, N2);
-            RecursiveMultiply(T0, T2, R0, R1, N2);
-            carry = 0;
-            break;
-        case 4:
-            LowLevel::Subtract(R0, A1, A0, N2);
-            LowLevel::Subtract(R1, B0, B1, N2);
-            RecursiveMultiply(T0, T2, R0, R1, N2);
-            LowLevel::Subtract(T1, T1, R1, N2);
-            carry = -1;
-            break;
-        default:
-            SetWords(T0, 0, N);
-            carry = 0;
-        }
-
-        RecursiveMultiply(T2, R0, A1, B1, N2);
-
-        // now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1
-
-        word c2 = LowLevel::Subtract(R0, L+N2, L, N2);
-        c2 += LowLevel::Subtract(R0, R0, T0, N2);
-        word t = (Compare(R0, T2, N2) == -1);
-
-        carry += t;
-        carry += Increment(R0, N2, c2+t);
-        carry += LowLevel::Add(R0, R0, T1, N2);
-        carry += LowLevel::Add(R0, R0, T3, N2);
-
-        CopyWords(R1, T3, N2);
-        Increment(R1, N2, carry);
-    }
-}
-
-
-inline word Add(word *C, const word *A, const word *B, unsigned int N)
-{
-    return LowLevel::Add(C, A, B, N);
-}
-
-inline word Subtract(word *C, const word *A, const word *B, unsigned int N)
-{
-    return LowLevel::Subtract(C, A, B, N);
-}
-
-inline void Multiply(word *R, word *T, const word *A, const word *B,
-                     unsigned int N)
-{
-    RecursiveMultiply(R, T, A, B, N);
-}
-
-inline void Square(word *R, word *T, const word *A, unsigned int N)
-{
-    RecursiveSquare(R, T, A, N);
-}
-
-
-void AsymmetricMultiply(word *R, word *T, const word *A, unsigned int NA,
-                        const word *B, unsigned int NB)
-{
-    if (NA == NB)
-    {
-        if (A == B)
-            Square(R, T, A, NA);
-        else
-            Multiply(R, T, A, B, NA);
-
-        return;
-    }
-
-    if (NA > NB)
-    {
-        STL::swap(A, B);
-        STL::swap(NA, NB);
-    }
-
-    if (NA==2 && !A[1])
-    {
-        switch (A[0])
-        {
-        case 0:
-            SetWords(R, 0, NB+2);
-            return;
-        case 1:
-            CopyWords(R, B, NB);
-            R[NB] = R[NB+1] = 0;
-            return;
-        default:
-            R[NB] = LinearMultiply(R, B, A[0], NB);
-            R[NB+1] = 0;
-            return;
-        }
-    }
-
-    Multiply(R, T, A, B, NA);
-    CopyWords(T+2*NA, R+NA, NA);
-
-    unsigned i;
-
-    for (i=2*NA; i<NB; i+=2*NA)
-        Multiply(T+NA+i, T, A, B+i, NA);
-    for (i=NA; i<NB; i+=2*NA)
-        Multiply(R+i, T, A, B+i, NA);
-
-    if (Add(R+NA, R+NA, T+2*NA, NB-NA))
-        Increment(R+NB, NA);
-}
-
-
-void PositiveMultiply(Integer& product, const Integer& a, const Integer& b)
-{
-    unsigned int aSize = RoundupSize(a.WordCount());
-    unsigned int bSize = RoundupSize(b.WordCount());
-
-    product.reg_.CleanNew(RoundupSize(aSize + bSize));
-    product.sign_ = Integer::POSITIVE;
-
-    AlignedWordBlock workspace(aSize + bSize);
-    AsymmetricMultiply(product.reg_.get_buffer(), workspace.get_buffer(),
-                       a.reg_.get_buffer(), aSize, b.reg_.get_buffer(), bSize);
-}
-
-void Multiply(Integer &product, const Integer &a, const Integer &b)
-{
-    PositiveMultiply(product, a, b);
-
-    if (a.NotNegative() != b.NotNegative())
-        product.Negate();
-}
-
-
-static inline unsigned int EvenWordCount(const word *X, unsigned int N)
-{
-    while (N && X[N-2]==0 && X[N-1]==0)
-        N-=2;
-    return N;
-}
-
-
-unsigned int AlmostInverse(word *R, word *T, const word *A, unsigned int NA,
-                           const word *M, unsigned int N)
-{
-    word *b = T;
-    word *c = T+N;
-    word *f = T+2*N;
-    word *g = T+3*N;
-    unsigned int bcLen=2, fgLen=EvenWordCount(M, N);
-    unsigned int k=0, s=0;
-
-    SetWords(T, 0, 3*N);
-    b[0]=1;
-    CopyWords(f, A, NA);
-    CopyWords(g, M, N);
-
-    while (1)
-    {
-        word t=f[0];
-        while (!t)
-        {
-            if (EvenWordCount(f, fgLen)==0)
-            {
-                SetWords(R, 0, N);
-                return 0;
-            }
-
-            ShiftWordsRightByWords(f, fgLen, 1);
-            if (c[bcLen-1]) bcLen+=2;
-            ShiftWordsLeftByWords(c, bcLen, 1);
-            k+=WORD_BITS;
-            t=f[0];
-        }
-
-        unsigned int i=0;
-        while (t%2 == 0)
-        {
-            t>>=1;
-            i++;
-        }
-        k+=i;
-
-        if (t==1 && f[1]==0 && EvenWordCount(f, fgLen)==2)
-        {
-            if (s%2==0)
-                CopyWords(R, b, N);
-            else
-                Subtract(R, M, b, N);
-            return k;
-        }
-
-        ShiftWordsRightByBits(f, fgLen, i);
-        t=ShiftWordsLeftByBits(c, bcLen, i);
-        if (t)
-        {
-            c[bcLen] = t;
-            bcLen+=2;
-        }
-
-        if (f[fgLen-2]==0 && g[fgLen-2]==0 && f[fgLen-1]==0 && g[fgLen-1]==0)
-            fgLen-=2;
-
-        if (Compare(f, g, fgLen)==-1)
-        {
-            STL::swap(f, g);
-            STL::swap(b, c);
-            s++;
-        }
-
-        Subtract(f, f, g, fgLen);
-
-        if (Add(b, b, c, bcLen))
-        {
-            b[bcLen] = 1;
-            bcLen+=2;
-        }
-    }
-}
-
-// R[N] - result = A/(2^k) mod M
-// A[N] - input
-// M[N] - modulus
-
-void DivideByPower2Mod(word *R, const word *A, unsigned int k, const word *M,
-                       unsigned int N)
-{
-    CopyWords(R, A, N);
-
-    while (k--)
-    {
-        if (R[0]%2==0)
-            ShiftWordsRightByBits(R, N, 1);
-        else
-        {
-            word carry = Add(R, R, M, N);
-            ShiftWordsRightByBits(R, N, 1);
-            R[N-1] += carry<<(WORD_BITS-1);
-        }
-    }
-}
-
-// R[N] - result = A*(2^k) mod M
-// A[N] - input
-// M[N] - modulus
-
-void MultiplyByPower2Mod(word *R, const word *A, unsigned int k, const word *M,
-                         unsigned int N)
-{
-    CopyWords(R, A, N);
-
-    while (k--)
-        if (ShiftWordsLeftByBits(R, N, 1) || Compare(R, M, N)>=0)
-            Subtract(R, R, M, N);
-}
-
-
-// ********** end of integer needs
-
-
-Integer::Integer()
-    : reg_(2), sign_(POSITIVE)
-{
-    reg_[0] = reg_[1] = 0;
-}
-
-
-Integer::Integer(const Integer& t)
-    : reg_(RoundupSize(t.WordCount())), sign_(t.sign_)
-{
-    CopyWords(reg_.get_buffer(), t.reg_.get_buffer(), reg_.size());
-}
-
-
-Integer::Integer(signed long value)
-    : reg_(2)
-{
-    if (value >= 0)
-        sign_ = POSITIVE;
-    else
-    {
-        sign_ = NEGATIVE;
-        value = -value;
-    }
-    reg_[0] = word(value);
-    reg_[1] = word(SafeRightShift<WORD_BITS, unsigned long>(value));
-}
-
-
-Integer::Integer(Sign s, word high, word low)
-    : reg_(2), sign_(s)
-{
-    reg_[0] = low;
-    reg_[1] = high;
-}
-
-
-Integer::Integer(word value, unsigned int length)
-    : reg_(RoundupSize(length)), sign_(POSITIVE)
-{
-    reg_[0] = value;
-    SetWords(reg_ + 1, 0, reg_.size() - 1);
-}
-
-
-Integer::Integer(const byte *encodedInteger, unsigned int byteCount,
-                 Signedness s)
-{
-    Decode(encodedInteger, byteCount, s);
-}
-
-class BadBER {};
-
-// BER Decode Source
-Integer::Integer(Source& source)
-    : reg_(2), sign_(POSITIVE)
-{
-    Decode(source);
-}
-
-void Integer::Decode(Source& source)
-{
-    byte b = source.next();
-    if (b != INTEGER) {
-        source.SetError(INTEGER_E);
-        return;
-    }
-
-    word32 length = GetLength(source);
-    if (length == 0 || source.GetError().What()) return;
-
-    if ( (b = source.next()) == 0x00)
-        length--;
-    else
-        source.prev();
-
-    if (source.IsLeft(length) == false) return;
- 
-    unsigned int words = (length + WORD_SIZE - 1) / WORD_SIZE;
-    words = RoundupSize(words);
-    if (words > reg_.size()) reg_.CleanNew(words);
-
-    for (int j = length; j > 0; j--) {
-        b = source.next();
-        reg_ [(j-1) / WORD_SIZE] |= (word)b << ((j-1) % WORD_SIZE) * 8;
-    }
-}
-
-
-void Integer::Decode(const byte* input, unsigned int inputLen, Signedness s)
-{
-    unsigned int idx(0);
-    byte b = 0; 
-    if (inputLen>0)
-        b = input[idx];   // peek
-    sign_  = ((s==SIGNED) && (b & 0x80)) ? NEGATIVE : POSITIVE;
-
-    while (inputLen>0 && (sign_==POSITIVE ? b==0 : b==0xff))
-    {
-        idx++;   // skip
-        if (--inputLen>0)
-            b = input[idx];  // peek
-    }
-
-    reg_.CleanNew(RoundupSize(BytesToWords(inputLen)));
-
-    for (unsigned int i=inputLen; i > 0; i--)
-    {
-        b = input[idx++];
-        reg_[(i-1)/WORD_SIZE] |= (word)b << ((i-1)%WORD_SIZE)*8;
-    }
-
-    if (sign_ == NEGATIVE)
-    {
-        for (unsigned i=inputLen; i<reg_.size()*WORD_SIZE; i++)
-            reg_[i/WORD_SIZE] |= (word)0xff << (i%WORD_SIZE)*8;
-        TwosComplement(reg_.get_buffer(), reg_.size());
-    }
-}
-
-
-unsigned int Integer::Encode(byte* output, unsigned int outputLen,
-                       Signedness signedness) const
-{
-    unsigned int idx(0);
-    if (signedness == UNSIGNED || NotNegative())
-    {
-        for (unsigned int i=outputLen; i > 0; i--)
-            output[idx++] = GetByte(i-1);
-    }
-    else
-    {
-        // take two's complement of *this
-        Integer temp = Integer::Power2(8*max(ByteCount(), outputLen)) + *this;
-        for (unsigned i=0; i<outputLen; i++)
-            output[idx++] = temp.GetByte(outputLen-i-1);
-    }
-    return outputLen;
-}
-
-
-static Integer* zero = 0;
-
-const Integer &Integer::Zero()
-{
-    if (!zero)
-        zero = NEW_TC Integer;
-    return *zero;
-}
-
-
-static Integer* one = 0;
-
-const Integer &Integer::One()
-{
-    if (!one)
-        one = NEW_TC Integer(1,2);
-    return *one;
-}
-
-
-// Clean up static singleton holders, not a leak, but helpful to have gone
-// when checking for leaks
-void CleanUp()
-{
-    tcDelete(one);
-    tcDelete(zero);
-
-    // In case user calls more than once, prevent seg fault
-    one  = 0;
-    zero = 0;
-}
-
-Integer::Integer(RandomNumberGenerator& rng, const Integer& min,
-                 const Integer& max)
-{
-    Randomize(rng, min, max);
-}
-
-
-void Integer::Randomize(RandomNumberGenerator& rng, unsigned int nbits)
-{
-    const unsigned int nbytes = nbits/8 + 1;
-    ByteBlock buf(nbytes);
-    rng.GenerateBlock(buf.get_buffer(), nbytes);
-    if (nbytes)
-        buf[0] = (byte)Crop(buf[0], nbits % 8);
-    Decode(buf.get_buffer(), nbytes, UNSIGNED);
-}
-
-void Integer::Randomize(RandomNumberGenerator& rng, const Integer& min,
-                        const Integer& max)
-{
-    Integer range = max - min;
-    const unsigned int nbits = range.BitCount();
-
-    do
-    {
-        Randomize(rng, nbits);
-    }
-    while (*this > range);
-
-    *this += min;
-}
-
-
-Integer Integer::Power2(unsigned int e)
-{
-    Integer r((word)0, BitsToWords(e + 1));
-    r.SetBit(e);
-    return r;
-}
-
-
-void Integer::SetBit(unsigned int n, bool value)
-{
-    if (value)
-    {
-        reg_.CleanGrow(RoundupSize(BitsToWords(n + 1)));
-        reg_[n / WORD_BITS] |= (word(1) << (n % WORD_BITS));
-    }
-    else
-    {
-        if (n / WORD_BITS < reg_.size())
-            reg_[n / WORD_BITS] &= ~(word(1) << (n % WORD_BITS));
-    }
-}
-
-
-void Integer::SetByte(unsigned int n, byte value)
-{
-    reg_.CleanGrow(RoundupSize(BytesToWords(n+1)));
-    reg_[n/WORD_SIZE] &= ~(word(0xff) << 8*(n%WORD_SIZE));
-    reg_[n/WORD_SIZE] |= (word(value) << 8*(n%WORD_SIZE));
-}
-
-
-void Integer::Negate()
-{
-    if (!!(*this))	// don't flip sign if *this==0
-        sign_ = Sign(1 - sign_);
-}
-
-
-bool Integer::operator!() const
-{
-    return IsNegative() ? false : (reg_[0]==0 && WordCount()==0);
-}
-
-
-Integer& Integer::operator=(const Integer& t)
-{
-    if (this != &t)
-    {
-        reg_.New(RoundupSize(t.WordCount()));
-        CopyWords(reg_.get_buffer(), t.reg_.get_buffer(), reg_.size());
-        sign_ = t.sign_;
-    }
-    return *this;
-}
-
-
-Integer& Integer::operator+=(const Integer& t)
-{
-    reg_.CleanGrow(t.reg_.size());
-    if (NotNegative())
-    {
-        if (t.NotNegative())
-            PositiveAdd(*this, *this, t);
-        else
-            PositiveSubtract(*this, *this, t);
-    }
-    else
-    {
-        if (t.NotNegative())
-            PositiveSubtract(*this, t, *this);
-        else
-        {
-            PositiveAdd(*this, *this, t);
-            sign_ = Integer::NEGATIVE;
-        }
-    }
-    return *this;
-}
-
-
-Integer Integer::operator-() const
-{
-    Integer result(*this);
-    result.Negate();
-    return result;
-}
-
-
-Integer& Integer::operator-=(const Integer& t)
-{
-    reg_.CleanGrow(t.reg_.size());
-    if (NotNegative())
-    {
-        if (t.NotNegative())
-            PositiveSubtract(*this, *this, t);
-        else
-            PositiveAdd(*this, *this, t);
-    }
-    else
-    {
-        if (t.NotNegative())
-        {
-            PositiveAdd(*this, *this, t);
-            sign_ = Integer::NEGATIVE;
-        }
-        else
-            PositiveSubtract(*this, t, *this);
-    }
-    return *this;
-}
-
-
-Integer& Integer::operator++()
-{
-    if (NotNegative())
-    {
-        if (Increment(reg_.get_buffer(), reg_.size()))
-        {
-            reg_.CleanGrow(2*reg_.size());
-            reg_[reg_.size()/2]=1;
-        }
-    }
-    else
-    {
-        word borrow = Decrement(reg_.get_buffer(), reg_.size());
-        (void)borrow;           // shut up compiler
-        if (WordCount()==0)
-            *this = Zero();
-    }
-    return *this;
-}
-
-Integer& Integer::operator--()
-{
-    if (IsNegative())
-    {
-        if (Increment(reg_.get_buffer(), reg_.size()))
-        {
-            reg_.CleanGrow(2*reg_.size());
-            reg_[reg_.size()/2]=1;
-        }
-    }
-    else
-    {
-        if (Decrement(reg_.get_buffer(), reg_.size()))
-            *this = -One();
-    }
-    return *this;
-}
-
-
-Integer& Integer::operator<<=(unsigned int n)
-{
-    const unsigned int wordCount = WordCount();
-    const unsigned int shiftWords = n / WORD_BITS;
-    const unsigned int shiftBits = n % WORD_BITS;
-
-    reg_.CleanGrow(RoundupSize(wordCount+BitsToWords(n)));
-    ShiftWordsLeftByWords(reg_.get_buffer(), wordCount + shiftWords,
-                          shiftWords);
-    ShiftWordsLeftByBits(reg_+shiftWords, wordCount+BitsToWords(shiftBits),
-                         shiftBits);
-    return *this;
-}
-
-Integer& Integer::operator>>=(unsigned int n)
-{
-    const unsigned int wordCount = WordCount();
-    const unsigned int shiftWords = n / WORD_BITS;
-    const unsigned int shiftBits = n % WORD_BITS;
-
-    ShiftWordsRightByWords(reg_.get_buffer(), wordCount, shiftWords);
-    if (wordCount > shiftWords)
-        ShiftWordsRightByBits(reg_.get_buffer(), wordCount-shiftWords,
-                              shiftBits);
-    if (IsNegative() && WordCount()==0)   // avoid -0
-        *this = Zero();
-    return *this;
-}
-
-
-void PositiveAdd(Integer& sum, const Integer& a, const Integer& b)
-{
-    word carry;
-    if (a.reg_.size() == b.reg_.size())
-        carry = Add(sum.reg_.get_buffer(), a.reg_.get_buffer(),
-                    b.reg_.get_buffer(), a.reg_.size());
-    else if (a.reg_.size() > b.reg_.size())
-    {
-        carry = Add(sum.reg_.get_buffer(), a.reg_.get_buffer(),
-                    b.reg_.get_buffer(), b.reg_.size());
-        CopyWords(sum.reg_+b.reg_.size(), a.reg_+b.reg_.size(),
-                  a.reg_.size()-b.reg_.size());
-        carry = Increment(sum.reg_+b.reg_.size(), a.reg_.size()-b.reg_.size(),
-                          carry);
-    }
-    else
-    {
-        carry = Add(sum.reg_.get_buffer(), a.reg_.get_buffer(),
-                    b.reg_.get_buffer(), a.reg_.size());
-        CopyWords(sum.reg_+a.reg_.size(), b.reg_+a.reg_.size(),
-                  b.reg_.size()-a.reg_.size());
-        carry = Increment(sum.reg_+a.reg_.size(), b.reg_.size()-a.reg_.size(),
-                          carry);
-    }
-
-    if (carry)
-    {
-        sum.reg_.CleanGrow(2*sum.reg_.size());
-        sum.reg_[sum.reg_.size()/2] = 1;
-    }
-    sum.sign_ = Integer::POSITIVE;
-}
-
-void PositiveSubtract(Integer &diff, const Integer &a, const Integer& b)
-{
-    unsigned aSize = a.WordCount();
-    aSize += aSize%2;
-    unsigned bSize = b.WordCount();
-    bSize += bSize%2;
-
-    if (aSize == bSize)
-    {
-        if (Compare(a.reg_.get_buffer(), b.reg_.get_buffer(), aSize) >= 0)
-        {
-            Subtract(diff.reg_.get_buffer(), a.reg_.get_buffer(),
-                     b.reg_.get_buffer(), aSize);
-            diff.sign_ = Integer::POSITIVE;
-        }
-        else
-        {
-            Subtract(diff.reg_.get_buffer(), b.reg_.get_buffer(),
-                     a.reg_.get_buffer(), aSize);
-            diff.sign_ = Integer::NEGATIVE;
-        }
-    }
-    else if (aSize > bSize)
-    {
-        word borrow = Subtract(diff.reg_.get_buffer(), a.reg_.get_buffer(),
-                               b.reg_.get_buffer(), bSize);
-        CopyWords(diff.reg_+bSize, a.reg_+bSize, aSize-bSize);
-        borrow = Decrement(diff.reg_+bSize, aSize-bSize, borrow);
-        diff.sign_ = Integer::POSITIVE;
-    }
-    else
-    {
-        word borrow = Subtract(diff.reg_.get_buffer(), b.reg_.get_buffer(),
-                               a.reg_.get_buffer(), aSize);
-        CopyWords(diff.reg_+aSize, b.reg_+aSize, bSize-aSize);
-        borrow = Decrement(diff.reg_+aSize, bSize-aSize, borrow);
-        diff.sign_ = Integer::NEGATIVE;
-    }
-}
-
-
-unsigned int Integer::MinEncodedSize(Signedness signedness) const
-{
-    unsigned int outputLen = max(1U, ByteCount());
-    if (signedness == UNSIGNED)
-        return outputLen;
-    if (NotNegative() && (GetByte(outputLen-1) & 0x80))
-        outputLen++;
-    if (IsNegative() && *this < -Power2(outputLen*8-1))
-        outputLen++;
-    return outputLen;
-}
-
-
-int Integer::Compare(const Integer& t) const
-{
-    if (NotNegative())
-    {
-        if (t.NotNegative())
-            return PositiveCompare(t);
-        else
-            return 1;
-    }
-    else
-    {
-        if (t.NotNegative())
-            return -1;
-        else
-            return -PositiveCompare(t);
-    }
-}
-
-
-int Integer::PositiveCompare(const Integer& t) const
-{
-    unsigned size = WordCount(), tSize = t.WordCount();
-
-    if (size == tSize)
-        return TaoCrypt::Compare(reg_.get_buffer(), t.reg_.get_buffer(), size);
-    else
-        return size > tSize ? 1 : -1;
-}
-
-
-bool Integer::GetBit(unsigned int n) const
-{
-    if (n/WORD_BITS >= reg_.size())
-        return 0;
-    else
-        return bool((reg_[n/WORD_BITS] >> (n % WORD_BITS)) & 1);
-}
-
-
-unsigned long Integer::GetBits(unsigned int i, unsigned int n) const
-{
-    unsigned long v = 0;
-    for (unsigned int j=0; j<n; j++)
-        v |= GetBit(i+j) << j;
-    return v;
-}
-
-
-byte Integer::GetByte(unsigned int n) const
-{
-    if (n/WORD_SIZE >= reg_.size())
-        return 0;
-    else
-        return byte(reg_[n/WORD_SIZE] >> ((n%WORD_SIZE)*8));
-}
-
-
-unsigned int Integer::BitCount() const
-{
-    unsigned wordCount = WordCount();
-    if (wordCount)
-        return (wordCount-1)*WORD_BITS + BitPrecision(reg_[wordCount-1]);
-    else
-        return 0;
-}
-
-
-unsigned int Integer::ByteCount() const
-{
-    unsigned wordCount = WordCount();
-    if (wordCount)
-        return (wordCount-1)*WORD_SIZE + BytePrecision(reg_[wordCount-1]);
-    else
-        return 0;
-}
-
-
-unsigned int Integer::WordCount() const
-{
-    return CountWords(reg_.get_buffer(), reg_.size());
-}
-
-
-bool Integer::IsConvertableToLong() const
-{
-    if (ByteCount() > sizeof(long))
-        return false;
-
-    unsigned long value = reg_[0];
-    value += SafeLeftShift<WORD_BITS, unsigned long>(reg_[1]);
-
-    if (sign_ == POSITIVE)
-        return (signed long)value >= 0;
-    else
-        return -(signed long)value < 0;
-}
-
-
-signed long Integer::ConvertToLong() const
-{
-    unsigned long value = reg_[0];
-    value += SafeLeftShift<WORD_BITS, unsigned long>(reg_[1]);
-    return sign_ == POSITIVE ? value : -(signed long)value;
-}
-
-
-void Integer::Swap(Integer& a)
-{
-    reg_.Swap(a.reg_);
-    STL::swap(sign_, a.sign_);
-}
-
-
-Integer Integer::Plus(const Integer& b) const
-{
-    Integer sum((word)0, max(reg_.size(), b.reg_.size()));
-    if (NotNegative())
-    {
-        if (b.NotNegative())
-            PositiveAdd(sum, *this, b);
-        else
-            PositiveSubtract(sum, *this, b);
-    }
-    else
-    {
-        if (b.NotNegative())
-            PositiveSubtract(sum, b, *this);
-        else
-        {
-            PositiveAdd(sum, *this, b);
-            sum.sign_ = Integer::NEGATIVE;
-        }
-    }
-    return sum;
-}
-
-
-Integer Integer::Minus(const Integer& b) const
-{
-    Integer diff((word)0, max(reg_.size(), b.reg_.size()));
-    if (NotNegative())
-    {
-        if (b.NotNegative())
-            PositiveSubtract(diff, *this, b);
-        else
-            PositiveAdd(diff, *this, b);
-    }
-    else
-    {
-        if (b.NotNegative())
-        {
-            PositiveAdd(diff, *this, b);
-            diff.sign_ = Integer::NEGATIVE;
-        }
-        else
-            PositiveSubtract(diff, b, *this);
-    }
-    return diff;
-}
-
-
-Integer Integer::Times(const Integer &b) const
-{
-    Integer product;
-    Multiply(product, *this, b);
-    return product;
-}
-
-
-#undef A0
-#undef A1
-#undef B0
-#undef B1
-
-#undef T0
-#undef T1
-#undef T2
-#undef T3
-
-#undef R0
-#undef R1
-#undef R2
-#undef R3
-
-
-static inline void AtomicDivide(word *Q, const word *A, const word *B)
-{
-    word T[4];
-    DWord q = DivideFourWordsByTwo<word, DWord>(T, DWord(A[0], A[1]),
-                                         DWord(A[2], A[3]), DWord(B[0], B[1]));
-    Q[0] = q.GetLowHalf();
-    Q[1] = q.GetHighHalf();
-
-#ifndef NDEBUG
-    if (B[0] || B[1])
-    {
-        // multiply quotient and divisor and add remainder, make sure it 
-        // equals dividend
-        word P[4];
-        Portable::Multiply2(P, Q, B);
-        Add(P, P, T, 4);
-    }
-#endif
-}
-
-
-// for use by Divide(), corrects the underestimated quotient {Q1,Q0}
-static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B,
-                                    unsigned int N)
-{
-    if (Q[1])
-    {
-        T[N] = T[N+1] = 0;
-        unsigned i;
-        for (i=0; i<N; i+=4)
-            LowLevel::Multiply2(T+i, Q, B+i);
-        for (i=2; i<N; i+=4)
-            if (LowLevel::Multiply2Add(T+i, Q, B+i))
-                T[i+5] += (++T[i+4]==0);
-    }
-    else
-    {
-        T[N] = LinearMultiply(T, B, Q[0], N);
-        T[N+1] = 0;
-    }
-
-    word borrow = Subtract(R, R, T, N+2);
-    (void)borrow;       // shut up compiler
-
-    while (R[N] || Compare(R, B, N) >= 0)
-    {
-        R[N] -= Subtract(R, R, B, N);
-        Q[1] += (++Q[0]==0);
-    }
-}
-
-// R[NB] -------- remainder = A%B
-// Q[NA-NB+2] --- quotient	= A/B
-// T[NA+2*NB+4] - temp work space
-// A[NA] -------- dividend
-// B[NB] -------- divisor
-
-
-void Divide(word* R, word* Q, word* T, const word* A, unsigned int NA,
-            const word* B, unsigned int NB)
-{
-    // set up temporary work space
-    word *const TA=T;
-    word *const TB=T+NA+2;
-    word *const TP=T+NA+2+NB;
-
-    // copy B into TB and normalize it so that TB has highest bit set to 1
-    unsigned shiftWords = (B[NB-1]==0);
-    TB[0] = TB[NB-1] = 0;
-    CopyWords(TB+shiftWords, B, NB-shiftWords);
-    unsigned shiftBits = WORD_BITS - BitPrecision(TB[NB-1]);
-    ShiftWordsLeftByBits(TB, NB, shiftBits);
-
-    // copy A into TA and normalize it
-    TA[0] = TA[NA] = TA[NA+1] = 0;
-    CopyWords(TA+shiftWords, A, NA);
-    ShiftWordsLeftByBits(TA, NA+2, shiftBits);
-
-    if (TA[NA+1]==0 && TA[NA] <= 1)
-    {
-        Q[NA-NB+1] = Q[NA-NB] = 0;
-        while (TA[NA] || Compare(TA+NA-NB, TB, NB) >= 0)
-        {
-            TA[NA] -= Subtract(TA+NA-NB, TA+NA-NB, TB, NB);
-            ++Q[NA-NB];
-        }
-    }
-    else
-    {
-        NA+=2;
-    }
-
-    word BT[2];
-    BT[0] = TB[NB-2] + 1;
-    BT[1] = TB[NB-1] + (BT[0]==0);
-
-    // start reducing TA mod TB, 2 words at a time
-    for (unsigned i=NA-2; i>=NB; i-=2)
-    {
-        AtomicDivide(Q+i-NB, TA+i-2, BT);
-        CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB);
-    }
-
-    // copy TA into R, and denormalize it
-    CopyWords(R, TA+shiftWords, NB);
-    ShiftWordsRightByBits(R, NB, shiftBits);
-}
-
-
-void PositiveDivide(Integer& remainder, Integer& quotient,
-                   const Integer& a, const Integer& b)
-{
-    unsigned aSize = a.WordCount();
-    unsigned bSize = b.WordCount();
-
-    if (a.PositiveCompare(b) == -1)
-    {
-        remainder = a;
-        remainder.sign_ = Integer::POSITIVE;
-        quotient = Integer::Zero();
-        return;
-    }
-
-    aSize += aSize%2;	// round up to next even number
-    bSize += bSize%2;
-
-    remainder.reg_.CleanNew(RoundupSize(bSize));
-    remainder.sign_ = Integer::POSITIVE;
-    quotient.reg_.CleanNew(RoundupSize(aSize-bSize+2));
-    quotient.sign_ = Integer::POSITIVE;
-
-    AlignedWordBlock T(aSize+2*bSize+4);
-    Divide(remainder.reg_.get_buffer(), quotient.reg_.get_buffer(),
-           T.get_buffer(), a.reg_.get_buffer(), aSize, b.reg_.get_buffer(),
-           bSize);
-}
-
-void Integer::Divide(Integer &remainder, Integer &quotient,
-                     const Integer &dividend, const Integer &divisor)
-{
-    PositiveDivide(remainder, quotient, dividend, divisor);
-
-    if (dividend.IsNegative())
-    {
-        quotient.Negate();
-        if (remainder.NotZero())
-        {
-            --quotient;
-            remainder = divisor.AbsoluteValue() - remainder;
-        }
-    }
-
-    if (divisor.IsNegative())
-        quotient.Negate();
-}
-
-void Integer::DivideByPowerOf2(Integer &r, Integer &q, const Integer &a,
-                               unsigned int n)
-{
-    q = a;
-    q >>= n;
-
-    const unsigned int wordCount = BitsToWords(n);
-    if (wordCount <= a.WordCount())
-    {
-        r.reg_.resize(RoundupSize(wordCount));
-        CopyWords(r.reg_.get_buffer(), a.reg_.get_buffer(), wordCount);
-        SetWords(r.reg_+wordCount, 0, r.reg_.size()-wordCount);
-        if (n % WORD_BITS != 0)
-          r.reg_[wordCount-1] %= (word(1) << (n % WORD_BITS));
-    }
-    else
-    {
-        r.reg_.resize(RoundupSize(a.WordCount()));
-        CopyWords(r.reg_.get_buffer(), a.reg_.get_buffer(), r.reg_.size());
-    }
-    r.sign_ = POSITIVE;
-
-    if (a.IsNegative() && r.NotZero())
-    {
-        --q;
-        r = Power2(n) - r;
-    }
-}
-
-Integer Integer::DividedBy(const Integer &b) const
-{
-    Integer remainder, quotient;
-    Integer::Divide(remainder, quotient, *this, b);
-    return quotient;
-}
-
-Integer Integer::Modulo(const Integer &b) const
-{
-    Integer remainder, quotient;
-    Integer::Divide(remainder, quotient, *this, b);
-    return remainder;
-}
-
-void Integer::Divide(word &remainder, Integer &quotient,
-                     const Integer &dividend, word divisor)
-{
-    if ((divisor & (divisor-1)) == 0)	// divisor is a power of 2
-    {
-        quotient = dividend >> (BitPrecision(divisor)-1);
-        remainder = dividend.reg_[0] & (divisor-1);
-        return;
-    }
-
-    unsigned int i = dividend.WordCount();
-    quotient.reg_.CleanNew(RoundupSize(i));
-    remainder = 0;
-    while (i--)
-    {
-        quotient.reg_[i] = DWord(dividend.reg_[i], remainder) / divisor;
-        remainder = DWord(dividend.reg_[i], remainder) % divisor;
-    }
-
-    if (dividend.NotNegative())
-        quotient.sign_ = POSITIVE;
-    else
-    {
-        quotient.sign_ = NEGATIVE;
-        if (remainder)
-        {
-            --quotient;
-            remainder = divisor - remainder;
-        }
-    }
-}
-
-Integer Integer::DividedBy(word b) const
-{
-    word remainder;
-    Integer quotient;
-    Integer::Divide(remainder, quotient, *this, b);
-    return quotient;
-}
-
-word Integer::Modulo(word divisor) const
-{
-    word remainder;
-
-    if ((divisor & (divisor-1)) == 0)	// divisor is a power of 2
-        remainder = reg_[0] & (divisor-1);
-    else
-    {
-        unsigned int i = WordCount();
-
-        if (divisor <= 5)
-        {
-            DWord sum(0, 0);
-            while (i--)
-                sum += reg_[i];
-            remainder = sum % divisor;
-        }
-        else
-        {
-            remainder = 0;
-            while (i--)
-                remainder = DWord(reg_[i], remainder) % divisor;
-        }
-    }
-
-    if (IsNegative() && remainder)
-        remainder = divisor - remainder;
-
-    return remainder;
-}
-
-
-Integer Integer::AbsoluteValue() const
-{
-    Integer result(*this);
-    result.sign_ = POSITIVE;
-    return result;
-}
-
-
-Integer Integer::SquareRoot() const
-{
-    if (!IsPositive())
-        return Zero();
-
-    // overestimate square root
-    Integer x, y = Power2((BitCount()+1)/2);
-
-    do
-    {
-        x = y;
-        y = (x + *this/x) >> 1;
-    } while (y<x);
-
-    return x;
-}
-
-bool Integer::IsSquare() const
-{
-    Integer r = SquareRoot();
-    return *this == r.Squared();
-}
-
-bool Integer::IsUnit() const
-{
-    return (WordCount() == 1) && (reg_[0] == 1);
-}
-
-Integer Integer::MultiplicativeInverse() const
-{
-    return IsUnit() ? *this : Zero();
-}
-
-Integer a_times_b_mod_c(const Integer &x, const Integer& y, const Integer& m)
-{
-    return x*y%m;
-}
-
-Integer a_exp_b_mod_c(const Integer &x, const Integer& e, const Integer& m)
-{
-    ModularArithmetic mr(m);
-    return mr.Exponentiate(x, e);
-}
-
-Integer Integer::Gcd(const Integer &a, const Integer &b)
-{
-    return EuclideanDomainOf().Gcd(a, b);
-}
-
-Integer Integer::InverseMod(const Integer &m) const
-{
-    if (IsNegative() || *this>=m)
-        return (*this%m).InverseMod(m);
-
-    if (m.IsEven())
-    {
-        if (!m || IsEven())
-            return Zero();	// no inverse
-        if (*this == One())
-            return One();
-
-        Integer u = m.InverseMod(*this);
-        return !u ? Zero() : (m*(*this-u)+1)/(*this);
-    }
-
-    AlignedWordBlock T(m.reg_.size() * 4);
-    Integer r((word)0, m.reg_.size());
-    unsigned k = AlmostInverse(r.reg_.get_buffer(), T.get_buffer(),
-                               reg_.get_buffer(), reg_.size(),
-                               m.reg_.get_buffer(), m.reg_.size());
-    DivideByPower2Mod(r.reg_.get_buffer(), r.reg_.get_buffer(), k,
-                      m.reg_.get_buffer(), m.reg_.size());
-    return r;
-}
-
-word Integer::InverseMod(const word mod) const
-{
-    word g0 = mod, g1 = *this % mod;
-    word v0 = 0, v1 = 1;
-    word y;
-
-    while (g1)
-    {
-        if (g1 == 1)
-            return v1;
-        y = g0 / g1;
-        g0 = g0 % g1;
-        v0 += y * v1;
-
-        if (!g0)
-            break;
-        if (g0 == 1)
-            return mod-v0;
-        y = g1 / g0;
-        g1 = g1 % g0;
-        v1 += y * v0;
-    }
-    return 0;
-}
-
-// ********* ModArith stuff
-
-const Integer& ModularArithmetic::Half(const Integer &a) const
-{
-    if (a.reg_.size()==modulus.reg_.size())
-    {
-        TaoCrypt::DivideByPower2Mod(result.reg_.begin(), a.reg_.begin(), 1,
-                                    modulus.reg_.begin(), a.reg_.size());
-        return result;
-    }
-    else
-        return result1 = (a.IsEven() ? (a >> 1) : ((a+modulus) >> 1));
-}
-
-const Integer& ModularArithmetic::Add(const Integer &a, const Integer &b) const
-{
-    if (a.reg_.size()==modulus.reg_.size() && 
-        b.reg_.size()==modulus.reg_.size())
-    {
-        if (TaoCrypt::Add(result.reg_.begin(), a.reg_.begin(), b.reg_.begin(),
-                          a.reg_.size())
-            || Compare(result.reg_.get_buffer(), modulus.reg_.get_buffer(),
-                       a.reg_.size()) >= 0)
-        {
-            TaoCrypt::Subtract(result.reg_.begin(), result.reg_.begin(),
-                               modulus.reg_.begin(), a.reg_.size());
-        }
-        return result;
-    }
-    else
-    {
-        result1 = a+b;
-        if (result1 >= modulus)
-            result1 -= modulus;
-        return result1;
-    }
-}
-
-Integer& ModularArithmetic::Accumulate(Integer &a, const Integer &b) const
-{
-    if (a.reg_.size()==modulus.reg_.size() && 
-        b.reg_.size()==modulus.reg_.size())
-    {
-        if (TaoCrypt::Add(a.reg_.get_buffer(), a.reg_.get_buffer(),
-                          b.reg_.get_buffer(), a.reg_.size())
-            || Compare(a.reg_.get_buffer(), modulus.reg_.get_buffer(),
-                       a.reg_.size()) >= 0)
-        {
-            TaoCrypt::Subtract(a.reg_.get_buffer(), a.reg_.get_buffer(),
-                               modulus.reg_.get_buffer(), a.reg_.size());
-        }
-    }
-    else
-    {
-        a+=b;
-        if (a>=modulus)
-            a-=modulus;
-    }
-
-    return a;
-}
-
-const Integer& ModularArithmetic::Subtract(const Integer &a,
-                                           const Integer &b) const
-{
-    if (a.reg_.size()==modulus.reg_.size() && 
-        b.reg_.size()==modulus.reg_.size())
-    {
-        if (TaoCrypt::Subtract(result.reg_.begin(), a.reg_.begin(),
-                               b.reg_.begin(), a.reg_.size()))
-            TaoCrypt::Add(result.reg_.begin(), result.reg_.begin(),
-                          modulus.reg_.begin(), a.reg_.size());
-        return result;
-    }
-    else
-    {
-        result1 = a-b;
-        if (result1.IsNegative())
-            result1 += modulus;
-        return result1;
-    }
-}
-
-Integer& ModularArithmetic::Reduce(Integer &a, const Integer &b) const
-{
-    if (a.reg_.size()==modulus.reg_.size() && 
-        b.reg_.size()==modulus.reg_.size())
-    {
-        if (TaoCrypt::Subtract(a.reg_.get_buffer(), a.reg_.get_buffer(),
-                               b.reg_.get_buffer(), a.reg_.size()))
-            TaoCrypt::Add(a.reg_.get_buffer(), a.reg_.get_buffer(),
-                          modulus.reg_.get_buffer(), a.reg_.size());
-    }
-    else
-    {
-        a-=b;
-        if (a.IsNegative())
-            a+=modulus;
-    }
-
-    return a;
-}
-
-const Integer& ModularArithmetic::Inverse(const Integer &a) const
-{
-    if (!a)
-        return a;
-
-    CopyWords(result.reg_.begin(), modulus.reg_.begin(), modulus.reg_.size());
-    if (TaoCrypt::Subtract(result.reg_.begin(), result.reg_.begin(),
-                           a.reg_.begin(), a.reg_.size()))
-        Decrement(result.reg_.begin()+a.reg_.size(), 1,
-                  modulus.reg_.size()-a.reg_.size());
-
-    return result;
-}
-
-Integer ModularArithmetic::CascadeExponentiate(const Integer &x,
-                  const Integer &e1, const Integer &y, const Integer &e2) const
-{
-    if (modulus.IsOdd())
-    {
-        MontgomeryRepresentation dr(modulus);
-        return dr.ConvertOut(dr.CascadeExponentiate(dr.ConvertIn(x), e1,
-                                                    dr.ConvertIn(y), e2));
-    }
-    else
-        return AbstractRing::CascadeExponentiate(x, e1, y, e2);
-}
-
-void ModularArithmetic::SimultaneousExponentiate(Integer *results,
-        const Integer &base, const Integer *exponents,
-        unsigned int exponentsCount) const
-{
-    if (modulus.IsOdd())
-    {
-        MontgomeryRepresentation dr(modulus);
-        dr.SimultaneousExponentiate(results, dr.ConvertIn(base), exponents,
-                                    exponentsCount);
-        for (unsigned int i=0; i<exponentsCount; i++)
-            results[i] = dr.ConvertOut(results[i]);
-    }
-    else
-        AbstractRing::SimultaneousExponentiate(results, base,
-                                                    exponents, exponentsCount);
-}
-
-
-// ********************************************************
-
-#define A0      A
-#define A1      (A+N2)
-#define B0      B
-#define B1      (B+N2)
-
-#define T0      T
-#define T1      (T+N2)
-#define T2      (T+N)
-#define T3      (T+N+N2)
-
-#define R0      R
-#define R1      (R+N2)
-#define R2      (R+N)
-#define R3      (R+N+N2)
-
-
-inline void MultiplyBottom(word *R, word *T, const word *A, const word *B,
-                           unsigned int N)
-{
-    RecursiveMultiplyBottom(R, T, A, B, N);
-}
-
-inline void MultiplyTop(word *R, word *T, const word *L, const word *A,
-                        const word *B, unsigned int N)
-{
-    RecursiveMultiplyTop(R, T, L, A, B, N);
-}
-
-
-// R[N] --- result = X/(2**(WORD_BITS*N)) mod M
-// T[3*N] - temporary work space
-// X[2*N] - number to be reduced
-// M[N] --- modulus
-// U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N)
-
-void MontgomeryReduce(word *R, word *T, const word *X, const word *M,
-                      const word *U, unsigned int N)
-{
-    MultiplyBottom(R, T, X, U, N);
-    MultiplyTop(T, T+N, X, R, M, N);
-    word borrow = Subtract(T, X+N, T, N);
-    // defend against timing attack by doing this Add even when not needed
-    word carry = Add(T+N, T, M, N);
-    (void)carry;            // shut up compiler
-    CopyWords(R, T + (borrow ? N : 0), N);
-}
-
-// R[N] ----- result = A inverse mod 2**(WORD_BITS*N)
-// T[3*N/2] - temporary work space
-// A[N] ----- an odd number as input
-
-void RecursiveInverseModPower2(word *R, word *T, const word *A, unsigned int N)
-{
-    if (N==2)
-    {
-        T[0] = AtomicInverseModPower2(A[0]);
-        T[1] = 0;
-        LowLevel::Multiply2Bottom(T+2, T, A);
-        TwosComplement(T+2, 2);
-        Increment(T+2, 2, 2);
-        LowLevel::Multiply2Bottom(R, T, T+2);
-    }
-    else
-    {
-        const unsigned int N2 = N/2;
-        RecursiveInverseModPower2(R0, T0, A0, N2);
-        T0[0] = 1;
-        SetWords(T0+1, 0, N2-1);
-        MultiplyTop(R1, T1, T0, R0, A0, N2);
-        MultiplyBottom(T0, T1, R0, A1, N2);
-        Add(T0, R1, T0, N2);
-        TwosComplement(T0, N2);
-        MultiplyBottom(R1, T1, R0, T0, N2);
-    }
-}
-
-
-#undef A0
-#undef A1
-#undef B0
-#undef B1
-
-#undef T0
-#undef T1
-#undef T2
-#undef T3
-
-#undef R0
-#undef R1
-#undef R2
-#undef R3
-
-
-// modulus must be odd
-MontgomeryRepresentation::MontgomeryRepresentation(const Integer &m)
-    : ModularArithmetic(m),
-      u((word)0, modulus.reg_.size()),
-      workspace(5*modulus.reg_.size())
-{
-    RecursiveInverseModPower2(u.reg_.get_buffer(), workspace.get_buffer(),
-                              modulus.reg_.get_buffer(), modulus.reg_.size());
-}
-
-const Integer& MontgomeryRepresentation::Multiply(const Integer &a,
-                                                  const Integer &b) const
-{
-    word *const T = workspace.begin();
-    word *const R = result.reg_.begin();
-    const unsigned int N = modulus.reg_.size();
-
-    AsymmetricMultiply(T, T+2*N, a.reg_.get_buffer(), a.reg_.size(),
-                       b.reg_.get_buffer(), b.reg_.size());
-    SetWords(T+a.reg_.size()+b.reg_.size(),0, 2*N-a.reg_.size()-b.reg_.size());
-    MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
-                     u.reg_.get_buffer(), N);
-    return result;
-}
-
-const Integer& MontgomeryRepresentation::Square(const Integer &a) const
-{
-    word *const T = workspace.begin();
-    word *const R = result.reg_.begin();
-    const unsigned int N = modulus.reg_.size();
-
-    TaoCrypt::Square(T, T+2*N, a.reg_.get_buffer(), a.reg_.size());
-    SetWords(T+2*a.reg_.size(), 0, 2*N-2*a.reg_.size());
-    MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
-                     u.reg_.get_buffer(), N);
-    return result;
-}
-
-Integer MontgomeryRepresentation::ConvertOut(const Integer &a) const
-{
-    word *const T = workspace.begin();
-    word *const R = result.reg_.begin();
-    const unsigned int N = modulus.reg_.size();
-
-    CopyWords(T, a.reg_.get_buffer(), a.reg_.size());
-    SetWords(T+a.reg_.size(), 0, 2*N-a.reg_.size());
-    MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
-                     u.reg_.get_buffer(), N);
-    return result;
-}
-
-const Integer& MontgomeryRepresentation::MultiplicativeInverse(
-                                                        const Integer &a) const
-{
-//  return (EuclideanMultiplicativeInverse(a, modulus)<<
-//      (2*WORD_BITS*modulus.reg_.size()))%modulus;
-    word *const T = workspace.begin();
-    word *const R = result.reg_.begin();
-    const unsigned int N = modulus.reg_.size();
-
-    CopyWords(T, a.reg_.get_buffer(), a.reg_.size());
-    SetWords(T+a.reg_.size(), 0, 2*N-a.reg_.size());
-    MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
-                     u.reg_.get_buffer(), N);
-    unsigned k = AlmostInverse(R, T, R, N, modulus.reg_.get_buffer(), N);
-
-//  cout << "k=" << k << " N*32=" << 32*N << endl;
-
-    if (k>N*WORD_BITS)
-        DivideByPower2Mod(R, R, k-N*WORD_BITS, modulus.reg_.get_buffer(), N);
-    else
-        MultiplyByPower2Mod(R, R, N*WORD_BITS-k, modulus.reg_.get_buffer(), N);
-
-    return result;
-}
-
-
-//  mod Root stuff
-Integer ModularRoot(const Integer &a, const Integer &dp, const Integer &dq,
-                    const Integer &p, const Integer &q, const Integer &u)
-{
-    Integer p2 = ModularExponentiation((a % p), dp, p);
-    Integer q2 = ModularExponentiation((a % q), dq, q);
-    return CRT(p2, p, q2, q, u);
-}
-
-Integer CRT(const Integer &xp, const Integer &p, const Integer &xq,
-            const Integer &q, const Integer &u)
-{
-    // isn't operator overloading great?
-    return p * (u * (xq-xp) % q) + xp;
-}
-
-
-#ifdef HAVE_EXPLICIT_TEMPLATE_INSTANTIATION
-#ifndef TAOCRYPT_NATIVE_DWORD_AVAILABLE
-template hword DivideThreeWordsByTwo<hword, Word>(hword*, hword, hword, Word*);
-#endif
-template word DivideThreeWordsByTwo<word, DWord>(word*, word, word, DWord*);
-#ifdef SSE2_INTRINSICS_AVAILABLE
-template class AlignedAllocator<word>;
-#endif
-#endif
-
-
-} // namespace
-
-- 
cgit v1.1