monocypher.c

      -9423865,-12437364,-663000,-31111463,-16132436,},
     {25576264,-2703214,7349804,-11814844,16472782,
      9300885,3844789,15725684,171356,6466918,},
     {23103977,13316479,9739013,-16149481,817875,
      -15038942,8965339,-14088058,-30714912,16193877,},},
    {{-33521811,3180713,-2394130,14003687,-16903474,
      -16270840,17238398,4729455,-18074513,9256800,},
     {-25182317,-4174131,32336398,5036987,-21236817,
      11360617,22616405,9761698,-19827198,630305,},
     {-13720693,2639453,-24237460,-7406481,9494427,
      -5774029,-6554551,-15960994,-2449256,-14291300,},},
    {{-3151181,-5046075,9282714,6866145,-31907062,
      -863023,-18940575,15033784,25105118,-7894876,},
     {-24326370,15950226,-31801215,-14592823,-11662737,
      -5090925,1573892,-2625887,2198790,-15804619,},
     {-3099351,10324967,-2241613,7453183,-5446979,
      -2735503,-13812022,-16236442,-32461234,-12290683,},},
};

// Incremental sliding windows (left to right)
// Based on Roberto Maria Avanzi[2005]
// State of the sliding-window recoding of one scalar.
// slide_init() sets it up; slide_step() then updates it while the
// caller walks the bit positions of the scalar from high to low.
// slide_init() sets next_index and next_digit to -1 (no digit computed yet).
typedef struct {
    i16 next_index; // position of the next signed digit
    i8  next_digit; // next signed digit (odd number below 2^window_width)
    u8  next_check; // point at which we must check for a new window
} slide_ctx;

static void slide_init(slide_ctx *ctx, const u8 scalar[32])
{
    // The scalar is always below L: s has been checked, and h_ram has
    // been reduced modulo L.  L is under 2^253, so bits 253 to 255 are
    // guaranteed to be zero and need not be tested.
    //
    // L is however very close to 2^252, which makes bit 252 almost
    // always zero.  Starting the scan at bit 251 would be an off-by-one
    // error the tests could not catch (building a test vector that does
    // would be prohibitively expensive), so the scan starts at bit 252.
    int top = 252;
    for (; top > 0; top--) {
        if (scalar_bit(scalar, top) != 0) {
            break;
        }
    }

    // No digit has been computed yet; the first window check happens one
    // position above the bit the scan stopped on.
    ctx->next_index = -1;
    ctx->next_digit = -1;
    ctx->next_check = (u8)(top + 1);
}

// Advances the sliding window recoding to bit position i.
//
// ctx    : recoding state, updated in place
// width  : maximum width of a window, in bits
// i      : current bit position (the caller walks it downwards)
// scalar : the scalar being recoded
//
// Returns the signed digit located at position i, or 0 if there is none.
static int slide_step(slide_ctx *ctx, int width, int i, const u8 scalar[32])
{
    if (i == ctx->next_check) {
        if (scalar_bit(scalar, i) == scalar_bit(scalar, i - 1)) {
            // Bits i and i-1 are equal: no window starts here,
            // look again one position lower.
            ctx->next_check--;
        } else {
            // compute digit of next window
            // The window covers bits i-(w-1) to i; it is shortened when
            // fewer than width bits remain above bit 0.
            int w = MIN(width, i + 1);
            // Bit i counts negatively, with weight 2^(w-1)...
            int v = -(scalar_bit(scalar, i) << (w-1));
            // ...the w-1 bits below it count positively...
            FOR_T (int, j, 0, w-1) {
                v += scalar_bit(scalar, i-(w-1)+j) << j;
            }
            // ...and the bit just below the window is added with weight 1.
            v += scalar_bit(scalar, i-w);
            int lsb = v & (~v + 1);            // smallest bit of v
            int s   = (   ((lsb & 0xAA) != 0)  // log2(lsb)
                       | (((lsb & 0xCC) != 0) << 1)
                       | (((lsb & 0xF0) != 0) << 2));
            // Shifting v right by s strips its trailing zero bits; the
            // digit is then placed s positions above the window's low end.
            ctx->next_index  = (i16)(i-(w-1)+s);
            ctx->next_digit  = (i8) (v >> s   );
            // The next check happens just below this window.
            ctx->next_check -= (u8) w;
        }
    }
    return i == ctx->next_index ? ctx->next_digit: 0;
}

// Window widths (in bits) that ge_double_scalarmult_vartime() passes to
// slide_step(): P_W_WIDTH for the scalar p, B_W_WIDTH for the scalar b.
#define P_W_WIDTH 3 // Affects the size of the stack
#define B_W_WIDTH 5 // Affects the size of the binary
// Number of entries in the cached window of P (the cP array).
#define P_W_SIZE  (1<<(P_W_WIDTH-2))

// P = [b]B + [p]P, where B is the base point
//
// Variable time! Internal buffers are not wiped! Inputs must not be secret!
// => Use only to *check* signatures.
static void ge_double_scalarmult_vartime(ge *P, const u8 p[32], const u8 b[32])
{
    // Window of P in cached form: cP[0] is P, and each following entry
    // is the previous one plus 2P.
    ge_cached cP[P_W_SIZE];
    {
        ge twice_P, scratch;
        ge_double(&twice_P, P, &scratch);
        ge_cache(&cP[0], P);
        FOR (k, 1, P_W_SIZE) {
            ge_add(&scratch, &twice_P, &cP[k-1]);
            ge_cache(&cP[k], &scratch);
        }
    }

    // One sliding window recoder per scalar
    slide_ctx p_slide;
    slide_ctx b_slide;
    slide_init(&p_slide, p);
    slide_init(&b_slide, b);

    // Merged double and add ladder, fused with sliding.
    // The result is accumulated directly in P, whose original value
    // only survives through the cP window.
    ge *sum = P;
    ge_zero(sum);
    for (int i = MAX(p_slide.next_check, b_slide.next_check); i >= 0; i--) {
        ge tmp;
        fe t1, t2;
        ge_double(sum, sum, &tmp);

        const int p_digit = slide_step(&p_slide, P_W_WIDTH, i, p);
        const int b_digit = slide_step(&b_slide, B_W_WIDTH, i, b);

        // Digits are odd, so digit/2 indexes the matching window entry
        if (p_digit > 0) {
            ge_add(sum, sum, &cP[p_digit / 2]);
        } else if (p_digit < 0) {
            ge_sub(sum, sum, &cP[-p_digit / 2]);
        }
        if (b_digit > 0) {
            ge_madd(sum, sum, b_window + b_digit / 2, t1, t2);
        } else if (b_digit < 0) {
            ge_msub(sum, sum, b_window + -b_digit / 2, t1, t2);
        }
    }
}

// R_check = [s]B - [h_ram]pk, where B is the base point
//
// Variable time! Internal buffers are not wiped! Inputs must not be secret!
// => Use only to *check* signatures.
// Recomputes the R half of a signature from s, h_ram and the public key.
//
// R_check : receives the 32-byte encoding of [s]B - [h_ram]pk
// s       : second half of the signature (read only)
// h_ram   : scalar that multiplies the public key (read only)
// pk      : encoded public key (read only).  It may alias R_check: pk is
//           decoded into A before R_check is written.
//
// Returns 0 on success, -1 if pk cannot be decoded or if is_above_l()
// rejects s.  R_check is left untouched on failure.
static int ge_r_check(u8 R_check[32], const u8 s[32], const u8 h_ram[32],
                      const u8 pk[32])
{
    ge  A;      // not secret, not wiped
    u32 s32[8]; // not secret, not wiped
    load32_le_buf(s32, s, 8);
    if (ge_frombytes_vartime(&A, pk) ||         // A = pk
        is_above_l(s32)) {                      // prevent s malleability
        return -1;
    }
    fe_neg(A.X, A.X);
    fe_neg(A.T, A.T);                           // A = -pk
    ge_double_scalarmult_vartime(&A, h_ram, s); // A = [s]B - [h_ram]pk
    ge_tobytes(R_check, &A);                    // R_check = A
    return 0;
}

// 5-bit signed comb in cached format (Niels coordinates, Z=1)
// Table handed to lookup_add() by ge_scalarmult_base() for the scalar bit
// positions i, i+32, i+64 and i+96, with i in [0, 31].
// It has 8 entries, each made of three field elements of ten limbs.
// lookup_add() reads them as Yp, Ym and T2 (presumably in that order in
// the initialisers below; confirm against the ge_precomp definition).
static const ge_precomp b_comb_low[8] = {
    {{-6816601,-2324159,-22559413,124364,18015490,
      8373481,19993724,1979872,-18549925,9085059,},
     {10306321,403248,14839893,9633706,8463310,
      -8354981,-14305673,14668847,26301366,2818560,},
     {-22701500,-3210264,-13831292,-2927732,-16326337,
      -14016360,12940910,177905,12165515,-2397893,},},
    {{-12282262,-7022066,9920413,-3064358,-32147467,
      2927790,22392436,-14852487,2719975,16402117,},
     {-7236961,-4729776,2685954,-6525055,-24242706,
      -15940211,-6238521,14082855,10047669,12228189,},
     {-30495588,-12893761,-11161261,3539405,-11502464,
      16491580,-27286798,-15030530,-7272871,-15934455,},},
    {{17650926,582297,-860412,-187745,-12072900,
      -10683391,-20352381,15557840,-31072141,-5019061,},
     {-6283632,-2259834,-4674247,-4598977,-4089240,
      12435688,-31278303,1060251,6256175,10480726,},
     {-13871026,2026300,-21928428,-2741605,-2406664,
      -8034988,7355518,15733500,-23379862,7489131,},},
    {{6883359,695140,23196907,9644202,-33430614,
      11354760,-20134606,6388313,-8263585,-8491918,},
     {-7716174,-13605463,-13646110,14757414,-19430591,
      -14967316,10359532,-11059670,-21935259,12082603,},
     {-11253345,-15943946,10046784,5414629,24840771,
      8086951,-6694742,9868723,15842692,-16224787,},},
    {{9639399,11810955,-24007778,-9320054,3912937,
      -9856959,996125,-8727907,-8919186,-14097242,},
     {7248867,14468564,25228636,-8795035,14346339,
      8224790,6388427,-7181107,6468218,-8720783,},
     {15513115,15439095,7342322,-10157390,18005294,
      -7265713,2186239,4884640,10826567,7135781,},},
    {{-14204238,5297536,-5862318,-6004934,28095835,
      4236101,-14203318,1958636,-16816875,3837147,},
     {-5511166,-13176782,-29588215,12339465,15325758,
      -15945770,-8813185,11075932,-19608050,-3776283,},
     {11728032,9603156,-4637821,-5304487,-7827751,
      2724948,31236191,-16760175,-7268616,14799772,},},
    {{-28842672,4840636,-12047946,-9101456,-1445464,
      381905,-30977094,-16523389,1290540,12798615,},
     {27246947,-10320914,14792098,-14518944,5302070,
      -8746152,-3403974,-4149637,-27061213,10749585,},
     {25572375,-6270368,-15353037,16037944,1146292,
      32198,23487090,9585613,24714571,-1418265,},},
    {{19844825,282124,-17583147,11004019,-32004269,
      -2716035,6105106,-1711007,-21010044,14338445,},
     {8027505,8191102,-18504907,-12335737,25173494,
      -5923905,15446145,7483684,-30440441,10009108,},
     {-14134701,-4174411,10246585,-14677495,33553567,
      -14012935,23366126,15080531,-7969992,7663473,},},
};

// Table handed to lookup_add() by ge_scalarmult_base() for the scalar bit
// positions i+128, i+160, i+192 and i+224, with i in [0, 31].
// It has 8 entries, each made of three field elements of ten limbs.
// lookup_add() reads them as Yp, Ym and T2 (presumably in that order in
// the initialisers below; confirm against the ge_precomp definition).
static const ge_precomp b_comb_high[8] = {
    {{33055887,-4431773,-521787,6654165,951411,
      -6266464,-5158124,6995613,-5397442,-6985227,},
     {4014062,6967095,-11977872,3960002,8001989,
      5130302,-2154812,-1899602,-31954493,-16173976,},
     {16271757,-9212948,23792794,731486,-25808309,
      -3546396,6964344,-4767590,10976593,10050757,},},
    {{2533007,-4288439,-24467768,-12387405,-13450051,
      14542280,12876301,13893535,15067764,8594792,},
     {20073501,-11623621,3165391,-13119866,13188608,
      -11540496,-10751437,-13482671,29588810,2197295,},
     {-1084082,11831693,6031797,14062724,14748428,
      -8159962,-20721760,11742548,31368706,13161200,},},
    {{2050412,-6457589,15321215,5273360,25484180,
      124590,-18187548,-7097255,-6691621,-14604792,},
     {9938196,2162889,-6158074,-1711248,4278932,
      -2598531,-22865792,-7168500,-24323168,11746309,},
     {-22691768,-14268164,5965485,9383325,20443693,
      5854192,28250679,-1381811,-10837134,13717818,},},
    {{-8495530,16382250,9548884,-4971523,-4491811,
      -3902147,6182256,-12832479,26628081,10395408,},
     {27329048,-15853735,7715764,8717446,-9215518,
      -14633480,28982250,-5668414,4227628,242148,},
     {-13279943,-7986904,-7100016,8764468,-27276630,
      3096719,29678419,-9141299,3906709,11265498,},},
    {{11918285,15686328,-17757323,-11217300,-27548967,
      4853165,-27168827,6807359,6871949,-1075745,},
     {-29002610,13984323,-27111812,-2713442,28107359,
      -13266203,6155126,15104658,3538727,-7513788,},
     {14103158,11233913,-33165269,9279850,31014152,
      4335090,-1827936,4590951,13960841,12787712,},},
    {{1469134,-16738009,33411928,13942824,8092558,
      -8778224,-11165065,1437842,22521552,-2792954,},
     {31352705,-4807352,-25327300,3962447,12541566,
      -9399651,-27425693,7964818,-23829869,5541287,},
     {-25732021,-6864887,23848984,3039395,-9147354,
      6022816,-27421653,10590137,25309915,-1584678,},},
    {{-22951376,5048948,31139401,-190316,-19542447,
      -626310,-17486305,-16511925,-18851313,-12985140,},
     {-9684890,14681754,30487568,7717771,-10829709,
      9630497,30290549,-10531496,-27798994,-13812825,},
     {5827835,16097107,-24501327,12094619,7413972,
      11447087,28057551,-1793987,-14056981,4359312,},},
    {{26323183,2342588,-21887793,-1623758,-6062284,
      2107090,-28724907,9036464,-19618351,-13055189,},
     {-29697200,14829398,-4596333,14220089,-30022969,
      2955645,12094100,-13693652,-5941445,7047569,},
     {-3201977,14413268,-12058324,-16417589,-9035655,
      -7224648,9258160,1399236,30397584,-5684634,},},
};

// Selects one entry of a comb table from four bits of the scalar, and
// adds that entry (or a sign-flipped version of it) to p.
//
// p      : accumulator, updated in place
// tmp_c  : scratch entry that receives the selected table element
// tmp_a  : scratch field element (also handed to ge_madd)
// tmp_b  : scratch field element handed to ge_madd
// comb   : table of 8 precomputed entries
// scalar : scalar whose bits drive the selection
// i      : position of the lowest of the four bits that are read
//
// All 8 entries are visited, and the selection goes through
// fe_ccopy()/fe_cswap() rather than through branches or an index that
// depends on the scalar.
static void lookup_add(ge *p, ge_precomp *tmp_c, fe tmp_a, fe tmp_b,
                       const ge_precomp comb[8], const u8 scalar[32], int i)
{
    // Gather 4 bits of the scalar, 32 positions apart
    u8 teeth = (u8)((scalar_bit(scalar, i)          ) +
                    (scalar_bit(scalar, i + 32) << 1) +
                    (scalar_bit(scalar, i + 64) << 2) +
                    (scalar_bit(scalar, i + 96) << 3));
    // high is the top bit of teeth.  When it is set, (high - 1) is zero
    // and index is the three low bits of teeth.  When it is clear,
    // (high - 1) is all ones and index is their complement.
    u8 high  = teeth >> 3;
    u8 index = (teeth ^ (high - 1)) & 7;
    FOR (j, 0, 8) {
        // select is 1 when j == index, 0 otherwise:
        // (j ^ index) - 1 only goes below zero when j == index.
        i32 select = 1 & (((j ^ index) - 1) >> 8);
        fe_ccopy(tmp_c->Yp, comb[j].Yp, select);
        fe_ccopy(tmp_c->Ym, comb[j].Ym, select);
        fe_ccopy(tmp_c->T2, comb[j].T2, select);
    }
    // tmp_a = -T2.  When high is clear, T2 is swapped with its negation
    // and Yp with Ym.
    // NOTE(review): this looks like the negation of the selected point,
    // assuming Yp = Y+X and Ym = Y-X; confirm against ge_precomp.
    fe_neg(tmp_a, tmp_c->T2);
    fe_cswap(tmp_c->T2, tmp_a    , high ^ 1);
    fe_cswap(tmp_c->Yp, tmp_c->Ym, high ^ 1);
    ge_madd(p, p, tmp_c, tmp_a, tmp_b);
}

// p = [scalar]B, where B is the base point
static void ge_scalarmult_base(ge *p, const u8 scalar[32])
{
    // twin 4-bits signed combs, from Mike Hamburg's
    // Fast and compact elliptic-curve cryptography (2012)
    // 1 / 2 modulo L
    static const u8 half_mod_L[32] = {
        247,233,122,46,141,49,9,44,107,206,123,81,239,124,111,10,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8, };
    // (2^256 - 1) / 2 modulo L
    static const u8 half_ones[32] = {
        142,74,204,70,186,24,118,107,184,231,190,57,250,173,119,99,
        255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,7, };

    // Recode the scalar in "all bits set" form: 1 means 1, 0 means -1.
    // recoded = scalar * half_mod_L + half_ones
    u8 recoded[32];
    mul_add(recoded, scalar, half_mod_L, half_ones);

    fe add_a, add_b;   // temporaries for addition
    ge_precomp entry;  // temporary for comb lookup
    ge dbl;            // temporary for doubling
    fe_1(entry.Yp);
    fe_1(entry.Ym);
    fe_0(entry.T2);

    // Double and add ladder over the 32 comb positions, from 31 down to 0.
    // The top position is handled before the loop, which saves a double.
    int i = 31;
    ge_zero(p);
    lookup_add(p, &entry, add_a, add_b, b_comb_low , recoded, i);
    lookup_add(p, &entry, add_a, add_b, b_comb_high, recoded, i + 128);
    while (i > 0) {
        i--;
        ge_double(p, p, &dbl);
        lookup_add(p, &entry, add_a, add_b, b_comb_low , recoded, i);
        lookup_add(p, &entry, add_a, add_b, b_comb_high, recoded, i + 128);
    }
    // Note: one addition could be saved at the end if the scalar were
    // assumed to fit in 252 bits.  Randomly selected scalars do, but
    // non-random, non-hashed scalars *can* overflow 252 bits in
    // practice.  Accounting for that avoids a subtle corner case.

    WIPE_BUFFER(recoded);
    WIPE_BUFFER(add_a);
    WIPE_BUFFER(add_b);
    WIPE_CTX(&entry);
    WIPE_CTX(&dbl);
}

// Derives the public key that matches secret_key, using the hash
// supplied through the vtable.
void crypto_sign_public_key_custom_hash(u8       public_key[32],
                                        const u8 secret_key[32],
                                        const crypto_sign_vtable *hash)
{
    u8 digest[64];
    ge point;

    // The scalar is the trimmed hash of the secret key
    hash->hash(digest, secret_key, 32);
    trim_scalar(digest);

    // public key = encoding of [scalar]B
    ge_scalarmult_base(&point, digest);
    ge_tobytes(public_key, &point);

    WIPE_CTX(&point);
    WIPE_BUFFER(digest);
}

// Same as crypto_sign_public_key_custom_hash(), with crypto_blake2b_vtable
// as the hash.
void crypto_sign_public_key(u8 public_key[32], const u8 secret_key[32])
{
    const crypto_sign_vtable *blake2b = &crypto_blake2b_vtable;
    crypto_sign_public_key_custom_hash(public_key, secret_key, blake2b);
}

// Starts the first signing pass.
// public_key may be null, in which case it is derived from secret_key.
void crypto_sign_init_first_pass_custom_hash(crypto_sign_ctx_abstract *ctx,
                                             const u8 secret_key[32],
                                             const u8 public_key[32],
                                             const crypto_sign_vtable *hash)
{
    ctx->hash = hash; // set vtable

    // The hash of the secret key goes to the start of ctx->buf.
    // Its first 32 bytes become the trimmed scalar; the bytes from
    // offset 32 onwards serve as the prefix.
    u8 *scalar       = ctx->buf;
    u8 *nonce_prefix = ctx->buf + 32;
    hash->hash(scalar, secret_key, 32);
    trim_scalar(scalar);

    if (public_key != 0) {
        COPY(ctx->pk, public_key, 32);
    } else {
        crypto_sign_public_key_custom_hash(ctx->pk, secret_key, hash);
    }

    // Deterministic part of EdDSA: the nonce is built by hashing the
    // message (after the prefix) instead of drawing a random number.
    // An actual random number would work just fine, and would spare us
    // hashing the message twice, but it would let the user reuse a
    // nonce by mistake.
    ctx->hash->init  (ctx);
    ctx->hash->update(ctx, nonce_prefix, 32);
}

// Same as crypto_sign_init_first_pass_custom_hash(), with
// crypto_blake2b_vtable as the hash.
void crypto_sign_init_first_pass(crypto_sign_ctx_abstract *ctx,
                                 const u8 secret_key[32],
                                 const u8 public_key[32])
{
    const crypto_sign_vtable *blake2b = &crypto_blake2b_vtable;
    crypto_sign_init_first_pass_custom_hash(ctx, secret_key, public_key,
                                            blake2b);
}

// Feeds msg_size bytes of msg to the hash of the current signing pass.
void crypto_sign_update(crypto_sign_ctx_abstract *ctx,
                        const u8 *msg, size_t msg_size)
{
    const crypto_sign_vtable *vtable = ctx->hash;
    vtable->update(ctx, msg, msg_size);
}

// Ends the first signing pass and starts the second one.
void crypto_sign_init_second_pass(crypto_sign_ctx_abstract *ctx)
{
    u8 *nonce   = ctx->buf + 32; // r, overwrites the prefix
    u8 *R_bytes = ctx->buf + 64; // first half of the signature

    // r = result of the first pass hash, passed through reduce()
    ctx->hash->final(ctx, nonce);
    reduce(nonce);

    // first half of the signature = "random" nonce times the base point
    ge R_point;
    ge_scalarmult_base(&R_point, nonce);
    ge_tobytes(R_bytes, &R_point);
    WIPE_CTX(&R_point);

    // Second hash: R, then the public key, then the message (which the
    // caller feeds afterwards).  It needs R, so it cannot run in
    // parallel with the first hash.
    ctx->hash->init  (ctx);
    ctx->hash->update(ctx, R_bytes, 32);
    ctx->hash->update(ctx, ctx->pk, 32);
}

// Ends the second signing pass, writes the signature, and wipes the
// context.
void crypto_sign_final(crypto_sign_ctx_abstract *ctx, u8 signature[64])
{
    // h_ram = result of the second pass hash, passed through reduce()
    u8 h_ram[64];
    ctx->hash->final(ctx, h_ram);
    reduce(h_ram);

    const u8 *secret_scalar = ctx->buf;      // a
    const u8 *nonce         = ctx->buf + 32; // r
    const u8 *R_bytes       = ctx->buf + 64; // first half of the signature

    COPY(signature, R_bytes, 32);
    // second half of the signature: s = h_ram * a + r
    mul_add(signature + 32, h_ram, secret_scalar, nonce);

    WIPE_BUFFER(h_ram);
    crypto_wipe(ctx, ctx->hash->ctx_size);
}

// One-shot signature: runs both passes of the incremental interface over
// the whole message.
void crypto_sign(u8        signature[64],
                 const u8  secret_key[32],
                 const u8  public_key[32],
                 const u8 *message, size_t message_size)
{
    crypto_sign_ctx ctx;
    crypto_sign_ctx_abstract *abstract_ctx = (crypto_sign_ctx_abstract*)&ctx;

    // First pass
    crypto_sign_init_first_pass(abstract_ctx, secret_key, public_key);
    crypto_sign_update(abstract_ctx, message, message_size);

    // Second pass
    crypto_sign_init_second_pass(abstract_ctx);
    crypto_sign_update(abstract_ctx, message, message_size);
    crypto_sign_final(abstract_ctx, signature);
}

// Starts a signature verification: stores the signature and the public
// key in the context, then begins hashing.
void crypto_check_init_custom_hash(crypto_check_ctx_abstract *ctx,
                                   const u8 signature[64],
                                   const u8 public_key[32],
                                   const crypto_sign_vtable *hash)
{
    const u8 *R_bytes = signature; // first half of the signature

    ctx->hash = hash; // set vtable
    COPY(ctx->buf, signature, 64);
    COPY(ctx->pk, public_key, 32);

    // The hash covers R and the public key; the caller then feeds the
    // message through crypto_check_update().
    ctx->hash->init(ctx);
    ctx->hash->update(ctx, R_bytes, 32);
    ctx->hash->update(ctx, public_key, 32);
}

// Same as crypto_check_init_custom_hash(), with crypto_blake2b_vtable as
// the hash.
void crypto_check_init(crypto_check_ctx_abstract *ctx, const u8 signature[64],
                       const u8 public_key[32])
{
    const crypto_sign_vtable *blake2b = &crypto_blake2b_vtable;
    crypto_check_init_custom_hash(ctx, signature, public_key, blake2b);
}

// Feeds msg_size bytes of msg to the verification hash.
void crypto_check_update(crypto_check_ctx_abstract *ctx,
                         const u8 *msg, size_t msg_size)
{
    const crypto_sign_vtable *vtable = ctx->hash;
    vtable->update(ctx, msg, msg_size);
}

// Ends a signature verification.
// Returns -1 when ge_r_check() fails, and the result of crypto_verify32()
// on R and the recomputed R otherwise.
int crypto_check_final(crypto_check_ctx_abstract *ctx)
{
    u8 *R       = ctx->buf;      // first half of the signature
    u8 *s       = ctx->buf + 32; // second half of the signature
    u8 *R_check = ctx->pk;       // overwrite ctx->pk to save stack space

    // h_ram = result of the hash, passed through reduce()
    u8 h_ram[64];
    ctx->hash->final(ctx, h_ram);
    reduce(h_ram);

    // R == R_check ? OK : fail
    return ge_r_check(R_check, s, h_ram, ctx->pk)
        ? -1
        : crypto_verify32(R, R_check);
}

// One-shot signature verification, built on the incremental interface.
// Returns whatever crypto_check_final() returns.
int crypto_check(const u8  signature[64], const u8 public_key[32],
                 const u8 *message, size_t message_size)
{
    crypto_check_ctx ctx;
    crypto_check_ctx_abstract *abstract_ctx = (crypto_check_ctx_abstract*)&ctx;

    crypto_check_init(abstract_ctx, signature, public_key);
    crypto_check_update(abstract_ctx, message, message_size);
    return crypto_check_final(abstract_ctx);
}

///////////////////////
/// EdDSA to X25519 ///
///////////////////////
// Converts an EdDSA private key to an X25519 one: the result is the
// first 32 bytes of the BLAKE2b hash of the EdDSA key.
void crypto_from_eddsa_private(u8 x25519[32], const u8 eddsa[32])
{
    u8 digest[64];
    crypto_blake2b(digest, eddsa, 32);
    COPY(x25519, digest, 32);
    WIPE_BUFFER(digest);
}

// Converts an EdDSA public key to an X25519 one.
// With y the field element decoded from eddsa, the result is the
// encoding of (1 + y) / (1 - y).
void crypto_from_eddsa_public(u8 x25519[32], const u8 eddsa[32])
{
    fe num, den;
    fe_frombytes(den, eddsa);   // den = y
    fe_add(num, fe_one, den);   // num = 1 + y
    fe_sub(den, fe_one, den);   // den = 1 - y
    fe_invert(den, den);        // den = 1 / (1 - y)
    fe_mul(num, num, den);      // num = (1 + y) / (1 - y)
    fe_tobytes(x25519, num);

    WIPE_BUFFER(num);
    WIPE_BUFFER(den);
}

/////////////////////////////////////////////
/// Dirty ephemeral public key generation ///
/////////////////////////////////////////////

// These functions generate a public key, *without* clearing the
// cofactor.  Sending that key over the network leaks 3 bits of the
// private key.  Use only to generate ephemeral keys that will be hidden
// with crypto_curve_to_hidden().
//
// The public key is otherwise compatible with crypto_x25519() and
// crypto_key_exchange() (those properly clear the cofactor).
//
// Note that the distribution of the resulting public keys is almost
// uniform.  Flipping the sign of the v coordinate (not provided by this
// function), covers the entire key space almost perfectly, where
// "almost" means a 2^-128 bias (undetectable).  This uniformity is
// needed to ensure the proper randomness of the resulting
// representatives (once we apply crypto_curve_to_hidden()).
//
// Recall that Curve25519 has order C = 2^255 + e, with e < 2^128 (not
// to be confused with the prime order of the main subgroup, L, which is
// 8 times less than that).
//
// Generating all points would require us to multiply a point of order C
// (the base point plus any point of order 8) by all scalars from 0 to
// C-1.  Clamping limits us to scalars between 2^254 and 2^255 - 1. But
// by negating the resulting point at random, we also cover scalars from
// -2^255 + 1 to -2^254 (which modulo C is congruent to e+1 to 2^254 + e).
//
// In practice:
// - Scalars from 0         to e + 1     are never generated
// - Scalars from 2^255     to 2^255 + e are never generated
// - Scalars from 2^254 + 1 to 2^254 + e are generated twice
//
// Since e < 2^128, detecting this bias requires observing over 2^100
// representatives from a given source (this will never happen), *and*
// recovering enough of the private key to determine that they do, or do
// not, belong to the biased set (this practically requires solving
// discrete logarithm, which is conjecturally intractable).
//
// In practice, this means the bias is impossible to detect.

// s + (x*L) % 8*L
// Guaranteed to fit in 256 bits iff s fits in 255 bits.
//   L             < 2^253
//   x%8           < 2^3
//   L * (x%8)     < 2^255
//   s             < 2^255
//   s + L * (x%8) < 2^256
// Adds (x % 8) * L to the little-endian 256-bit number s, in place.
// Any carry out of the top limb is dropped.
static void add_xl(u8 s[32], u8 x)
{
    const u64 factor = x & 7;
    u64 acc = 0;
    FOR (i, 0, 8) {
        u8 *limb = s + 4*i;
        acc += load32_le(limb) + L[i] * factor;
        store32_le(limb, (u32)acc);
        acc >>= 32; // carry into the next limb
    }
}

// "Small" dirty ephemeral key.
// Use if you need to shrink the size of the binary, and can afford to
// slow down by a factor of two (compared to the fast version)
//
// This version works by decoupling the cofactor from the main factor.
//
// - The trimmed scalar determines the main factor
// - The clamped bits of the scalar determine the cofactor.
//
// Cofactor and main factor are combined into a single scalar, which is
// then multiplied by a point of order 8*L (unlike the base point, which
// has prime order).  That "dirty" base point is the addition of the
// regular base point (9), and a point of order 8.
void crypto_x25519_dirty_small(u8 public_key[32], const u8 secret_key[32])
{
    // Base point of order 8*L
    // Raw scalar multiplication with it does not clear the cofactor,
    // and the resulting public key will reveal 3 bits of the scalar.
    static const u8 dirty_base_point[32] = {
        0x34, 0xfc, 0x6c, 0xb7, 0xc8, 0xde, 0x58, 0x97, 0x77, 0x70, 0xd9, 0x52,
        0x16, 0xcc, 0xdc, 0x6c, 0x85, 0x90, 0xbe, 0xcd, 0x91, 0x9c, 0x07, 0x59,
        0x94, 0x14, 0x56, 0x3b, 0x4b, 0xa4, 0x47, 0x0f, };

    // Start from the trimmed scalar (the main factor)
    u8 combined[32];
    COPY(combined, secret_key, 32);
    trim_scalar(combined);

    // Fold the cofactor back in.
    //
    // Trimming cleared the cofactor of the scalar.  The three least
    // significant bits of the secret key however still have a main
    // factor, which must be removed for X25519 compatibility.
    //
    // Since 5*L = 1 (modulo 8):
    //   cofactor = lsb * 5 * L             (modulo 8*L)
    //   combined = scalar + cofactor       (modulo 8*L)
    //   combined = scalar + (lsb * 5 * L)  (modulo 8*L)
    // add_xl() only keeps the multiplier modulo 8, so truncating the
    // product to a byte is harmless.
    const u8 lsb_times_5 = (u8)(secret_key[0] * 5);
    add_xl(combined, lsb_times_5);

    scalarmult(public_key, combined, dirty_base_point, 256);
    WIPE_BUFFER(combined);
}

// "Fast" dirty ephemeral key
// We use this one by default.
//
// This version works by performing a regular scalar multiplication,
// then add a low order point.  The scalar multiplication is done in
// Edwards space for more speed (*2 compared to the "small" version).
// The cost is a bigger binary for programs that don't also sign messages.
// Computes a "dirty" public key from secret_key: [trimmed scalar]B, plus
// a low order point selected by the three low bits of secret_key[0],
// output as a Montgomery u coordinate.
void crypto_x25519_dirty_fast(u8 public_key[32], const u8 secret_key[32])
{
    // pk = [trimmed scalar]B
    u8 scalar[32];
    ge pk;
    COPY(scalar, secret_key, 32);
    trim_scalar(scalar);
    ge_scalarmult_base(&pk, scalar);

    // Select low order point
    // We're computing the [cofactor]lop scalar multiplication, where:
    //   cofactor = tweak & 7.
    //   lop      = (lop_x, lop_y)
    //   lop_x    = sqrt((sqrt(d + 1) + 1) / d)
    //   lop_y    = -lop_x * sqrtm1
    // Notes:
    // - A (single) Montgomery ladder would be twice as slow.
    // - An actual scalar multiplication would hurt performance.
    // - A full table lookup would take more code.
    u8 cofactor = secret_key[0] & 7;
    // a, b, c: bits 2, 1 and 0 of the cofactor
    int a = (cofactor >> 2) & 1;
    int b = (cofactor >> 1) & 1;
    int c = (cofactor >> 0) & 1;
    fe t1, t2, t3;
    // t1 (x coordinate of the selected point, per the formulas above):
    // 0 by default, sqrtm1 if b is set, lop_x if c is set (c wins, since
    // it is copied last), then negated if a is set.
    fe_0(t1);
    fe_ccopy(t1, sqrtm1, b);
    fe_ccopy(t1, lop_x , c);
    fe_neg  (t3, t1);
    fe_ccopy(t1, t3, a);
    // t2 (y coordinate): 1 by default, 0 if b is set, lop_y if c is set,
    // then negated if a and b differ.
    fe_1(t2);
    fe_0(t3);
    fe_ccopy(t2, t3   , b);
    fe_ccopy(t2, lop_y, c);
    fe_neg  (t3, t2);
    fe_ccopy(t2, t3, a^b);
    // Put the point in the form ge_madd() takes:
    // Yp = y + x, Ym = y - x, T2 = x * y * D2
    ge_precomp low_order_point;
    fe_add(low_order_point.Yp, t2, t1);
    fe_sub(low_order_point.Ym, t2, t1);
    fe_mul(low_order_point.T2, t2, t1);
    fe_mul(low_order_point.T2, low_order_point.T2, D2);

    // Add low order point to the public key
    // (t1 and t2 are passed as scratch space from here on)
    ge_madd(&pk, &pk, &low_order_point, t1, t2);

    // Convert to Montgomery u coordinate (we ignore the sign)
    // u = (Z + Y) / (Z - Y)
    fe_add(t1, pk.Z, pk.Y);
    fe_sub(t2, pk.Z, pk.Y);
    fe_invert(t2, t2);
    fe_mul(t1, t1, t2);

    fe_tobytes(public_key, t1);

    WIPE_BUFFER(t1);  WIPE_BUFFER(scalar);
    WIPE_BUFFER(t2);  WIPE_CTX(&pk);
    WIPE_BUFFER(t3);  WIPE_CTX(&low_order_point);
}

///////////////////
/// Elligator 2 ///
///////////////////
// The constant A used by the Elligator 2 maps below (486662).
// Only the first limb is given; the remaining limbs are zero-initialised.
static const fe A = {486662};

// Elligator direct map
//
// Computes the point corresponding to a representative, encoded in 32
// bytes (little endian).  Since positive representatives fit in 254
// bits, the two most significant bits are ignored.
//
// From the paper:
// w = -A / (fe(1) + non_square * r^2)
// e = chi(w^3 + A*w^2 + w)
// u = e*w - (fe(1)-e)*(A//2)
// v = -e * sqrt(u^3 + A*u^2 + u)
//
// We ignore v because we don't need it for X25519 (the Montgomery
// ladder only uses u).
//
// Note that e is either 0, 1 or -1
// if e = 0    u = 0  and v = 0
// if e = 1    u = w
// if e = -1   u = -w - A = w * non_square * r^2
//
// Let r1 = non_square * r^2
// Let r2 = 1 + r1
// Note that r2 cannot be zero, -1/non_square is not a square.
// We can (tediously) verify that:
//   w^3 + A*w^2 + w = (A^2*r1 - r2^2) * A / r2^3
// Therefore:
//   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3))
//   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)) * 1
//   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)) * chi(r2^6)
//   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)  *     r2^6)
//   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) *  A * r2^3)
// Corollary:
//   e =  1 if ((A^2*r1 - r2^2) *  A * r2^3) is a non-zero square
//   e = -1 if ((A^2*r1 - r2^2) *  A * r2^3) is not a square
//   Note that w^3 + A*w^2 + w (and therefore e) can never be zero:
//     w^3 + A*w^2 + w = w * (w^2 + A*w + 1)
//     w^3 + A*w^2 + w = w * (w^2 + A*w + A^2/4 - A^2/4 + 1)
//     w^3 + A*w^2 + w = w * ((w + A/2)^2       - A^2/4 + 1)
//     which is zero only if:
//       w = 0                   (impossible)
//       (w + A/2)^2 = A^2/4 - 1 (impossible, because A^2/4-1 is not a square)
//
// Let isr   = invsqrt((A^2*r1 - r2^2) *  A * r2^3)
//     isr   = sqrt(1        / ((A^2*r1 - r2^2) *  A * r2^3)) if e =  1
//     isr   = sqrt(sqrt(-1) / ((A^2*r1 - r2^2) *  A * r2^3)) if e = -1
//
// if e = 1
//   let u1 = -A * (A^2*r1 - r2^2) * A * r2^2 * isr^2
//       u1 = w
//       u1 = u
//
// if e = -1
//   let ufactor = -non_square * sqrt(-1) * r^2
//   let vfactor = sqrt(ufactor)
//   let u2 = -A * (A^2*r1 - r2^2) * A * r2^2 * isr^2 * ufactor
//       u2 = w * -1 * -non_square * r^2
//       u2 = w * non_square * r^2
//       u2 = u
// Maps the representative `hidden` to the u coordinate of a curve point,
// written to `curve`.  The names r1, r2, isr, u1 and u2 in the comments
// below are those of the derivation that precedes this function.
void crypto_hidden_to_curve(uint8_t curve[32], const uint8_t hidden[32])
{
    // Representatives are encoded in 254 bits.
    // The two most significant ones are random padding that must be ignored.
    u8 clamped[32];
    COPY(clamped, hidden, 32);
    clamped[31] &= 0x3f;

    fe r, u, t1, t2, t3;
    fe_frombytes(r, clamped);
    // t1 = r1 (presumably fe_sq2 computes 2*r^2; confirm against fe_sq2)
    fe_sq2(t1, r);
    // u = r2 = 1 + r1
    fe_add(u, t1, fe_one);
    // t2 = r2^2
    fe_sq (t2, u);
    // t3 = (A^2*r1 - r2^2) * A   (A2 is presumably A^2; confirm)
    fe_mul(t3, A2, t1);
    fe_sub(t3, t3, t2);
    fe_mul(t3, t3, A);
    // t1 = (A^2*r1 - r2^2) * A * r2^3
    fe_mul(t1, t2, u);
    fe_mul(t1, t3, t1);
    // t1 = isr; is_square tells which of the two cases (e = 1 or e = -1)
    // applies
    int is_square = invsqrt(t1, t1);
    // u = ufactor * r^2, replaced by 1 in the square case
    fe_sq(u, r);
    fe_mul(u, u, ufactor);
    fe_ccopy(u, fe_one, is_square);
    // t1 = isr^2
    fe_sq (t1, t1);
    // u = -A * (A^2*r1 - r2^2) * A * r2^2 * isr^2 [* ufactor]
    fe_mul(u, u, A);
    fe_mul(u, u, t3);
    fe_mul(u, u, t2);
    fe_mul(u, u, t1);
    fe_neg(u, u);
    fe_tobytes(curve, u);

    WIPE_BUFFER(t1);  WIPE_BUFFER(r);
    WIPE_BUFFER(t2);  WIPE_BUFFER(u);
    WIPE_BUFFER(t3);  WIPE_BUFFER(clamped);
}

// Elligator inverse map
//
// Computes the representative of a point, if possible.  If not, it does
// nothing and returns -1.  Note that the success of the operation
// depends only on the point (more precisely its u coordinate).  The
// tweak parameter is used only upon success
//
// The tweak should be a random byte.  Beyond that, its contents are an
// implementation detail. Currently, the tweak comprises:
// - Bit  0  : sign of the v coordinate (0 if positive, 1 if negative)
// - Bits 1-5: not used
// - Bits 6-7: random padding
//
// From the paper:
// Let sq = -non_square * u * (u+A)
// if sq is not a square, or u = -A, there is no mapping
// Assuming there is a mapping:
//   if v is positive: r = sqrt(-u     / (non_square * (u+A)))
//   if v is negative: r = sqrt(-(u+A) / (non_square * u    ))
//
// We compute isr = invsqrt(-non_square * u * (u+A))
// if it wasn't a non-zero square, abort.
// else, isr = sqrt(-1 / (non_square * u * (u+A)))
//
// This causes us to abort if u is zero, even though we shouldn't. This
// never happens in practice, because (i) a random point in the curve has
// a negligible chance of being zero, and (ii) scalar multiplication with
// a trimmed scalar *never* yields zero.
//
// Since:
//   isr * (u+A) = sqrt(-1     / (non_square * u * (u+A))) * (u+A)
//   isr * (u+A) = sqrt(-(u+A) / (non_square * u))
// and:
//   isr * u = sqrt(-1 / (non_square * u * (u+A))) * u
//   isr * u = sqrt(-u / (non_square * (u+A)))
// Therefore:
//   if v is positive: r = isr * u
//   if v is negative: r = isr * (u+A)
// Returns 0 and writes the representative to `hidden` on success.
// Returns -1 and leaves `hidden` untouched when invsqrt() reports that
// -2 * u * (u+A) is not a non-zero square.
int crypto_curve_to_hidden(u8 hidden[32], const u8 public_key[32], u8 tweak)
{
    fe t1, t2, t3;
    int status = -1;

    fe_frombytes(t1, public_key);  // t1 = u
    fe_add(t2, t1, A);             // t2 = u + A
    fe_mul(t3, t1, t2);
    fe_mul_small(t3, t3, -2);      // t3 = -2 * u * (u + A)

    // The only variable time bit.  This ultimately reveals how many
    // tries it took us to find a representable key.
    // This does not affect security as long as we try keys at random.
    if (invsqrt(t3, t3)) {
        // t3 = isr.  Bit 0 of the tweak picks u or u + A as the factor.
        fe_ccopy    (t1, t2, tweak & 1);
        fe_mul      (t3, t1, t3);
        // Of t3 and -t3, keep the one whose double is even
        fe_mul_small(t1, t3, 2);
        fe_neg      (t2, t3);
        fe_ccopy    (t3, t2, fe_isodd(t1));
        fe_tobytes(hidden, t3);

        // Pad with two random bits
        hidden[31] |= tweak & 0xc0;
        status = 0;
    }

    WIPE_BUFFER(t1);
    WIPE_BUFFER(t2);
    WIPE_BUFFER(t3);
    return status;
}

// Generates a secret key and the hidden representative of its public
// key from seed.  The seed is wiped.
void crypto_hidden_key_pair(u8 hidden[32], u8 secret_key[32], u8 seed[32])
{
    u8 buf[64]; // seed + representative
    u8 pk [32]; // public key

    // buf[ 0..31]: candidate secret key
    // buf[32..63]: seed of the next attempt, then the representative
    COPY(buf + 32, seed, 32);
    for (;;) {
        crypto_chacha20(buf, 0, 64, buf+32, zero);
        crypto_x25519_dirty_fast(pk, buf); // or the "small" version
        // The return value of crypto_curve_to_hidden() does not depend on
        // its tweak parameter, so buf[32] is not actually reused: either
        // we loop once more and buf[32] is part of the new seed, or we
        // are done and buf[32] was the tweak.
        if (crypto_curve_to_hidden(buf+32, pk, buf[32]) == 0) {
            break;
        }
    }

    crypto_wipe(seed, 32);
    COPY(hidden    , buf + 32, 32);
    COPY(secret_key, buf     , 32);
    WIPE_BUFFER(pk);
    WIPE_BUFFER(buf);
}

////////////////////
/// Key exchange ///
////////////////////
// Derives a shared key from our secret key and their public key.
//
// shared_key       : receives the 32-byte result
// your_secret_key  : our 32-byte secret key
// their_public_key : the other party's 32-byte public key
//
// crypto_x25519() first writes its output into shared_key; that buffer
// is then passed to crypto_hchacha20() as both output and second
// argument, so the value is transformed in place and never copied to a
// temporary.
// NOTE(review): `zero` (third argument) is presumably the file's
// all-zero constant buffer -- confirm at its definition.
void crypto_key_exchange(u8       shared_key[32],
                         const u8 your_secret_key [32],
                         const u8 their_public_key[32])
{
    crypto_x25519(shared_key, your_secret_key, their_public_key);
    crypto_hchacha20(shared_key, shared_key, zero);
}

///////////////////////
/// Scalar division ///
///////////////////////

// Montgomery reduction: u = x / 2^256 (modulo L).
//
// Precondition:
//   x < L * 2^256
// Constants:
//   r = 2^256                 (dividing by r is just dropping 8 words)
//   k = (r * (1/r) - 1) // L  (1/r is computed modulo L)
// Steps:
//   s = (x * k) % r
//   t = x + s*L      (t is always a multiple of r)
//   u = (t/r) % L    (t/r is always below 2*L, so one conditional
//                     subtraction is enough)
//
// u : 8 little-endian 32-bit words, output
// x : 16 little-endian 32-bit words, input (not modified)
static void redc(u32 u[8], u32 x[16])
{
    static const u32 k[8]  = { 0x12547e1b, 0xd2b51da3, 0xfdba84ff, 0xb1a206f2,
                               0xffa36bea, 0x14e75438, 0x6fe91836, 0x9db6c6f2,};
    static const u32 l[8]  = { 0x5cf5d3ed, 0x5812631a, 0xa2f79cd6, 0x14def9de,
                               0x00000000, 0x00000000, 0x00000000, 0x10000000,};

    // low = x * k (modulo 2^256)
    // Only the 8 low words are needed, so row i of the schoolbook
    // product stops at word 7.  Cheaper than the full multiplication.
    u32 low[8] = {0};
    FOR (i, 0, 8) {
        u64 acc = 0;
        FOR (pos, i, 8) {
            acc      += low[pos] + (u64)x[i] * k[pos - i];
            low[pos]  = (u32)acc;
            acc     >>= 32;
        }
    }

    // sum = low * L
    // NOTE(review): multiply() presumably adds the product to its first
    // argument, hence the zero initialisation -- confirm at its definition.
    u32 sum[16] = {0};
    multiply(sum, low, l);

    // sum = sum + x
    u64 carry = 0;
    FOR (i, 0, 16) {
        carry  += (u64)sum[i] + x[i];
        sum[i]  = (u32)carry;
        carry >>= 32;
    }

    // u = (sum / 2^256) % L
    // The 8 high words are below 2*L, so the conditional subtraction
    // done by remove_l() is enough.
    remove_l(u, sum + 8);

    WIPE_BUFFER(low);
    WIPE_BUFFER(sum);
}

void crypto_x25519_inverse(u8 blind_salt [32], const u8 private_key[32],
                           const u8 curve_point[32])
{
    static const  u8 Lm2[32] = { // L - 2
        0xeb, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, 0xd6, 0x9c, 0xf7, 0xa2,
        0xde, 0xf9, 0xde, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, };
    // 1 in Montgomery form
    u32 m_inv [8] = {0x8d98951d, 0xd6ec3174, 0x737dcf70, 0xc6ef5bf4,
                     0xfffffffe, 0xffffffff, 0xffffffff, 0x0fffffff,};

    u8 scalar[32];
    COPY(scalar, private_key, 32);
    trim_scalar(scalar);

    // Convert the scalar in Montgomery form
    // m_scl = scalar * 2^256 (modulo L)
    u32 m_scl[8];
    {
        u32 tmp[16];
        ZERO(tmp, 8);
        load32_le_buf(tmp+8, scalar, 8);
        mod_l(scalar, tmp);
        load32_le_buf(m_scl, scalar, 8);
        WIPE_BUFFER(tmp); // Wipe ASAP to save stack space
    }

    u32 product[16];
    for (int i = 252; i >= 0; i--) {
        ZERO(product, 16);
        multiply(product, m_inv, m_inv);
        redc(m_inv, product);
        if (scalar_bit(Lm2, i)) {
            ZERO(product, 16);
            multiply(product, m_inv, m_scl);
            redc(m_inv, product);
        }
    }
    // Convert the inverse *out* of Montgomery form
    // scalar = m_inv / 2^256 (modulo L)
    COPY(product, m_inv, 8);
    ZERO(product + 8, 8);
    redc(m_inv, product);
    store32_le_buf(scalar, m_inv, 8); // the *inverse* of the scalar

    // Clear the cofactor of scalar:
    //   cleared = scalar * (3*L + 1)      (modulo 8*L)
    //   cleared = scalar + scalar * 3 * L (modulo 8*L)
    // Note that (scalar * 3) is reduced modulo 8, so we only need the
    // first byte.
    add_xl(scalar, scalar[0] * 3);

    // Recall that 8*L < 2^256. However it is also very close to
    // 2^255. If we spanned the ladder over 255 bits, random tests
    // wouldn't catch the off-by-one error.
    scalarmult(blind_salt, scalar, curve_point, 256);

    WIPE_BUFFER(scalar);   WIPE_BUFFER(m_scl);
    WIPE_BUFFER(product);  WIPE_BUFFER(m_inv);
}

////////////////////////////////
/// Authenticated encryption ///
////////////////////////////////
// Computes the 16-byte Poly1305 tag of an AEAD message.
//
// mac         : receives the tag
// auth_key    : 32-byte one-time Poly1305 key
// ad          : additional data, ad_size bytes
// cipher_text : encrypted message, text_size bytes
//
// The authenticator is fed, in this order: the additional data, zero
// bytes, the cipher text, zero bytes, then both sizes as 64-bit
// little-endian integers.
// NOTE(review): align(n, 16) presumably yields the number of padding
// bytes needed to reach a multiple of 16, and `zero` an all-zero
// buffer at least that long -- confirm at their definitions.
static void lock_auth(u8 mac[16], const u8  auth_key[32],
                      const u8 *ad         , size_t ad_size,
                      const u8 *cipher_text, size_t text_size)
{
    const size_t ad_padding   = align(ad_size  , 16);
    const size_t text_padding = align(text_size, 16);

    u8 lengths[16]; // Not secret, not wiped
    store64_le(lengths    , ad_size);
    store64_le(lengths + 8, text_size);

    crypto_poly1305_ctx ctx; // wiped by crypto_poly1305_final()
    crypto_poly1305_init(&ctx, auth_key);

    crypto_poly1305_update(&ctx, ad, ad_size);
    crypto_poly1305_update(&ctx, zero, ad_padding);

    crypto_poly1305_update(&ctx, cipher_text, text_size);
    crypto_poly1305_update(&ctx, zero, text_padding);

    crypto_poly1305_update(&ctx, lengths, 16);
    crypto_poly1305_final(&ctx, mac);
}

void crypto_lock_aead(u8 mac[16], u8 *cipher_text,
                      const u8  key[32], const u8  nonce[24],
                      const u8 *ad        , size_t ad_size,
                      const u8 *plain_text, size_t text_size)
{
    u8 sub_key[32];
    u8 auth_key[64]; // "Wasting" the whole Chacha block is faster
    crypto_hchacha20(sub_key, key, nonce);