diff --git a/CHANGELOG b/CHANGELOG
new file mode 100644
index 0000000..2d46afe
--- /dev/null
+++ b/CHANGELOG
@@ -0,0 +1,34 @@
+libuecc v7 (2016/03/27)
+
+* Change conversion between Ed25519 and legacy representation. This should
+    not affect any operations unless Ed25519 and legacy load/store
+    functions are mixed when accessing a work structure. Doing so is now
+    officially supported, for example to convert a legacy public key to
+    Ed25519 format.
+* The changed representation allows using the same
+    ecc_25519_work_default_base for both Ed25519 and legacy.
+    ecc_25519_work_default_base and ecc_25519_scalarmult_base have been
+    undeprecated, ecc_25519_work_base_ed25519 and
+    ecc_25519_work_base_legacy are deprecated now.
+* All points are now internally represented with Ed25519 coordinates, which
+    allows about 6% faster scalar multiplication than the legacy
+    representation.
+* ecc_25519_scalarmult_base has been further optimized, making it another
+    6% faster than normal ecc_25519_scalarmult.
+
+
+libuecc v6 (2015/10/25)
+
+* Fixes a bug which might have caused a point's y coordinate to be negated
+    in certain circumstances when the point was stored in packed
+    representation and loaded again. It is extremely improbable that this
+    has ever actually happened, as only a small range of coordinates was
+    affected.
+* Use stdint types to clarify ABI and add support for systems with
+    sizeof(int) < 4 (this is not an ABI break in practice as all systems on
+    which libuecc has been used in the past should have int == int32_t)
+* Add point negation and subtraction functions
+* Rename all point access functions to bear a _legacy suffix (the old names
+    are still available, but marked as deprecated)
+* Add new point access functions and a new generator point that are
+    compatible with Ed25519
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cae20ed..779ac41 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 2.6)
 project(LIBUECC C)
-set(PROJECT_VERSION 5)
+set(PROJECT_VERSION 7)
 
 set(CMAKE_MODULE_PATH ${LIBUECC_SOURCE_DIR})
 
diff --git a/README b/README
new file mode 100644
index 0000000..6e0ec9b
--- /dev/null
+++ b/README
@@ -0,0 +1,30 @@
+libuecc is a very small general-purpose Elliptic Curve Cryptography library
+compatible with Ed25519.
+
+Most documentation can be found as Doxygen comments in the ecc.h header
+file. You can use `make doxygen` after running CMake to create HTML
+documentation from it.
+
+There are two sets of functions converting between libuecc's internal point
+representation and coordinates or compressed representation. The functions
+ending with _ed25519 use the same representation as the original Ed25519
+implementation and should be used by new software. The functions with the
+suffix _legacy are provided for compatibility with libuecc versions before
+v6.
+
+Ed25519 and the legacy representation are isomorphic; they use a Twisted
+Edwards Curve
+
+    ax^2 + y^2 = 1 + dx^2y^2
+
+over the prime field for p = 2^255 - 19.
+
+Ed25519 uses the parameters
+
+    a = -1 and
+    d = -(121665/121666),
+
+while the legacy curve has
+
+    a = 486664 and
+    d = 486660.
diff --git a/include/libuecc/ecc.h b/include/libuecc/ecc.h
index 4f6b870..1fb6106 100644
--- a/include/libuecc/ecc.h
+++ b/include/libuecc/ecc.h
@@ -27,6 +27,14 @@
 #ifndef _LIBUECC_ECC_H_
 #define _LIBUECC_ECC_H_
 
+#ifndef DEPRECATED
+#define DEPRECATED __attribute__((deprecated))
+#endif
+
+
+#include <stdint.h>
+
+
 /**
  * A 256 bit integer
  *
@@ -34,7 +42,7 @@
  */
 typedef union _ecc_int256 {
 	/** Data bytes */
-	unsigned char p[32];
+	uint8_t p[32];
 } ecc_int256_t;
 
 /**
@@ -44,10 +52,10 @@ typedef union _ecc_int256 {
  * it should always be packed.
  */
 typedef struct _ecc_25519_work {
-	unsigned int X[32];
-	unsigned int Y[32];
-	unsigned int Z[32];
-	unsigned int T[32];
+	uint32_t X[32];
+	uint32_t Y[32];
+	uint32_t Z[32];
+	uint32_t T[32];
 } ecc_25519_work_t;
 
 /**
@@ -55,22 +63,205 @@ typedef struct _ecc_25519_work {
  * @{
  */
 
+/** The identity element */
 extern const ecc_25519_work_t ecc_25519_work_identity;
+
+
+/**
+ * The Ed25519 default generator point
+ *
+ * \deprecated Use the equivalent \ref ecc_25519_work_default_base instead.
+ *
+ **/
+DEPRECATED extern const ecc_25519_work_t ecc_25519_work_base_ed25519;
+
+/**
+ * The Ed25519 default generator point
+ *
+ * \deprecated Use the equivalent \ref ecc_25519_work_default_base instead.
+ */
+DEPRECATED extern const ecc_25519_work_t ecc_25519_work_base_legacy;
+
+
+/**
+ * The Ed25519 default generator point
+ *
+ * The order of the base point is \f$ 2^{252} + 27742317777372353535851937790883648493 \f$.
+ */
 extern const ecc_25519_work_t ecc_25519_work_default_base;
 
-int ecc_25519_load_xy(ecc_25519_work_t *out, const ecc_int256_t *x, const ecc_int256_t *y);
-void ecc_25519_store_xy(ecc_int256_t *x, ecc_int256_t *y, const ecc_25519_work_t *in);
 
-int ecc_25519_load_packed(ecc_25519_work_t *out, const ecc_int256_t *in);
-void ecc_25519_store_packed(ecc_int256_t *out, const ecc_25519_work_t *in);
+/** Loads a point of the Ed25519 curve with given coordinates into its unpacked representation */
+int ecc_25519_load_xy_ed25519(ecc_25519_work_t *out, const ecc_int256_t *x, const ecc_int256_t *y);
 
+/**
+ * Loads a point of the legacy curve with given coordinates into its unpacked representation
+ *
+ * New software should use \ref ecc_25519_load_xy_ed25519, which uses the same curve as the Ed25519 algorithm.
+ */
+int ecc_25519_load_xy_legacy(ecc_25519_work_t *out, const ecc_int256_t *x, const ecc_int256_t *y);
+
+/**
+ * Loads a point of the legacy curve with given coordinates into its unpacked representation
+ *
+ * \deprecated Use \ref ecc_25519_load_xy_legacy
+ */
+DEPRECATED int ecc_25519_load_xy(ecc_25519_work_t *out, const ecc_int256_t *x, const ecc_int256_t *y);
+
+
+/**
+ * Stores the x and y coordinates of a point of the Ed25519 curve
+ *
+ * \param x Returns the x coordinate of the point. May be NULL.
+ * \param y Returns the y coordinate of the point. May be NULL.
+ * \param in The unpacked point to store.
+ */
+void ecc_25519_store_xy_ed25519(ecc_int256_t *x, ecc_int256_t *y, const ecc_25519_work_t *in);
+
+/**
+ * Stores the x and y coordinates of a point of the legacy curve
+ *
+ * New software should use \ref ecc_25519_store_xy_ed25519, which uses the same curve as the Ed25519 algorithm.
+ *
+ * \param x Returns the x coordinate of the point. May be NULL.
+ * \param y Returns the y coordinate of the point. May be NULL.
+ * \param in The unpacked point to store.
+ */
+void ecc_25519_store_xy_legacy(ecc_int256_t *x, ecc_int256_t *y, const ecc_25519_work_t *in);
+
+/**
+ * Stores a point's x and y coordinates
+ *
+ * \param x Returns the x coordinate of the point. May be NULL.
+ * \param y Returns the y coordinate of the point. May be NULL.
+ * \param in The unpacked point to store.
+ *
+ * \deprecated Use \ref ecc_25519_store_xy_legacy
+ */
+DEPRECATED void ecc_25519_store_xy(ecc_int256_t *x, ecc_int256_t *y, const ecc_25519_work_t *in);
+
+
+/**
+ * Loads a packed point of the Ed25519 curve into its unpacked representation
+ *
+ * The packed format is different from the legacy one: the legacy format contains the X coordinate and the parity of the Y coordinate,
+ * Ed25519 uses the Y coordinate and the parity of the X coordinate.
+ */
+int ecc_25519_load_packed_ed25519(ecc_25519_work_t *out, const ecc_int256_t *in);
+
+/**
+ * Loads a packed point of the legacy curve into its unpacked representation
+ *
+ * New software should use \ref ecc_25519_load_packed_ed25519, which uses the same curve and packed representation as the Ed25519 algorithm.
+ *
+ * The packed format is different from the Ed25519 one: the legacy format contains the X coordinate and the parity of the Y coordinate,
+ * Ed25519 uses the Y coordinate and the parity of the X coordinate.
+ */
+int ecc_25519_load_packed_legacy(ecc_25519_work_t *out, const ecc_int256_t *in);
+
+/**
+ * Loads a packed point of the legacy curve into its unpacked representation
+ *
+ * \deprecated Use \ref ecc_25519_load_packed_legacy
+ */
+DEPRECATED int ecc_25519_load_packed(ecc_25519_work_t *out, const ecc_int256_t *in);
+
+
+/**
+ * Stores a point of the Ed25519 curve into its packed representation
+ *
+ * The packed format is different from the legacy one: the legacy format contains the X coordinate and the parity of the Y coordinate,
+ * Ed25519 uses the Y coordinate and the parity of the X coordinate.
+ */
+void ecc_25519_store_packed_ed25519(ecc_int256_t *out, const ecc_25519_work_t *in);
+
+/**
+ * Stores a point of the legacy curve into its packed representation
+ *
+ * New software should use \ref ecc_25519_store_packed_ed25519, which uses the same curve and packed representation as the Ed25519 algorithm.
+ *
+ * The packed format is different from the Ed25519 one: the legacy format contains the X coordinate and the parity of the Y coordinate,
+ * Ed25519 uses the Y coordinate and the parity of the X coordinate.
+ */
+void ecc_25519_store_packed_legacy(ecc_int256_t *out, const ecc_25519_work_t *in);
+
+/**
+ * Stores a point of the legacy curve into its packed representation
+ *
+ * \deprecated Use \ref ecc_25519_store_packed_legacy
+ */
+DEPRECATED void ecc_25519_store_packed(ecc_int256_t *out, const ecc_25519_work_t *in);
+
+
+/** Checks if a point is the identity element of the Elliptic Curve group */
 int ecc_25519_is_identity(const ecc_25519_work_t *in);
+
+/**
+ * Negates a point of the Elliptic Curve
+ *
+ * The same pointer may be given for input and output
+ */
+void ecc_25519_negate(ecc_25519_work_t *out, const ecc_25519_work_t *in);
+
+/**
+ * Doubles a point of the Elliptic Curve
+ *
+ * ecc_25519_double(out, in) is equivalent to ecc_25519_add(out, in, in), but faster.
+ *
+ * The same pointer may be given for input and output.
+ */
 void ecc_25519_double(ecc_25519_work_t *out, const ecc_25519_work_t *in);
+
+/**
+ * Adds two points of the Elliptic Curve
+ *
+ * The same pointers may be given for input and output.
+ */
 void ecc_25519_add(ecc_25519_work_t *out, const ecc_25519_work_t *in1, const ecc_25519_work_t *in2);
 
+/**
+ * Subtracts two points of the Elliptic Curve
+ *
+ * The same pointers may be given for input and output.
+ */
+void ecc_25519_sub(ecc_25519_work_t *out, const ecc_25519_work_t *in1, const ecc_25519_work_t *in2);
+
+/**
+ * Does a scalar multiplication of a point of the Elliptic Curve with an integer of a given bit length
+ *
+ * This can be used to speed up scalar multiplication when it is known that not the whole 256 bits of the scalar
+ * are used. The bit length should always be a constant and not computed at runtime to ensure
+ * that no timing attacks are possible.
+ *
+ * The same pointer may be given for input and output.
+ **/
 void ecc_25519_scalarmult_bits(ecc_25519_work_t *out, const ecc_int256_t *n, const ecc_25519_work_t *base, unsigned bits);
+
+/**
+ * Does a scalar multiplication of a point of the Elliptic Curve with an integer
+ *
+ * The same pointer may be given for input and output.
+ **/
 void ecc_25519_scalarmult(ecc_25519_work_t *out, const ecc_int256_t *n, const ecc_25519_work_t *base);
+
+/**
+ * Does a scalar multiplication of the default base point (generator element) of the Elliptic Curve with an integer of a given bit length
+ *
+ * The order of the base point is \f$ 2^{252} + 27742317777372353535851937790883648493 \f$.
+ *
+ * ecc_25519_scalarmult_base_bits(out, n, bits) is faster than ecc_25519_scalarmult_bits(out, n, &ecc_25519_work_default_base, bits).
+ *
+ * See the notes about \ref ecc_25519_scalarmult_bits before using this function.
+ */
 void ecc_25519_scalarmult_base_bits(ecc_25519_work_t *out, const ecc_int256_t *n, unsigned bits);
+
+/**
+ * Does a scalar multiplication of the default base point (generator element) of the Elliptic Curve with an integer
+ *
+ * The order of the base point is \f$ 2^{252} + 27742317777372353535851937790883648493 \f$.
+ *
+ * ecc_25519_scalarmult_base(out, n) is faster than ecc_25519_scalarmult(out, n, &ecc_25519_work_default_base).
+ */
 void ecc_25519_scalarmult_base(ecc_25519_work_t *out, const ecc_int256_t *n);
 
 /**@}*/
@@ -80,14 +271,61 @@ void ecc_25519_scalarmult_base(ecc_25519_work_t *out, const ecc_int256_t *n);
  * @{
  */
 
+/**
+ * The order of the prime field
+ *
+ * The order is \f$ 2^{252} + 27742317777372353535851937790883648493 \f$.
+ */
 extern const ecc_int256_t ecc_25519_gf_order;
 
+
+/** Checks if an integer is equal to zero (after reduction) */
 int ecc_25519_gf_is_zero(const ecc_int256_t *in);
+
+/**
+ * Adds two integers as Galois field elements
+ *
+ * The same pointers may be given for input and output.
+ */
 void ecc_25519_gf_add(ecc_int256_t *out, const ecc_int256_t *in1, const ecc_int256_t *in2);
+
+/**
+ * Subtracts two integers as Galois field elements
+ *
+ * The same pointers may be given for input and output.
+ */
 void ecc_25519_gf_sub(ecc_int256_t *out, const ecc_int256_t *in1, const ecc_int256_t *in2);
+
+/**
+ * Reduces an integer to a unique representation in the range \f$ [0,q-1] \f$
+ *
+ * The same pointer may be given for input and output.
+ */
 void ecc_25519_gf_reduce(ecc_int256_t *out, const ecc_int256_t *in);
+
+/**
+ * Multiplies two integers as Galois field elements
+ *
+ * The same pointers may be given for input and output.
+ */
 void ecc_25519_gf_mult(ecc_int256_t *out, const ecc_int256_t *in1, const ecc_int256_t *in2);
+
+/**
+ * Computes the reciprocal of a Galois field element
+ *
+ * The same pointers may be given for input and output.
+ */
 void ecc_25519_gf_recip(ecc_int256_t *out, const ecc_int256_t *in);
+
+/**
+ * Ensures some properties of a Galois field element to make it fit for use as a secret key
+ *
+ * This sets the 255th bit and clears the 256th and the bottom three bits (so the key
+ * will be a multiple of 8). See Daniel J. Bernstein's paper "Curve25519: new Diffie-Hellman speed records."
+ * for the rationale of this.
+ *
+ * The same pointer may be given for input and output.
+ */
 void ecc_25519_gf_sanitize_secret(ecc_int256_t *out, const ecc_int256_t *in);
 
 /**@}*/
diff --git a/src/ec25519.c b/src/ec25519.c
index 736b798..0ed3741 100644
--- a/src/ec25519.c
+++ b/src/ec25519.c
@@ -25,156 +25,324 @@
 */
 
 /** \file
- * EC group operations for Twisted Edwards Curve \f$ ax^2 + y^2 = 1 + dx^2y^2 \f$ with
- *    \f$ a = 486664 \f$ and
- *    \f$ d = 486660 \f$
+ * EC group operations for Twisted Edwards Curve \f$ ax^2 + y^2 = 1 + dx^2y^2 \f$
  * on prime field \f$ p = 2^{255} - 19 \f$.
  *
- * The curve is equivalent to the Montgomery Curve used in D. J. Bernstein's
+ * Two different (isomorphic) sets of curve parameters are supported:
+ *
+ *    \f$ a = 486664 \f$ and
+ *    \f$ d = 486660 \f$
+ * are the parameters used by the original libuecc implementation (till v5).
+ * To use points on this curve, use the functions with the suffix \em legacy.
+ *
+ * The other supported curve uses the parameters
+ *    \f$ a = -1 \f$ and
+ *    \f$ d = -(121665/121666) \f$,
+ * which is the curve used by the Ed25519 algorithm. The functions for this curve
+ * have the suffix \em ed25519.
+ *
+ * Internally, libuecc always uses the latter representation for its \em work structure.
+ *
+ * The curves are equivalent to the Montgomery Curve used in D. J. Bernstein's
  * Curve25519 Diffie-Hellman algorithm.
  *
  * See http://hyperelliptic.org/EFD/g1p/auto-twisted-extended.html for add and
  * double operations.
+ *
+ * Doxygen comments for public APIs can be found in the public header file.
+ *
+ * Invariant that must be upheld by all public APIs: the components of an
+ * \ref ecc_25519_work_t are always in the range \f$ [0, 2p) \f$.
+ * Integers in this range will be called \em squeezed in the following.
  */
 
 #include <libuecc/ecc.h>
 
 
-/** The identity element */
 const ecc_25519_work_t ecc_25519_work_identity = {{0}, {1}, {1}, {0}};
 
-
-/** The ec25519 default base */
-const ecc_25519_work_t ecc_25519_work_default_base = {
-	{0xd4, 0x6b, 0xfe, 0x7f, 0x39, 0xfa, 0x8c, 0x22,
-	 0xe1, 0x96, 0x23, 0xeb, 0x26, 0xb7, 0x8e, 0x6a,
-	 0x34, 0x74, 0x8b, 0x66, 0xd6, 0xa3, 0x26, 0xdd,
-	 0x19, 0x5e, 0x9f, 0x21, 0x50, 0x43, 0x7c, 0x54},
+const ecc_25519_work_t ecc_25519_work_base_legacy = {
+	{0x1a, 0xd5, 0x25, 0x8f, 0x60, 0x2d, 0x56, 0xc9,
+	 0xb2, 0xa7, 0x25, 0x95, 0x60, 0xc7, 0x2c, 0x69,
+	 0x5c, 0xdc, 0xd6, 0xfd, 0x31, 0xe2, 0xa4, 0xc0,
+	 0xfe, 0x53, 0x6e, 0xcd, 0xd3, 0x36, 0x69, 0x21},
 	{0x58, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66,
 	 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66,
 	 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66,
 	 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66},
 	{1},
-	{0x47, 0x56, 0x98, 0x99, 0xc7, 0x61, 0x0a, 0x82,
-	 0x1a, 0xdf, 0x82, 0x22, 0x1f, 0x2c, 0x72, 0x88,
-	 0xc3, 0x29, 0x09, 0x52, 0x78, 0xe9, 0x1e, 0xe4,
-	 0x47, 0x4b, 0x4c, 0x81, 0xa6, 0x02, 0xfd, 0x29}
+	{0xa3, 0xdd, 0xb7, 0xa5, 0xb3, 0x8a, 0xde, 0x6d,
+	 0xf5, 0x52, 0x51, 0x77, 0x80, 0x9f, 0xf0, 0x20,
+	 0x7d, 0xe3, 0xab, 0x64, 0x8e, 0x4e, 0xea, 0x66,
+	 0x65, 0x76, 0x8b, 0xd7, 0x0f, 0x5f, 0x87, 0x67},
+};
+
+const ecc_25519_work_t ecc_25519_work_default_base = {
+	{0x1a, 0xd5, 0x25, 0x8f, 0x60, 0x2d, 0x56, 0xc9,
+	 0xb2, 0xa7, 0x25, 0x95, 0x60, 0xc7, 0x2c, 0x69,
+	 0x5c, 0xdc, 0xd6, 0xfd, 0x31, 0xe2, 0xa4, 0xc0,
+	 0xfe, 0x53, 0x6e, 0xcd, 0xd3, 0x36, 0x69, 0x21},
+	{0x58, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66,
+	 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66,
+	 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66,
+	 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66},
+	{1},
+	{0xa3, 0xdd, 0xb7, 0xa5, 0xb3, 0x8a, 0xde, 0x6d,
+	 0xf5, 0x52, 0x51, 0x77, 0x80, 0x9f, 0xf0, 0x20,
+	 0x7d, 0xe3, 0xab, 0x64, 0x8e, 0x4e, 0xea, 0x66,
+	 0x65, 0x76, 0x8b, 0xd7, 0x0f, 0x5f, 0x87, 0x67},
 };
 
 
-static const unsigned int zero[32] = {0};
-static const unsigned int one[32] = {1};
+const ecc_25519_work_t ecc_25519_work_base_ed25519 = {
+	{0x1a, 0xd5, 0x25, 0x8f, 0x60, 0x2d, 0x56, 0xc9,
+	 0xb2, 0xa7, 0x25, 0x95, 0x60, 0xc7, 0x2c, 0x69,
+	 0x5c, 0xdc, 0xd6, 0xfd, 0x31, 0xe2, 0xa4, 0xc0,
+	 0xfe, 0x53, 0x6e, 0xcd, 0xd3, 0x36, 0x69, 0x21},
+	{0x58, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66,
+	 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66,
+	 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66,
+	 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66},
+	{1},
+	{0xa3, 0xdd, 0xb7, 0xa5, 0xb3, 0x8a, 0xde, 0x6d,
+	 0xf5, 0x52, 0x51, 0x77, 0x80, 0x9f, 0xf0, 0x20,
+	 0x7d, 0xe3, 0xab, 0x64, 0x8e, 0x4e, 0xea, 0x66,
+	 0x65, 0x76, 0x8b, 0xd7, 0x0f, 0x5f, 0x87, 0x67},
+};
+
+
+static const uint32_t zero[32] = {0};
+static const uint32_t one[32] = {1};
+
+static const uint32_t minus1[32] = {
+	0xec, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f,
+};
+
+/** Ed25519 parameter -(121665/121666) */
+static const uint32_t d[32] = {
+	0xa3, 0x78, 0x59, 0x13, 0xca, 0x4d, 0xeb, 0x75,
+	0xab, 0xd8, 0x41, 0x41, 0x4d, 0x0a, 0x70, 0x00,
+	0x98, 0xe8, 0x79, 0x77, 0x79, 0x40, 0xc7, 0x8c,
+	0x73, 0xfe, 0x6f, 0x2b, 0xee, 0x6c, 0x03, 0x52,
+};
+
+
+/** Factor to multiply the X coordinate with to convert from the legacy to the Ed25519 curve */
+static const uint32_t legacy_to_ed25519[32] = {
+	0xe7, 0x81, 0xba, 0x00, 0x55, 0xfb, 0x91, 0x33,
+	0x7d, 0xe5, 0x82, 0xb4, 0x2e, 0x2c, 0x5e, 0x3a,
+	0x81, 0xb0, 0x03, 0xfc, 0x23, 0xf7, 0x84, 0x2d,
+	0x44, 0xf9, 0x5f, 0x9f, 0x0b, 0x12, 0xd9, 0x70,
+};
+
+/** Factor to multiply the X coordinate with to convert from the Ed25519 to the legacy curve */
+static const uint32_t ed25519_to_legacy[32] = {
+	0xe9, 0x68, 0x42, 0xdb, 0xaf, 0x04, 0xb4, 0x40,
+	0xa1, 0xd5, 0x43, 0xf2, 0xf9, 0x38, 0x31, 0x28,
+	0x01, 0x17, 0x05, 0x67, 0x9b, 0x81, 0x61, 0xf8,
+	0xa9, 0x5b, 0x3e, 0x6a, 0x20, 0x67, 0x4b, 0x24,
+};
 
 
 /** Adds two unpacked integers (modulo p) */
-static void add(unsigned int out[32], const unsigned int a[32], const unsigned int b[32]) {
+static void add(uint32_t out[32], const uint32_t a[32], const uint32_t b[32]) {
 	unsigned int j;
-	unsigned int u;
-	u = 0;
-	for (j = 0;j < 31;++j) { u += a[j] + b[j]; out[j] = u & 255; u >>= 8; }
-	u += a[31] + b[31]; out[31] = u;
-}
+	uint32_t u;
 
-/** Subtracts two unpacked integers (modulo p) */
-static void sub(unsigned int out[32], const unsigned int a[32], const unsigned int b[32]) {
-	unsigned int j;
-	unsigned int u;
-	u = 218;
-	for (j = 0;j < 31;++j) {
-		u += a[j] + 65280 - b[j];
+	u = 0;
+
+	for (j = 0; j < 31; j++) {
+		u += a[j] + b[j];
 		out[j] = u & 255;
 		u >>= 8;
 	}
+
+	u += a[31] + b[31];
+	out[31] = u;
+}
+
+/**
+ * Subtracts two unpacked integers (modulo p)
+ *
+ * b must be \em squeezed.
+ */
+static void sub(uint32_t out[32], const uint32_t a[32], const uint32_t b[32]) {
+	unsigned int j;
+	uint32_t u;
+
+	u = 218;
+
+	for (j = 0;j < 31;++j) {
+		u += a[j] + UINT32_C(65280) - b[j];
+		out[j] = u & 255;
+		u >>= 8;
+	}
+
 	u += a[31] - b[31];
 	out[31] = u;
 }
 
-/** Performs carry and reduce on an unpacked integer */
-static void squeeze(unsigned int a[32]) {
+/**
+ * Performs carry and reduce on an unpacked integer
+ *
+ * The result is not always fully reduced, but it will be significantly smaller than \f$ 2p \f$.
+ */
+static void squeeze(uint32_t a[32]) {
 	unsigned int j;
-	unsigned int u;
+	uint32_t u;
+
 	u = 0;
-	for (j = 0;j < 31;++j) { u += a[j]; a[j] = u & 255; u >>= 8; }
-	u += a[31]; a[31] = u & 127;
+
+	for (j = 0;j < 31;++j) {
+		u += a[j];
+		a[j] = u & 255;
+		u >>= 8;
+	}
+
+	u += a[31];
+	a[31] = u & 127;
 	u = 19 * (u >> 7);
-	for (j = 0;j < 31;++j) { u += a[j]; a[j] = u & 255; u >>= 8; }
-	u += a[31]; a[31] = u;
+
+	for (j = 0;j < 31;++j) {
+		u += a[j];
+		a[j] = u & 255;
+		u >>= 8;
+	}
+
+	u += a[31];
+	a[31] = u;
 }
 
+
+static const uint32_t minusp[32] = {
+	19, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 128
+};
+
 /**
  * Ensures that the output of a previous \ref squeeze is fully reduced
  *
- * After a \ref freeze, only the lower byte of each integer part holds a meaningful value
+ * After a \ref freeze, only the lower byte of each integer part holds a meaningful value.
  */
-static void freeze(unsigned int a[32]) {
-	static const unsigned int minusp[32] = {
-		19, 0, 0, 0, 0, 0, 0, 0,
-		0, 0, 0, 0, 0, 0, 0, 0,
-		0, 0, 0, 0, 0, 0, 0, 0,
-		0, 0, 0, 0, 0, 0, 0, 128
-	};
-
-	unsigned int aorig[32];
+static void freeze(uint32_t a[32]) {
+	uint32_t aorig[32];
 	unsigned int j;
-	unsigned int negative;
+	uint32_t negative;
 
-	for (j = 0; j < 32; j++) aorig[j] = a[j];
+	for (j = 0; j < 32; j++)
+		aorig[j] = a[j];
 	add(a, a, minusp);
 	negative = -((a[31] >> 7) & 1);
-	for (j = 0; j < 32; j++) a[j] ^= negative & (aorig[j] ^ a[j]);
+
+	for (j = 0; j < 32; j++)
+		a[j] ^= negative & (aorig[j] ^ a[j]);
 }
 
-/** Multiplies two unpacked integers (modulo p) */
-static void mult(unsigned int out[32], const unsigned int a[32], const unsigned int b[32]) {
-	unsigned int i;
-	unsigned int j;
-	unsigned int u;
+/**
+ * Returns the parity (lowest bit of the fully reduced value) of a
+ *
+ * The input must be \em squeezed.
+ */
+static int parity(const uint32_t a[32]) {
+	uint32_t b[32];
+
+	add(b, a, minusp);
+	return (a[0] ^ (b[31] >> 7) ^ 1) & 1;
+}
+
+/**
+ * Multiplies two unpacked integers (modulo p)
+ *
+ * The result will be \em squeezed.
+ */
+static void mult(uint32_t out[32], const uint32_t a[32], const uint32_t b[32]) {
+	unsigned int i, j;
+	uint32_t u;
 
 	for (i = 0; i < 32; ++i) {
 		u = 0;
-		for (j = 0;j <= i;++j) u += a[j] * b[i - j];
-		for (j = i + 1;j < 32;++j) u += 38 * a[j] * b[i + 32 - j];
+
+		for (j = 0; j <= i; j++)
+			u += a[j] * b[i - j];
+
+		for (j = i + 1; j < 32; j++)
+			u += 38 * a[j] * b[i + 32 - j];
+
 		out[i] = u;
 	}
+
 	squeeze(out);
 }
 
-/** Multiplies an unpacked integer with a small integer (modulo p) */
-static void mult_int(unsigned int out[32], unsigned int n, const unsigned int a[32]) {
+/**
+ * Multiplies an unpacked integer with a small integer (modulo p)
+ *
+ * The result will be \em squeezed.
+ */
+static void mult_int(uint32_t out[32], uint32_t n, const uint32_t a[32]) {
 	unsigned int j;
-	unsigned int u;
+	uint32_t u;
 
 	u = 0;
-	for (j = 0;j < 31;++j) { u += n * a[j]; out[j] = u & 255; u >>= 8; }
+
+	for (j = 0; j < 31; j++) {
+		u += n * a[j];
+		out[j] = u & 255;
+		u >>= 8;
+	}
+
 	u += n * a[31]; out[31] = u & 127;
 	u = 19 * (u >> 7);
-	for (j = 0;j < 31;++j) { u += out[j]; out[j] = u & 255; u >>= 8; }
-	u += out[j]; out[j] = u;
+
+	for (j = 0; j < 31; j++) {
+		u += out[j];
+		out[j] = u & 255;
+		u >>= 8;
+	}
+
+	u += out[j];
+	out[j] = u;
 }
 
-/** Squares an unpacked integer */
-static void square(unsigned int out[32], const unsigned int a[32]) {
-	unsigned int i;
-	unsigned int j;
-	unsigned int u;
+/**
+ * Squares an unpacked integer
+ *
+ * The result will be \em squeezed.
+ */
+static void square(uint32_t out[32], const uint32_t a[32]) {
+	unsigned int i, j;
+	uint32_t u;
 
-	for (i = 0; i < 32; ++i) {
+	for (i = 0; i < 32; i++) {
 		u = 0;
-		for (j = 0;j < i - j;++j) u += a[j] * a[i - j];
-		for (j = i + 1;j < i + 32 - j;++j) u += 38 * a[j] * a[i + 32 - j];
+
+		for (j = 0; j < i - j; j++)
+			u += a[j] * a[i - j];
+
+		for (j = i + 1; j < i + 32 - j; j++)
+			u += 38 * a[j] * a[i + 32 - j];
+
 		u *= 2;
+
 		if ((i & 1) == 0) {
 			u += a[i / 2] * a[i / 2];
 			u += 38 * a[i / 2 + 16] * a[i / 2 + 16];
 		}
+
 		out[i] = u;
 	}
+
 	squeeze(out);
 }
 
 /** Checks for the equality of two unpacked integers */
-static int check_equal(const unsigned int x[32], const unsigned int y[32]) {
-	unsigned int differentbits = 0;
+static int check_equal(const uint32_t x[32], const uint32_t y[32]) {
+	uint32_t differentbits = 0;
 	int i;
 
 	for (i = 0; i < 32; i++) {
@@ -186,12 +354,12 @@ static int check_equal(const unsigned int x[32], const unsigned int y[32]) {
 }
 
 /**
- * Checks if an unpacked integer equals zero
+ * Checks if an unpacked integer equals zero (modulo p)
  *
- * The intergers must be must be \ref squeeze "squeezed" before.
+ * The integer must be squeezed before.
  */
-static int check_zero(const unsigned int x[32]) {
-	static const unsigned int p[32] = {
+static int check_zero(const uint32_t x[32]) {
+	static const uint32_t p[32] = {
 		0xed, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
@@ -202,10 +370,10 @@ static int check_zero(const unsigned int x[32]) {
 }
 
 /** Copies r to out when b == 0, s when b == 1 */
-static void selectw(ecc_25519_work_t *out, const ecc_25519_work_t *r, const ecc_25519_work_t *s, unsigned int b) {
+static void selectw(ecc_25519_work_t *out, const ecc_25519_work_t *r, const ecc_25519_work_t *s, uint32_t b) {
 	unsigned int j;
-	unsigned int t;
-	unsigned int bminus1;
+	uint32_t t;
+	uint32_t bminus1;
 
 	bminus1 = b - 1;
 	for (j = 0; j < 32; ++j) {
@@ -224,10 +392,10 @@ static void selectw(ecc_25519_work_t *out, const ecc_25519_work_t *r, const ecc_
 }
 
 /** Copies r to out when b == 0, s when b == 1 */
-static void select(unsigned int out[32], const unsigned int r[32], const unsigned int s[32], unsigned int b) {
+static void select(uint32_t out[32], const uint32_t r[32], const uint32_t s[32], uint32_t b) {
 	unsigned int j;
-	unsigned int t;
-	unsigned int bminus1;
+	uint32_t t;
+	uint32_t bminus1;
 
 	bminus1 = b - 1;
 	for (j = 0;j < 32;++j) {
@@ -241,15 +409,8 @@ static void select(unsigned int out[32], const unsigned int r[32], const unsigne
  *
  * If the given integer has no square root, 0 is returned, 1 otherwise.
  */
-static int square_root(unsigned int out[32], const unsigned int z[32]) {
-	static const unsigned int minus1[32] = {
-		0xec, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f
-	};
-
-	static const unsigned int rho_s[32] = {
+static int square_root(uint32_t out[32], const uint32_t z[32]) {
+	static const uint32_t rho_s[32] = {
 		0xb0, 0xa0, 0x0e, 0x4a, 0x27, 0x1b, 0xee, 0xc4,
 		0x78, 0xe4, 0x2f, 0xad, 0x06, 0x18, 0x43, 0x2f,
 		0xa7, 0xd7, 0xfb, 0x3d, 0x99, 0x00, 0x4d, 0x2b,
@@ -258,18 +419,18 @@ static int square_root(unsigned int out[32], const unsigned int z[32]) {
 
 	/* raise z to power (2^252-2), check if power (2^253-5) equals -1 */
 
-	unsigned int z2[32];
-	unsigned int z9[32];
-	unsigned int z11[32];
-	unsigned int z2_5_0[32];
-	unsigned int z2_10_0[32];
-	unsigned int z2_20_0[32];
-	unsigned int z2_50_0[32];
-	unsigned int z2_100_0[32];
-	unsigned int t0[32];
-	unsigned int t1[32];
-	unsigned int z2_252_1[32];
-	unsigned int z2_252_1_rho_s[32];
+	uint32_t z2[32];
+	uint32_t z9[32];
+	uint32_t z11[32];
+	uint32_t z2_5_0[32];
+	uint32_t z2_10_0[32];
+	uint32_t z2_20_0[32];
+	uint32_t z2_50_0[32];
+	uint32_t z2_100_0[32];
+	uint32_t t0[32];
+	uint32_t t1[32];
+	uint32_t z2_252_1[32];
+	uint32_t z2_252_1_rho_s[32];
 	int i;
 
 	/* 2 */ square(z2, z);
@@ -335,17 +496,17 @@ static int square_root(unsigned int out[32], const unsigned int z[32]) {
 }
 
 /** Computes the reciprocal of an unpacked integer (in the prime field modulo p) */
-static void recip(unsigned int out[32], const unsigned int z[32]) {
-	unsigned int z2[32];
-	unsigned int z9[32];
-	unsigned int z11[32];
-	unsigned int z2_5_0[32];
-	unsigned int z2_10_0[32];
-	unsigned int z2_20_0[32];
-	unsigned int z2_50_0[32];
-	unsigned int z2_100_0[32];
-	unsigned int t0[32];
-	unsigned int t1[32];
+static void recip(uint32_t out[32], const uint32_t z[32]) {
+	uint32_t z2[32];
+	uint32_t z9[32];
+	uint32_t z11[32];
+	uint32_t z2_5_0[32];
+	uint32_t z2_10_0[32];
+	uint32_t z2_20_0[32];
+	uint32_t z2_50_0[32];
+	uint32_t z2_100_0[32];
+	uint32_t t0[32];
+	uint32_t t1[32];
 	int i;
 
 	/* 2 */ square(z2, z);
@@ -401,10 +562,37 @@ static void recip(unsigned int out[32], const unsigned int z[32]) {
 	/* 2^255 - 21 */ mult(out, t1, z11);
 }
 
-/** Loads a point with given coordinates into its unpacked representation */
-int ecc_25519_load_xy(ecc_25519_work_t *out, const ecc_int256_t *x, const ecc_int256_t *y) {
+/**
+ * Checks if the X and Y coordinates of a work structure represent a valid point of the curve
+ *
+ * Also fills in the T coordinate.
+ */
+static int check_load_xy(ecc_25519_work_t *val) {
+	uint32_t X2[32], Y2[32], dX2[32], dX2Y2[32], Y2_X2[32], Y2_X2_1[32], r[32];
+
+	/* Check validity */
+	square(X2, val->X);
+	square(Y2, val->Y);
+
+	mult(dX2, d, X2);
+	mult(dX2Y2, dX2, Y2);
+
+	sub(Y2_X2, Y2, X2);
+	sub(Y2_X2_1, Y2_X2, one);
+
+	sub(r, Y2_X2_1, dX2Y2);
+	squeeze(r);
+
+	if (!check_zero(r))
+	    return 0;
+
+	mult(val->T, val->X, val->Y);
+
+	return 1;
+}
+
+int ecc_25519_load_xy_ed25519(ecc_25519_work_t *out, const ecc_int256_t *x, const ecc_int256_t *y) {
 	int i;
-	unsigned int X2[32], Y2[32], aX2[32], dX2[32], dX2Y2[32], aX2_Y2[32], _1_dX2Y2[32], r[32];
 
 	for (i = 0; i < 32; i++) {
 		out->X[i] = x->p[i];
@@ -412,34 +600,31 @@ int ecc_25519_load_xy(ecc_25519_work_t *out, const ecc_int256_t *x, const ecc_in
 		out->Z[i] = (i == 0);
 	}
 
-	/* Check validity */
-	square(X2, out->X);
-	square(Y2, out->Y);
-	mult_int(aX2, 486664, X2);
-	mult_int(dX2, 486660, X2);
-	mult(dX2Y2, dX2, Y2);
-	add(aX2_Y2, aX2, Y2);
-	add(_1_dX2Y2, one, dX2Y2);
-	sub(r, aX2_Y2, _1_dX2Y2);
-	squeeze(r);
-
-	if (!check_zero(r))
-	    return 0;
-
-	mult(out->T, out->X, out->Y);
-
-	return 1;
+	return check_load_xy(out);
 }
 
-/**
- * Stores a point's x and y coordinates
- *
- * \param x Returns the x coordinate of the point. May be NULL.
- * \param y Returns the y coordinate of the point. May be NULL.
- * \param in The unpacked point to store.
- */
-void ecc_25519_store_xy(ecc_int256_t *x, ecc_int256_t *y, const ecc_25519_work_t *in) {
-	unsigned int X[32], Y[32], Z[32];
+int ecc_25519_load_xy_legacy(ecc_25519_work_t *out, const ecc_int256_t *x, const ecc_int256_t *y) {
+	int i;
+	uint32_t tmp[32];
+
+	for (i = 0; i < 32; i++) {
+		tmp[i] = x->p[i];
+		out->Y[i] = y->p[i];
+		out->Z[i] = (i == 0);
+	}
+
+	mult(out->X, tmp, legacy_to_ed25519);
+
+	return check_load_xy(out);
+}
+
+int ecc_25519_load_xy(ecc_25519_work_t *out, const ecc_int256_t *x, const ecc_int256_t *y) {
+	return ecc_25519_load_xy_legacy(out, x, y);
+}
+
+
+void ecc_25519_store_xy_ed25519(ecc_int256_t *x, ecc_int256_t *y, const ecc_25519_work_t *in) {
+	uint32_t X[32], Y[32], Z[32];
 	int i;
 
 	recip(Z, in->Z);
@@ -459,22 +644,80 @@ void ecc_25519_store_xy(ecc_int256_t *x, ecc_int256_t *y, const ecc_25519_work_t
 	}
 }
 
-/** Loads a packed point into its unpacked representation */
-int ecc_25519_load_packed(ecc_25519_work_t *out, const ecc_int256_t *in) {
+void ecc_25519_store_xy_legacy(ecc_int256_t *x, ecc_int256_t *y, const ecc_25519_work_t *in) {
+	uint32_t X[32], tmp[32], Y[32], Z[32];
 	int i;
-	unsigned int X2[32] /* X^2 */, aX2[32] /* aX^2 */, dX2[32] /* dX^2 */, _1_aX2[32] /* 1-aX^2 */, _1_dX2[32] /* 1-aX^2 */;
-	unsigned int _1_1_dX2[32]  /* 1/(1-aX^2) */, Y2[32] /* Y^2 */, Y[32], Yt[32];
+
+	recip(Z, in->Z);
+
+	if (x) {
+		mult(tmp, Z, in->X);
+		mult(X, tmp, ed25519_to_legacy);
+		freeze(X);
+		for (i = 0; i < 32; i++)
+			x->p[i] = X[i];
+	}
+
+	if (y) {
+		mult(Y, Z, in->Y);
+		freeze(Y);
+		for (i = 0; i < 32; i++)
+			y->p[i] = Y[i];
+	}
+}
+
+void ecc_25519_store_xy(ecc_int256_t *x, ecc_int256_t *y, const ecc_25519_work_t *in) {
+	ecc_25519_store_xy_legacy(x, y, in);
+}
+
+
+int ecc_25519_load_packed_ed25519(ecc_25519_work_t *out, const ecc_int256_t *in) {
+	int i;
+	uint32_t Y2[32] /* Y^2 */, dY2[32] /* dY^2 */, Y2_1[32] /* Y^2-1 */, dY2_1[32] /* dY^2+1 */, _1_dY2_1[32] /* 1/(dY^2+1) */;
+	uint32_t X2[32] /* X^2 */, X[32], Xt[32];
 
 	for (i = 0; i < 32; i++) {
-		out->X[i] = in->p[i];
+		out->Y[i] = in->p[i];
 		out->Z[i] = (i == 0);
 	}
 
-	out->X[31] &= 0x7f;
+	out->Y[31] &= 0x7f;
 
-	square(X2, out->X);
-	mult_int(aX2, 486664, X2);
-	mult_int(dX2, 486660, X2);
+	square(Y2, out->Y);
+	mult(dY2, d, Y2);
+	sub(Y2_1, Y2, one);
+	add(dY2_1, dY2, one);
+	recip(_1_dY2_1, dY2_1);
+	mult(X2, Y2_1, _1_dY2_1);
+
+	if (!square_root(X, X2))
+		return 0;
+
+	/* No squeeze is necessary after subtractions from zero if the subtrahend is squeezed */
+	sub(Xt, zero, X);
+
+	select(out->X, X, Xt, (in->p[31] >> 7) ^ parity(X));
+
+	mult(out->T, out->X, out->Y);
+
+	return 1;
+}
+
+int ecc_25519_load_packed_legacy(ecc_25519_work_t *out, const ecc_int256_t *in) {
+	int i;
+	uint32_t X2[32] /* X^2 */, aX2[32] /* aX^2 */, dX2[32] /* dX^2 */, _1_aX2[32] /* 1-aX^2 */, _1_dX2[32] /* 1-dX^2 */;
+	uint32_t _1_1_dX2[32]  /* 1/(1-dX^2) */, Y2[32] /* Y^2 */, Y[32], Yt[32], X_legacy[32];
+
+	for (i = 0; i < 32; i++) {
+		X_legacy[i] = in->p[i];
+		out->Z[i] = (i == 0);
+	}
+
+	X_legacy[31] &= 0x7f;
+
+	square(X2, X_legacy);
+	mult_int(aX2, UINT32_C(486664), X2);
+	mult_int(dX2, UINT32_C(486660), X2);
 	sub(_1_aX2, one, aX2);
 	sub(_1_dX2, one, dX2);
 	recip(_1_1_dX2, _1_dX2);
@@ -483,26 +726,43 @@ int ecc_25519_load_packed(ecc_25519_work_t *out, const ecc_int256_t *in) {
 	if (!square_root(Y, Y2))
 		return 0;
 
+	/* No squeeze is necessary after subtractions from zero if the subtrahend is squeezed */
 	sub(Yt, zero, Y);
 
-	select(out->Y, Y, Yt, (in->p[31] >> 7) ^ (Y[0] & 1));
+	select(out->Y, Y, Yt, (in->p[31] >> 7) ^ parity(Y));
 
+	mult(out->X, X_legacy, legacy_to_ed25519);
 	mult(out->T, out->X, out->Y);
 
 	return 1;
 }
 
-/** Stores a point into its packed representation */
-void ecc_25519_store_packed(ecc_int256_t *out, const ecc_25519_work_t *in) {
+int ecc_25519_load_packed(ecc_25519_work_t *out, const ecc_int256_t *in) {
+	return ecc_25519_load_packed_legacy(out, in);
+}
+
+
+void ecc_25519_store_packed_ed25519(ecc_int256_t *out, const ecc_25519_work_t *in) {
+	ecc_int256_t x;
+
+	ecc_25519_store_xy_ed25519(&x, out, in);
+	out->p[31] |= (x.p[0] << 7);
+}
+
+void ecc_25519_store_packed_legacy(ecc_int256_t *out, const ecc_25519_work_t *in) {
 	ecc_int256_t y;
 
-	ecc_25519_store_xy(out, &y, in);
+	ecc_25519_store_xy_legacy(out, &y, in);
 	out->p[31] |= (y.p[0] << 7);
 }
 
-/** Checks if a point is the identity element of the Elliptic Curve group */
+void ecc_25519_store_packed(ecc_int256_t *out, const ecc_25519_work_t *in) {
+	ecc_25519_store_packed_legacy(out, in);
+}
+
+
 int ecc_25519_is_identity(const ecc_25519_work_t *in) {
-	unsigned int Y_Z[32];
+	uint32_t Y_Z[32];
 
 	sub(Y_Z, in->Y, in->Z);
 	squeeze(Y_Z);
@@ -510,71 +770,117 @@ int ecc_25519_is_identity(const ecc_25519_work_t *in) {
 	return (check_zero(in->X)&check_zero(Y_Z));
 }
 
-/**
- * Doubles a point of the Elliptic Curve
- *
- * ecc_25519_double(out, in) is equivalent to ecc_25519_add(out, in, in), but faster.
- *
- * The same pointers may be used for input and output.
- */
+void ecc_25519_negate(ecc_25519_work_t *out, const ecc_25519_work_t *in) {
+	int i;
+
+	for (i = 0; i < 32; i++) {
+		out->Y[i] = in->Y[i];
+		out->Z[i] = in->Z[i];
+	}
+
+	/* No squeeze is necessary after subtractions from zero if the subtrahend is squeezed */
+	sub(out->X, zero, in->X);
+	sub(out->T, zero, in->T);
+}
+
 void ecc_25519_double(ecc_25519_work_t *out, const ecc_25519_work_t *in) {
-	unsigned int A[32], B[32], C[32], D[32], E[32], F[32], G[32], H[32], t0[32], t1[32], t2[32], t3[32];
+	uint32_t A[32], B[32], C[32], D[32], E[32], F[32], G[32], H[32], t0[32], t1[32];
 
 	square(A, in->X);
+
 	square(B, in->Y);
+
 	square(t0, in->Z);
 	mult_int(C, 2, t0);
-	mult_int(D, 486664, A);
-	add(t1, in->X, in->Y);
-	square(t2, t1);
-	sub(t3, t2, A); squeeze(t3);
-	sub(E, t3, B);
-	add(G, D, B); squeeze(G);
+
+	sub(D, zero, A);
+
+	add(t0, in->X, in->Y);
+	square(t1, t0);
+	sub(t0, t1, A);
+	sub(E, t0, B);
+
+	add(G, D, B);
 	sub(F, G, C);
 	sub(H, D, B);
+
 	mult(out->X, E, F);
 	mult(out->Y, G, H);
 	mult(out->T, E, H);
 	mult(out->Z, F, G);
 }
 
-/**
- * Adds two points of the Elliptic Curve
- *
- * The same pointers may be used for input and output.
- */
 void ecc_25519_add(ecc_25519_work_t *out, const ecc_25519_work_t *in1, const ecc_25519_work_t *in2) {
-	unsigned int A[32], B[32], C[32], D[32], E[32], F[32], G[32], H[32], t0[32], t1[32], t2[32], t3[32], t4[32], t5[32];
+	const uint32_t j = UINT32_C(60833);
+	const uint32_t k = UINT32_C(121665);
+	uint32_t A[32], B[32], C[32], D[32], E[32], F[32], G[32], H[32], t0[32], t1[32];
 
-	mult(A, in1->X, in2->X);
-	mult(B, in1->Y, in2->Y);
-	mult_int(t0, 486660, in2->T);
+	sub(t0, in1->Y, in1->X);
+	mult_int(t1, j, t0);
+	sub(t0, in2->Y, in2->X);
+	mult(A, t0, t1);
+
+	add(t0, in1->Y, in1->X);
+	mult_int(t1, j, t0);
+	add(t0, in2->Y, in2->X);
+	mult(B, t0, t1);
+
+	mult_int(t0, k, in2->T);
 	mult(C, in1->T, t0);
-	mult(D, in1->Z, in2->Z);
-	add(t1, in1->X, in1->Y);
-	add(t2, in2->X, in2->Y);
-	mult(t3, t1, t2);
-	sub(t4, t3, A); squeeze(t4);
-	sub(E, t4, B);
-	sub(F, D, C);
-	add(G, D, C);
-	mult_int(t5, 486664, A);
-	sub(H, B, t5);
+
+	mult_int(t0, 2*j, in2->Z);
+	mult(D, in1->Z, t0);
+
+	sub(E, B, A);
+	add(F, D, C);
+	sub(G, D, C);
+	add(H, B, A);
+
 	mult(out->X, E, F);
 	mult(out->Y, G, H);
 	mult(out->T, E, H);
 	mult(out->Z, F, G);
 }
 
-/**
- * Does a scalar multiplication of a point of the Elliptic Curve with an integer of a given bit length
- *
- * To speed up scalar multiplication when it is known that not the whole 256 bits of the scalar
- * are used. The bit length should always be a constant and not computed at runtime to ensure
- * that no timing attacks are possible.
- *
- * The same pointers may be used for input and output.
- **/
+/** Adds two points of the Elliptic Curve, assuming that in2->Z == 1 */
+static void ecc_25519_add1(ecc_25519_work_t *out, const ecc_25519_work_t *in1, const ecc_25519_work_t *in2) {
+	const uint32_t j = UINT32_C(60833);
+	const uint32_t k = UINT32_C(121665);
+	uint32_t A[32], B[32], C[32], D[32], E[32], F[32], G[32], H[32], t0[32], t1[32];
+
+	sub(t0, in1->Y, in1->X);
+	mult_int(t1, j, t0);
+	sub(t0, in2->Y, in2->X);
+	mult(A, t0, t1);
+
+	add(t0, in1->Y, in1->X);
+	mult_int(t1, j, t0);
+	add(t0, in2->Y, in2->X);
+	mult(B, t0, t1);
+
+	mult_int(t0, k, in2->T);
+	mult(C, in1->T, t0);
+
+	mult_int(D, 2*j, in1->Z);
+
+	sub(E, B, A);
+	add(F, D, C);
+	sub(G, D, C);
+	add(H, B, A);
+
+	mult(out->X, E, F);
+	mult(out->Y, G, H);
+	mult(out->T, E, H);
+	mult(out->Z, F, G);
+}
+
+void ecc_25519_sub(ecc_25519_work_t *out, const ecc_25519_work_t *in1, const ecc_25519_work_t *in2) {
+	ecc_25519_work_t in2_neg;
+
+	ecc_25519_negate(&in2_neg, in2);
+	ecc_25519_add(out, in1, &in2_neg);
+}
+
 void ecc_25519_scalarmult_bits(ecc_25519_work_t *out, const ecc_int256_t *n, const ecc_25519_work_t *base, unsigned bits) {
 	ecc_25519_work_t Q2, Q2p;
 	ecc_25519_work_t cur = ecc_25519_work_identity;
@@ -595,31 +901,30 @@ void ecc_25519_scalarmult_bits(ecc_25519_work_t *out, const ecc_int256_t *n, con
 	*out = cur;
 }
 
-/**
- * Does a scalar multiplication of a point of the Elliptic Curve with an integer
- *
- * The same pointers may be used for input and output.
- **/
 void ecc_25519_scalarmult(ecc_25519_work_t *out, const ecc_int256_t *n, const ecc_25519_work_t *base) {
 	ecc_25519_scalarmult_bits(out, n, base, 256);
 }
 
-/**
- * Does a scalar multiplication of the default base point (generator element) of the Elliptic Curve with an integer of a given bit length
- *
- * The order of the base point is \f$ 2^{252} + 27742317777372353535851937790883648493 \f$.
- *
- * See the notes about \ref ecc_25519_scalarmult_bits before using this function.
- */
 void ecc_25519_scalarmult_base_bits(ecc_25519_work_t *out, const ecc_int256_t *n, unsigned bits) {
-	ecc_25519_scalarmult_bits(out, n, &ecc_25519_work_default_base, bits);
+	ecc_25519_work_t Q2, Q2p;
+	ecc_25519_work_t cur = ecc_25519_work_identity;
+	int b, pos;
+
+	if (bits > 256)
+		bits = 256;
+
+	for (pos = bits - 1; pos >= 0; --pos) {
+		b = n->p[pos / 8] >> (pos & 7);
+		b &= 1;
+
+		ecc_25519_double(&Q2, &cur);
+		ecc_25519_add1(&Q2p, &Q2, &ecc_25519_work_default_base);
+		selectw(&cur, &Q2, &Q2p, b);
+	}
+
+	*out = cur;
 }
 
-/**
- * Does a scalar multiplication of the default base point (generator element) of the Elliptic Curve with an integer
- *
- * The order of the base point is \f$ 2^{252} + 27742317777372353535851937790883648493 \f$.
- */
 void ecc_25519_scalarmult_base(ecc_25519_work_t *out, const ecc_int256_t *n) {
-	ecc_25519_scalarmult(out, n, &ecc_25519_work_default_base);
+	ecc_25519_scalarmult_base_bits(out, n, 256);
 }
diff --git a/src/ec25519_gf.c b/src/ec25519_gf.c
index 4914fa7..11d2989 100644
--- a/src/ec25519_gf.c
+++ b/src/ec25519_gf.c
@@ -25,26 +25,23 @@
 */
 
 /** \file
-  Simple finite field operations on the prime field \f$ F_q \f$ for
-  \f$ q = 2^{252} + 27742317777372353535851937790883648493 \f$, which
-  is the order of the base point used for ec25519
-*/
+ * Simple finite field operations on the prime field \f$ F_q \f$ for
+ * \f$ q = 2^{252} + 27742317777372353535851937790883648493 \f$, which
+ * is the order of the base point used for ec25519
+ *
+ * Doxygen comments for public APIs can be found in the public header file.
+ */
 
 #include <libuecc/ecc.h>
 
 
-/** Checks if the highest bit of an unsigned integer is set */
+/** Checks if the highest bit of an unsigned integer is set */
 #define IS_NEGATIVE(n) ((int)((((unsigned)n) >> (8*sizeof(n)-1))&1))
 
 /** Performs an arithmetic right shift */
 #define ASR(n,s) (((n) >> s)|(IS_NEGATIVE(n)*((unsigned)-1) << (8*sizeof(n)-s)))
 
 
-/**
- * The order of the prime field
- *
- * The order is \f$ 2^{252} + 27742317777372353535851937790883648493 \f$.
- */
 const ecc_int256_t ecc_25519_gf_order = {{
 	0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58,
 	0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14,
@@ -53,15 +50,15 @@ const ecc_int256_t ecc_25519_gf_order = {{
 }};
 
 /** An internal alias for \ref ecc_25519_gf_order */
-static const unsigned char *q = ecc_25519_gf_order.p;
+static const uint8_t *q = ecc_25519_gf_order.p;
 
 /**
  * Copies the content of r into out if b == 0, the contents of s if b == 1
  */
-static void select(unsigned char out[32], const unsigned char r[32], const unsigned char s[32], unsigned int b) {
+static void select(uint8_t out[32], const uint8_t r[32], const uint8_t s[32], uint32_t b) {
 	unsigned int j;
-	unsigned int t;
-	unsigned int bminus1;
+	uint8_t t;
+	uint8_t bminus1;
 
 	bminus1 = b - 1;
 	for (j = 0;j < 32;++j) {
@@ -70,11 +67,10 @@ static void select(unsigned char out[32], const unsigned char r[32], const unsig
 	}
 }
 
-/** Checks if an integer is equal to zero (after reduction) */
 int ecc_25519_gf_is_zero(const ecc_int256_t *in) {
 	int i;
 	ecc_int256_t r;
-	unsigned int bits = 0;
+	uint32_t bits = 0;
 
 	ecc_25519_gf_reduce(&r, in);
 
@@ -84,14 +80,9 @@ int ecc_25519_gf_is_zero(const ecc_int256_t *in) {
 	return (((bits-1)>>8) & 1);
 }
 
-/**
- * Adds two integers as Galois field elements
- *
- * The same pointers may be used for input and output.
- */
 void ecc_25519_gf_add(ecc_int256_t *out, const ecc_int256_t *in1, const ecc_int256_t *in2) {
 	unsigned int j;
-	unsigned int u;
+	uint32_t u;
 	int nq = 1 - (in1->p[31]>>4) - (in2->p[31]>>4);
 
 	u = 0;
@@ -103,14 +94,9 @@ void ecc_25519_gf_add(ecc_int256_t *out, const ecc_int256_t *in1, const ecc_int2
 	}
 }
 
-/**
- * Subtracts two integers as Galois field elements
- *
- * The same pointers may be used for input and output.
- */
 void ecc_25519_gf_sub(ecc_int256_t *out, const ecc_int256_t *in1, const ecc_int256_t *in2) {
 	unsigned int j;
-	unsigned int u;
+	uint32_t u;
 	int nq = 8 - (in1->p[31]>>4) + (in2->p[31]>>4);
 
 	u = 0;
@@ -123,11 +109,11 @@ void ecc_25519_gf_sub(ecc_int256_t *out, const ecc_int256_t *in1, const ecc_int2
 }
 
 /** Reduces an integer to a unique representation in the range \f$ [0,q-1] \f$ */
-static void reduce(unsigned char a[32]) {
+static void reduce(uint8_t a[32]) {
 	unsigned int j;
-	unsigned int nq = a[31] >> 4;
-	unsigned int u1, u2;
-	unsigned char out1[32], out2[32];
+	uint32_t nq = a[31] >> 4;
+	uint32_t u1, u2;
+	uint8_t out1[32], out2[32];
 
 	u1 = u2 = 0;
 	for (j = 0; j < 31; ++j) {
@@ -145,11 +131,6 @@ static void reduce(unsigned char a[32]) {
 	select(a, out1, out2, IS_NEGATIVE(u1));
 }
 
-/**
- * Reduces an integer to a unique representation in the range \f$ [0,q-1] \f$
- *
- * The same pointers may be used for input and output.
- */
 void ecc_25519_gf_reduce(ecc_int256_t *out, const ecc_int256_t *in) {
 	int i;
 
@@ -160,10 +141,10 @@ void ecc_25519_gf_reduce(ecc_int256_t *out, const ecc_int256_t *in) {
 }
 
 /** Montgomery modular multiplication algorithm */
-static void montgomery(unsigned char out[32], const unsigned char a[32], const unsigned char b[32]) {
+static void montgomery(uint8_t out[32], const uint8_t a[32], const uint8_t b[32]) {
 	unsigned int i, j;
-	unsigned int nq;
-	unsigned int u;
+	uint32_t nq;
+	uint32_t u;
 
 	for (i = 0; i < 32; i++)
 		out[i] = 0;
@@ -183,22 +164,17 @@ static void montgomery(unsigned char out[32], const unsigned char a[32], const u
 	}
 }
 
-/**
- * Multiplies two integers as Galois field elements
- *
- * The same pointers may be used for input and output.
- */
 void ecc_25519_gf_mult(ecc_int256_t *out, const ecc_int256_t *in1, const ecc_int256_t *in2) {
 	/* 2^512 mod q */
-	static const unsigned char C[32] = {
+	static const uint8_t C[32] = {
 		0x01, 0x0f, 0x9c, 0x44, 0xe3, 0x11, 0x06, 0xa4,
 		0x47, 0x93, 0x85, 0x68, 0xa7, 0x1b, 0x0e, 0xd0,
 		0x65, 0xbe, 0xf5, 0x17, 0xd2, 0x73, 0xec, 0xce,
 		0x3d, 0x9a, 0x30, 0x7c, 0x1b, 0x41, 0x99, 0x03
 	};
 
-	unsigned char B[32];
-	unsigned char R[32];
+	uint8_t B[32];
+	uint8_t R[32];
 	unsigned int i;
 
 	for (i = 0; i < 32; i++)
@@ -210,18 +186,13 @@ void ecc_25519_gf_mult(ecc_int256_t *out, const ecc_int256_t *in1, const ecc_int
 	montgomery(out->p, R, C);
 }
 
-/**
- * Computes the reciprocal of a Galois field element
- *
- * The same pointers may be used for input and output.
- */
 void ecc_25519_gf_recip(ecc_int256_t *out, const ecc_int256_t *in) {
-	static const unsigned char C[32] = {
+	static const uint8_t C[32] = {
 		0x01
 	};
 
-	unsigned char A[32], B[32];
-	unsigned char R1[32], R2[32];
+	uint8_t A[32], B[32];
+	uint8_t R1[32], R2[32];
 	int use_r2 = 0;
 	unsigned int i, j;
 
@@ -233,7 +204,7 @@ void ecc_25519_gf_recip(ecc_int256_t *out, const ecc_int256_t *in) {
 	reduce(A);
 
 	for (i = 0; i < 32; i++) {
-		unsigned char c;
+		uint8_t c;
 
 		if (i == 0)
 			c = 0xeb; /* q[0] - 2 */
@@ -268,15 +239,6 @@ void ecc_25519_gf_recip(ecc_int256_t *out, const ecc_int256_t *in) {
 	montgomery(out->p, R2, C);
 }
 
-/**
- * Ensures some properties of a Galois field element to make it fit for use as a secret key
- *
- * This sets the 255th bit and clears the 256th and the bottom three bits (so the key
- * will be a multiple of 8). See Daniel J. Bernsteins paper "Curve25519: new Diffie-Hellman speed records."
- * for the rationale of this.
- *
- * The same pointers may be used for input and output.
- */
 void ecc_25519_gf_sanitize_secret(ecc_int256_t *out, const ecc_int256_t *in) {
 	int i;