Pagina's

2013/11/17

Uniform random numbers in a range, part 2

/*

Source code of System.Random is available at: CodePlex/Random.cs

System.Random is fast, basically a new random number is generated by a subtraction
of two numbers. A straightforward copy of the source code is two times slower 
compared to using rand.Next directly.
A way to speed it up: the codeplex code uses a multiplication by a double type. 
The multiplication can be replaced by a shift  and an "if" statement when 30 
random bits are needed, very occasionally an addition  "+ 1" (and an "& 1") are
necessary, that's what "Next30()" does in, voila:

    Fast Random Range (CodePlex)

Other improvements over the previous version:
-a 64 bits buffer, instead of a 32 bits one.
 Operations like "uint + uint" are as fast as "ulong + uint", 
 same for: " -  , ^ , | , << , >> " 
-Random numbers were computed bit by bit, now the bits are taken at once,
 some bit twiddling recycles unused random bits. 

Finally: A probably very true scaled (pseudo) random number generator, 
based on the .Net4 RNGCryptoServiceProvider class:
".. rng = new  RNGCryptoServiceProvider".
The "rng.Getbytes(..)" method fills a byte array, 
one byte as fast as 40 bytes, 40 bytes seems to be an optimum.
"Buffer.BlockCopy" copies the byte array to a uint array.
The uint array is used to fill a 64 random bits buffer, etc.
"uint x = g_uint_c(uint u)" gives a random value for x,
such that: 0 <= x <= u <= uint.MaxValue


Results:
    
    10^7 times rand.Next() takes: 107 ms (~11 ns per random number) 
    10^7 times rand.Next(int someValue) takes: 236 ms (~24 ns ...)
                                                 
    |--------|-------------------------|         rand.Next()    11 ns  
    |        |       ms 10^7 *         |         rand.Next(i)   24 ns
    |        |-----------|-------------|         gen_uint(u)    50 ns, mean  ~35 ns
    |      u |gen_uint(u)| g_uint_c(u) |         g_uint_c(u)   280 ns, mean ~250 ns   
    |--------|-----|-----|------|------|
    |      0 |  24 |     |   24 |      |          
    |      1 |     |  73 |      |  139 |          
    |      2 | 124 |     |  296 |      |           
    |      3 |     |  78 |      |  211 |       
    |      4 | 242 |     |  518 |      |     
    |      7 |     |  83 |      |  282 |               
    |      8 | 290 |     |  680 |      |      
    |  2^7-1 |     | 102 |      |  562 |              
    |  2^7-0 | 380 |     | 1079 |      |          
    | 2^15-1 |     | 130 |      | 1123 |          
    | 2^15-0 | 414 |     | 1634 |      |               
    | 2^30-1 |     | 193 |      | 2164 |
    | 2^30-0 | 494 |     | 2672 |      |
    | 2^31-1 |     | 210 |      | 2251 |
    | 2^31-0 | 461 |     | 2747 |      |
    | 2^32-1 |     | 211 |      | 2307 |
    |--------|-----|-----|------|------|


http://www.cacert.at/cgi-bin/rngresults

find: "P_P" , hover mouse over "Net4 System.Random" for the code.
 Random ~ gen_uint(u) 
    CSP ~ g_uint_c(u)

|--------|---------|--------|----------|----------|--------|---------|----------|----------|----------|----------|
| Gene-  |  Speed  | Upload | Entropy  | Birthday | Matrix | 6*8 Ma- | Minimum  | Random   | Squeeze  | Overlap- |
| rator  | (MiB/s) |  Size  | (->8)    | Spacing  | Ranks  | trix R  | Distance | Spheres  |          | ping Sums|
|--------|---------|--------|----------|----------|--------|---------|----------|----------|----------|----------|
| Random |   370   | 17 MiB | 7.999989 | 0.000000 |  0.392 |  0.136  | 0.063920 | 0.905097 | 0.098440 | 0.005467 |
|    CSP |    34   | 15 MiB | 7.999987 | 0.032102 |  0.084 |  0.942  | 0.894277 | 0.036221 | 0.706538 | 0.023523 |
|--------|---------|--------|----------|----------|--------|---------|----------|----------|----------|----------|
   
*/

using System;
using System.Diagnostics;
using System.Security.Cryptography;

class scaled_random_uint
{
    private static ulong buf = 0;              // 64 random bits buffer
    private static int av = 0;                 // available bits in buf
    private static uint g_u = 0;               // copy of u
    private static int g_nd = 0;               // nd (needed) bits = highest set bit of u
    private static int g_s = 0;                // shift
    private static bool ones = false;          // true if u = 10, 110, 1110, ...

    private static uint g_uint_c(uint u)
    {
        uint x; int s;
        if (u == 0) return 0;
        if (u != g_u)
        {
            g_u = u;
            g_nd = nob(u);
            g_s = 64 - g_nd;
            ones = (u + 1 & u + 2) == 0;
        }
    L0: if (av < g_nd) 
        {
            av += 32;
            buf <<= 32;
            buf |= next32();
        }
        x = (uint)buf << g_s >> g_s;
        buf >>= g_nd;
        av -= g_nd;
        if (x <= u) return x;
        if (ones) goto L0;                     // faster when u = 2, 6, 14, ...
        s = nob(x ^ u) - 1;
        buf <<= s;
        buf ^= x;
        av += s;
        goto L0;
    }

    private static RNGCryptoServiceProvider
        rng = new RNGCryptoServiceProvider();
    private static byte[] r08 = new byte[40];       //  ... [400] ... ~15% faster
    private static uint[] r32 = new uint[10];       //  ... [100] ...  ,,    ,,
    private static int i_r32 = 10;                  //  ... = 100 ...  ,,    ,, 
                                                               
    private static uint next32()                                  
    {                                                             
        if (i_r32 != 10) return r32[i_r32++];       //  ... != 100...  ,,    ,,
        i_r32 = 1;                                                
        rng.GetBytes(r08);                                       
        Buffer.BlockCopy(r08, 0, r32, 0, 40);       //  ... , 400)...  ,,    ,,
        return r32[0];
    }

    private static int nob(uint u) // number of bits, position of the highest set bit, u > 0  
    {
        return
        u < 1u << 16 ? u < 1u << 08 ? u < 1u << 04 ? u < 1u << 02 ? u < 1u << 01 ? 01 : 02 :
                                                                    u < 1u << 03 ? 03 : 04 :
                                                     u < 1u << 06 ? u < 1u << 05 ? 05 : 06 :
                                                                    u < 1u << 07 ? 07 : 08 :
                                      u < 1u << 12 ? u < 1u << 10 ? u < 1u << 09 ? 09 : 10 :
                                                                    u < 1u << 11 ? 11 : 12 :
                                                     u < 1u << 14 ? u < 1u << 13 ? 13 : 14 :
                                                                    u < 1u << 15 ? 15 : 16 :
                       u < 1u << 24 ? u < 1u << 20 ? u < 1u << 18 ? u < 1u << 17 ? 17 : 18 :
                                                                    u < 1u << 19 ? 19 : 20 :
                                                     u < 1u << 22 ? u < 1u << 21 ? 21 : 22 :
                                                                    u < 1u << 23 ? 23 : 24 :
                                      u < 1u << 28 ? u < 1u << 26 ? u < 1u << 25 ? 25 : 26 :
                                                                    u < 1u << 27 ? 27 : 28 :
                                                     u < 1u << 30 ? u < 1u << 29 ? 29 : 30 :
                                                                    u < 1u << 31 ? 31 : 32;
    }

    private static Stopwatch sw = new Stopwatch();
    static void Main()
    {
        speed((1u << 31) - 0); 
        Console.ReadLine();
    }

    private static void speed(uint u)
    {
        int i, j = 10000000; double t;
        sw.Restart();
        for (i = 0; i < j; i++) g_uint_c(u);
        sw.Stop();
        t = sw.ElapsedMilliseconds;
        sw.Restart();
        for (i = 0; i < j; i++) ; // nada
        sw.Stop();
        t -= sw.ElapsedMilliseconds;
        Console.WriteLine("u: " + u);
        Console.WriteLine("gen_uint(u)");
        Console.WriteLine("     ms: " + t);
        Console.WriteLine("   bits: " + (double)(nob(u)) * j);
        Console.WriteLine(" bits/s: {0:.000}", (double)((nob(u)) * j) / t / 1000000);
        Console.WriteLine();
    }
}

2013/11/04

Fast Cube Root ( < 2^32 ), part 2

/*
The mean time to take a cube root from a 32 bits uint was: < 13 ns
                                            new mean time: < 12 ns

It is a bit faster, especially for large values, because variables are initialized properly.
       
       |--------------------------------|
       |         cro32(x)               |       
       |-----|-----||------------|------|       
       |   x |  ns ||          x |   ns |       
       |-----|-----||------------|------|
       |   0 | 3,0 ||        511 |  4,6 |       
       |   1 | 2,3 ||       1023 |  5,3 |       
       |   3 | 2,3 ||       2047 |  5,6 |               ___
       |   7 | 2,3 ||       4095 |  5,0 |           \3 /                
       |  15 | 3,3 ||      32768 |  7,6 |            \/  x = y          
       |  31 | 2,6 ||    1048576 |  9,0 |          
       |  63 | 2,6 ||  268435456 | 11,0 |           Input: 0 <= x < 2^32
       |  64 | 5,0 ||  536870912 | 11,3 |          Output: y, such that y^3 <= x < (y+1)^3
       |  65 | 5,0 || 1070599167 | 13,0 |       
       | 100 | 5,0 || 1070599168 | 13,0 |
       | 127 | 5,0 || 1073741824 | 11,3 |
       | 128 | 5,3 || 1073774592 | 11,3 |
       | 255 | 4,6 || 2147483648 | 11,0 |
       | 256 | 4,6 || 4294967295 | 11,7 |
       |-----|-----||------------|------|
       
       4294967295 roots in 50686 ms, mean time 11,80 ns (before 12,55 ns)
       
Another way to find roots is using a look-up table. Below "fill_cubes" fills a "cubes" array,
"cro32t" does a binary search for a root. Mean time: 30 ns.
I have not tried to optimize it further, I see no reason why it should be much faster.
*/

using System;
using System.Diagnostics;
class cube_root
{
    private static uint cro32(uint x)
    {
        uint y = 4u, z = 16u, b = 61u << 21;
        if (x < 1u << 24)
            if (x < 1u << 12)
                if (x < 1u << 06)
                    if (x < 1u << 03)
                        return x == 0u ? 0u : 1u;
                    else
                        return x < 27u ? 2u : 3u;
                else
                    if (x < 1u << 09) goto L8; else goto L7;
            else
                if (x < 1u << 18)
                    if (x < 1u << 15) goto L6; else goto L5;
                else
                    if (x < 1u << 21) goto L4; else goto L3;
        else
            if (x < 1u << 30)
                if (x < 1u << 27) goto L2;
                else
                {
                    if (x >= 27u << 24) { x -= 27u << 24; z = 36u; y = 6u; b = 127u << 21; }
                    else { x -= 1u << 27; }
                }
            else
            {
                if (x >= 27u << 27) { x -= 27u << 27; z = 144u; y = 12u; b = 469u << 21; }
                else
                {
                    if (x >= 125u << 24) { x -= 125u << 24; z = 100u; y = 10u; b = 331u << 21; }
                    else { x -= 1u << 30; z = 64u; y = 8u; b = 217u << 21; }
                }
            }
        goto M1;

    L2: if (x >= 27u << 21) { x -= 27u << 21; z = 36u; y = 6u; } else { x -= 1u << 24; } goto M2;
    L3: if (x >= 27u << 18) { x -= 27u << 18; z = 36u; y = 6u; } else { x -= 1u << 21; } goto M3;
    L4: if (x >= 27u << 15) { x -= 27u << 15; z = 36u; y = 6u; } else { x -= 1u << 18; } goto M4;
    L5: if (x >= 27u << 12) { x -= 27u << 12; z = 36u; y = 6u; } else { x -= 1u << 15; } goto M5;
    L6: if (x >= 27u << 09) { x -= 27u << 09; z = 36u; y = 6u; } else { x -= 1u << 12; } goto M6;
    L7: if (x >= 27u << 06) { x -= 27u << 06; z = 36u; y = 6u; } else { x -= 1u << 09; } goto M7;
    L8: if (x >= 27u << 03) { x -= 27u << 03; z = 36u; y = 6u; } else { x -= 1u << 06; } goto M8;

    M1: if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M2: b = (y + z) * 3 + 1u << 18; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M3: b = (y + z) * 3 + 1u << 15; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M4: b = (y + z) * 3 + 1u << 12; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M5: b = (y + z) * 3 + 1u << 09; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M6: b = (y + z) * 3 + 1u << 06; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M7: b = (y + z) * 3 + 1u << 03; if (x >= b) { x -= b; z += y * 2 + 1u; y += 1u; } y *= 2; z *= 4;
    M8: return x <= (y + z) * 3 ? y : y + 1u;
    }

    private static uint[] cubes = new uint[2048];
    private static void fill_cubes()
    {
        int i = 0; uint a = 0u, b = 1u, c = 6u;
        do
        {
            cubes[i++] = a;
            a += b; b += c; c += 6u;
        }
        while (i < 1626);
        do cubes[i++] = ~0u;
        while (i < 2048);
    }
    private static uint cro32t(uint x)
    {
        uint i = 1u << 10, j = 1u << 9, u = 1u << 30;
        for (; j > 0; j >>= 1)
        {
            if (x < u) i -= j;
            else if (x > u) i += j;
            else return i < 1625u ? i : 1625u;
            u = cubes[i];
        }
        return x < u ? i - 1 : i;
    }

    private static Stopwatch sw = new Stopwatch();
    static void Main(string[] args)
    {
        cro32_all();        // ~1 minute, comment out "cro32t" , uint[] cubes, etc.
                            //  Or try seperate executables.

        fill_cubes();
        cro32t_all();       // ~2 minutes
        Console.ReadLine();
    }
    private static void cro32_all()
    {
        uint x; double t;
        cro32(~0u);
        sw.Restart();
        for (x = 0; x < ~0u; x++) cro32(x);
        sw.Stop();
        t = sw.ElapsedMilliseconds;
        sw.Restart();
        for (x = 0; x < ~0u; x++) ; // nada
        sw.Stop();
        t -= sw.ElapsedMilliseconds;
        Console.Write(x + " roots in " + t + " ms, ");
        Console.WriteLine("mean time {0:.00} ns", t * 1000000 / x);
        // 4294967295 roots in 50686 ms, mean time 11,80 ns
    }
    private static void cro32t_all()
    {
        uint x; double t;
        cro32t(~0u);
        sw.Restart();
        for (x = 0; x < ~0u; x++) cro32t(x);
        sw.Stop();
        t = sw.ElapsedMilliseconds;
        sw.Restart();
        for (x = 0; x < ~0u; x++) ; // nada
        sw.Stop();
        t -= sw.ElapsedMilliseconds;
        Console.Write(x + " roots in " + t + " ms, ");
        Console.WriteLine("mean time {0:.00} ns", t * 1000000 / x);
        // 4294967295 roots in 129254 ms, mean time 30,09 ns
    }
}