Run Code
|
API
|
Code Wall
|
Misc
|
Feedback
|
Login
|
Theme
|
Privacy
|
Patreon
avx2 optimized pcg32 random number generator
#pragma GCC push_options #pragma GCC target "avx2" #include <stdint.h> #include "immintrin.h" #define PCG_DEFAULT_MULTIPLIER_64 6364136223846793005ULL #define PCG_DEFAULT_INCREMENT_64 1442695040888963407ULL #define PCG_DEFAULT_MULTIPLIER_32 747796405U #define PCG_DEFAULT_INCREMENT_32 2891336453U #define PCG_128BIT_CONSTANT(high,low) \ ((((__int128_t)high) << 64) + low) #define PCG_DEFAULT_MULTIPLIER_128 \ PCG_128BIT_CONSTANT(2549297995355413924ULL,4865540595714422341ULL) #define advance_inc0i(cur_mult0, cur_plus0, acc_mult0, acc_plus0) (acc_plus0) #define advance_inc1i(cur_mult1, cur_plus1, acc_mult1, acc_plus1) \ advance_inc0i((cur_mult1) * (cur_mult1), ((cur_mult1) + 1)*(cur_plus1), \ (acc_mult1) * (cur_mult1), (acc_plus1) * (cur_mult1) + (cur_plus1)) #define advance_inc2i(cur_mult2, cur_plus2, acc_mult2, acc_plus2) \ advance_inc1i((cur_mult2) * (cur_mult2), ((cur_mult2) + 1)*(cur_plus2), (acc_mult2), (acc_plus2) ) #define advance_inc3i(cur_mult, cur_plus, acc_mult, acc_plus) \ advance_inc1i((cur_mult) * (cur_mult), ((cur_mult) + 1)*(cur_plus), \ (acc_mult) * (cur_mult), (acc_plus) * (cur_mult) + (cur_plus)) #define advance_inc4i(cur_mult, cur_plus, acc_mult, acc_plus) \ advance_inc2i((cur_mult) * (cur_mult), ((cur_mult) + 1)*(cur_plus), (acc_mult), (acc_plus) ) #define advance_inc5i(cur_mult, cur_plus, acc_mult, acc_plus) \ advance_inc2i((cur_mult) * (cur_mult), ((cur_mult) + 1)*(cur_plus), \ (acc_mult) * (cur_mult), (acc_plus) * (cur_mult) + (cur_plus)) #define advance_inc6i(cur_mult6, cur_plus6, acc_mult6, acc_plus6) \ advance_inc3i((cur_mult6) * (cur_mult6), ((cur_mult6) + 1)*(cur_plus6), (acc_mult6), (acc_plus6) ) #define advance_inc7i(cur_mult, cur_plus, acc_mult, acc_plus) \ advance_inc3i((cur_mult) * (cur_mult), ((cur_mult) + 1)*(cur_plus), \ (acc_mult) * (cur_mult), (acc_plus) * (cur_mult) + (cur_plus)) #define advance_inc8i(cur_mult, cur_plus, acc_mult, acc_plus) \ advance_inc4i((cur_mult) * (cur_mult), ((cur_mult) + 1)*(cur_plus), (acc_mult), (acc_plus) ) #define advance_incm(delta, cur_mult, cur_plus) \ advance_inc ## delta ##i(cur_mult, cur_plus, 1u, 0u) #define advance_mul0i(cur_mult0, cur_plus0, acc_mult0, acc_plus0) (acc_mult0) #define advance_mul1i(cur_mult1, cur_plus1, acc_mult1, acc_plus1) \ advance_mul0i((cur_mult1) * (cur_mult1), ((cur_mult1) + 1)*(cur_plus1), \ (acc_mult1) * (cur_mult1), (acc_plus1) * (cur_mult1) + (cur_plus1)) #define advance_mul2i(cur_mult2, cur_plus2, acc_mult2, acc_plus2) \ advance_mul1i((cur_mult2) * (cur_mult2), ((cur_mult2) + 1)*(cur_plus2), (acc_mult2), (acc_plus2) ) #define advance_mul3i(cur_mult, cur_plus, acc_mult, acc_plus) \ advance_mul1i((cur_mult) * (cur_mult), ((cur_mult) + 1)*(cur_plus), \ (acc_mult) * (cur_mult), (acc_plus) * (cur_mult) + (cur_plus)) #define advance_mul4i(cur_mult, cur_plus, acc_mult, acc_plus) \ advance_mul2i((cur_mult) * (cur_mult), ((cur_mult) + 1)*(cur_plus), (acc_mult), (acc_plus) ) #define advance_mul5i(cur_mult, cur_plus, acc_mult, acc_plus) \ advance_mul2i((cur_mult) * (cur_mult), ((cur_mult) + 1)*(cur_plus), \ (acc_mult) * (cur_mult), (acc_plus) * (cur_mult) + (cur_plus)) #define advance_mul6i(cur_mult6, cur_plus6, acc_mult6, acc_plus6) \ advance_mul3i((cur_mult6) * (cur_mult6), ((cur_mult6) + 1)*(cur_plus6), (acc_mult6), (acc_plus6) ) #define advance_mul7i(cur_mult, cur_plus, acc_mult, acc_plus) \ advance_mul3i((cur_mult) * (cur_mult), ((cur_mult) + 1)*(cur_plus), \ (acc_mult) * (cur_mult), (acc_plus) * (cur_mult) + (cur_plus)) #define advance_mul8i(cur_mult, cur_plus, acc_mult, acc_plus) \ advance_mul4i((cur_mult) * (cur_mult), ((cur_mult) + 1)*(cur_plus), (acc_mult), (acc_plus) ) #define advance_mulm(delta, cur_mult, cur_plus) \ advance_mul ## delta ##i(cur_mult, cur_plus, 1u, 0u) inline uint64_t pcg_rotr_64(uint64_t value, unsigned int rot) { #if 0 && PCG_USE_INLINE_ASM && __clang__ && __x86_64__ // For whatever reason, clang actually *does* generate rotq by // itself, so we don't need this code. asm ("rorq %%cl, %0" : "=r" (value) : "0" (value), "c" (rot)); return value; #else return (value >> rot) | (value << ((- rot) & 63)); #endif } struct pcg_state_mcg_128_xsl_rr_64 { __int128_t _state; pcg_state_mcg_128_xsl_rr_64(__int128_t seed = 42ULL) : _state(0ULL) { _state = oneseq_step(_state); _state += seed; _state = oneseq_step(_state); }; static __int128_t oneseq_step(__int128_t state) { return state * PCG_DEFAULT_MULTIPLIER_128; } uint64_t output(__int128_t state) const { uint64_t word = ((state >> ((state >> 59u) + 5u)) ^ state)* 12605985483714917081ull; return (word >> 43u) ^ word; } uint64_t rand() { __int128_t oldstate = _state; _state = oneseq_step(_state); return output(oldstate); } void fill(uint32_t *buffer, const unsigned bytes) { __int128_t lcgstate = _state; for (unsigned id = 0; id < bytes/sizeof(uint32_t); id+=2) { __int128_t oldstate = lcgstate; lcgstate = oneseq_step(lcgstate); uint64_t rnd = output(oldstate); buffer[id] = rnd; buffer[id+1] = rnd >> 32; } _state = lcgstate; } }; struct pcg_state_64_rxs_m_xs_64 { uint64_t _state; pcg_state_64_rxs_m_xs_64(uint64_t seed = 42UL) : _state(0UL) { _state = oneseq_step(_state); _state += seed; _state = oneseq_step(_state); }; static uint64_t oneseq_step(uint64_t state) { return state * PCG_DEFAULT_MULTIPLIER_64 + PCG_DEFAULT_INCREMENT_64; } uint64_t output(__int128_t state) const { return pcg_rotr_64(((uint64_t)(state >> 64u)) ^ (uint64_t)state, state >> 122u); } uint64_t rand() { __int128_t oldstate = _state; _state = oneseq_step(_state); return output(oldstate); } void fill(uint32_t *buffer, const unsigned bytes) { __int128_t lcgstate = _state; for (unsigned id = 0; id < bytes/sizeof(uint32_t); id+=2) { __int128_t oldstate = lcgstate; lcgstate = oneseq_step(lcgstate); uint64_t rnd = output(oldstate); buffer[id] = rnd; buffer[id+1] = rnd >> 32; } _state = lcgstate; } }; template<bool fillavx2> struct pcg_state_32 { uint32_t _state; pcg_state_32(uint32_t seed = 42U) : _state(0U) { _state = oneseq_step(_state); _state += seed; _state = oneseq_step(_state); }; static uint32_t oneseq_step(uint32_t state) { return state * PCG_DEFAULT_MULTIPLIER_32 + PCG_DEFAULT_INCREMENT_32; } uint32_t output(uint32_t state) const { uint32_t word = ((state >> ((state >> 28u) + 4u)) ^ state) * 277803737u; return (word >> 22u) ^ word; } uint32_t rand() { uint32_t oldstate = _state; _state = oneseq_step(_state); return output(oldstate); } void fill(uint32_t *buffer, const unsigned bytes) { if (fillavx2) return fillavx(buffer, bytes); else return fillscalar(buffer, bytes); } void fillscalar(uint32_t *buffer, const unsigned bytes) { uint32_t lcgstate = _state; for (unsigned id = 0; id < bytes/sizeof(uint32_t); id++) { uint32_t oldstate = lcgstate; lcgstate = oneseq_step(lcgstate); buffer[id] = output(oldstate); } _state = lcgstate; } __attribute__ ((target("avx2"))) void fillavx(uint32_t *buffer, const unsigned bytes) { __m256i lcgstate = _mm256_set1_epi32(_state); const __m256i pcg_advance_mul = _mm256_set_epi32( advance_mulm(7, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_mulm(6, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_mulm(5, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_mulm(4, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_mulm(3, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_mulm(2, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_mulm(1, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_mulm(0, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32) ); const __m256i pcg_advance_inc = _mm256_set_epi32( advance_incm(7, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_incm(6, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_incm(5, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_incm(4, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_incm(3, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_incm(2, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_incm(1, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_incm(0, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32) ); lcgstate = _mm256_mullo_epi32(lcgstate, pcg_advance_mul); lcgstate = _mm256_add_epi32(lcgstate, pcg_advance_inc); const __m256i pcg_mul = _mm256_set1_epi32 (advance_mulm(8, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32)); const __m256i pcg_add = _mm256_set1_epi32 (advance_incm(8, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32)); const uint32_t shiftbits = 28u; const __m256i shiftadd = _mm256_set1_epi32(4); const uint32_t wordshift = 22u; const __m256i mul = _mm256_set1_epi32(277803737u); union { uint32_t *b; __m256i *v; } ptrconvert; ptrconvert.b = buffer; for (unsigned id = 0; id < bytes/sizeof(__m256i); id++) { __m256i oldstate = lcgstate; /* Advance lcg state */ lcgstate = _mm256_mullo_epi32(lcgstate, pcg_mul); lcgstate = _mm256_add_epi32(lcgstate, pcg_add); /* rcs_m_xs output function */ __m256i shift = _mm256_srli_epi32(oldstate, shiftbits); shift = _mm256_add_epi32(shift, shiftadd); __m256i word = _mm256_srlv_epi32(oldstate, shift); word = _mm256_xor_si256(word, oldstate); word = _mm256_mullo_epi32(word, mul); __m256i random = _mm256_xor_si256(word,_mm256_srli_epi32(word, wordshift)); ptrconvert.v[id] = random; } _state = _mm256_extract_epi32(lcgstate, 0); } }; template<size_t ncache> struct pcg_state_32_simd { __m256i _lcg; union { __m256i _vcache[ncache]; uint32_t _icache[ncache*sizeof(__m256i)/sizeof(uint32_t)]; }; unsigned _id; const unsigned _max = ncache*sizeof(__m256i)/sizeof(uint32_t); pcg_state_32_simd(uint32_t seed = 42U) : _id(_max) { uint32_t state = 0U; state = oneseq_step(state); state += seed; state = oneseq_step(state); _lcg = _mm256_set1_epi32(state); const __m256i pcg_advance_mul = _mm256_set_epi32( advance_mulm(7, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_mulm(6, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_mulm(5, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_mulm(4, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_mulm(3, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_mulm(2, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_mulm(1, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_mulm(0, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32) ); const __m256i pcg_advance_inc = _mm256_set_epi32( advance_incm(7, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_incm(6, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_incm(5, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_incm(4, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_incm(3, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_incm(2, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_incm(1, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32), advance_incm(0, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32) ); _lcg = _mm256_mullo_epi32(_lcg, pcg_advance_mul); _lcg = _mm256_add_epi32(_lcg, pcg_advance_inc); }; static uint32_t oneseq_step(uint32_t state) { return state * PCG_DEFAULT_MULTIPLIER_32 + PCG_DEFAULT_INCREMENT_32; } uint32_t rand() { if (_id >= _max) { _id = 0; fill(_icache, _max * sizeof(uint32_t)); } return _icache[_id++]; } __attribute__ ((target("avx2"))) void fill(uint32_t *buffer, const unsigned bytes) { const __m256i pcg_mul = _mm256_set1_epi32 (advance_mulm(8, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32)); const __m256i pcg_add = _mm256_set1_epi32 (advance_incm(8, PCG_DEFAULT_MULTIPLIER_32, PCG_DEFAULT_INCREMENT_32)); const uint32_t shiftbits = 28u; const __m256i shiftadd = _mm256_set1_epi32(4); const uint32_t wordshift = 22u; const __m256i mul = _mm256_set1_epi32(277803737u); union { uint32_t *b; __m256i *v; } ptrconvert; ptrconvert.b = buffer; __m256i lcgstate = _lcg; for (unsigned id = 0; id < bytes/sizeof(__m256i); id++) { __m256i oldstate = lcgstate; /* Advance lcg state */ lcgstate = _mm256_mullo_epi32(lcgstate, pcg_mul); lcgstate = _mm256_add_epi32(lcgstate, pcg_add); /* rcs_m_xs output function */ __m256i shift = _mm256_srli_epi32(oldstate, shiftbits); shift = _mm256_add_epi32(shift, shiftadd); __m256i word = _mm256_srlv_epi32(oldstate, shift); word = _mm256_xor_si256(word, oldstate); word = _mm256_mullo_epi32(word, mul); __m256i random = _mm256_xor_si256(word,_mm256_srli_epi32(word, wordshift)); ptrconvert.v[id] = random; } _lcg = lcgstate; } }; #include <x86intrin.h> #include <iostream> struct Timing { uint64_t min, max, count; double avg; Timing() : min(UINT64_MAX), max(0), count(0), avg(0) { } Timing &operator<<(uint64_t diff) { count++; max = std::max(max, diff); min = std::min(min, diff); double fdiff = (double)diff - avg; avg += fdiff/count; return *this; } struct Timer { Timing *_stats; uint64_t _begin; Timer(Timing *stats) : _stats(stats), _begin(__rdtsc()) { } ~Timer() { uint64_t end = __rdtsc(); *_stats << end - _begin; } }; }; #include <typeinfo> #include <string> #define rands 256U #define arrsize (32U*32U) #define repeat 200U template<typename rng> __attribute__((noinline)) void bench() { unsigned i, j, k; const std::string name(typeid(rng).name()); std::cout << "Running " << name << std::endl; /* test single value performance */ { uint32_t xored = 0; rng gen; Timing stats; for (i = 0; i < repeat; i++) { Timing::Timer t(&stats); for (j = 0; j < rands; j++) { xored ^= gen.rand(); } } std::cout << name << " single rand xor cycles per call " << " avg: " << stats.avg/rands << "(" << stats.avg << ")" << " min: " << (double)stats.min/rands << "(" << std::dec << stats.min << ")" << " max: " << (double)stats.max/rands << " xor: " << std::hex << xored << "\n"; } /* test single value performance */ { uint32_t xored = 0; const unsigned sizediv = 16U; __attribute__((aligned(32))) uint32_t buffer[arrsize/sizediv/sizeof(uint32_t)]; rng gen; Timing stats; for (i = 0; i < repeat; i++) { Timing::Timer t(&stats); for (j = 0; j < rands; j++) { gen.fill(buffer, arrsize/sizediv); for (k = 0; k < sizeof(buffer)/sizeof(buffer[0]); k++) xored ^= buffer[k]; } } std::cout << name << " fill " << std::dec << arrsize/sizediv << " bytes array bytes per cycle " << " avg: " << (arrsize*rands/sizediv)/stats.avg << "(" << stats.avg << ")" << " min: " << (double)(arrsize*rands/sizediv)/stats.min << "(" << std::dec << stats.min << ")" << " max: " << (double)(arrsize*rands/sizediv)/stats.max << " xor: " << std::hex << xored << "\n"; } /* test single value performance */ { uint32_t xored = 0; __attribute__((aligned(32))) uint32_t buffer[arrsize/sizeof(uint32_t)]; rng gen; Timing stats; for (i = 0; i < repeat; i++) { Timing::Timer t(&stats); for (j = 0; j < rands; j++) { gen.fill(buffer, arrsize); for (k = 0; k < sizeof(buffer)/sizeof(buffer[0]); k++) xored ^= buffer[k]; } } std::cout << name << " fill " << std::dec << arrsize << " bytes array bytes per cycle " << " avg: " << (arrsize*rands)/stats.avg << "(" << stats.avg << ")" << " min: " << (double)(arrsize*rands)/stats.min << "(" << std::dec << stats.min << ")" << " max: " << (double)(arrsize*rands)/stats.max << " xor: " << std::hex << xored << "\n"; } } int main(void) { bench<pcg_state_32<false>>(); bench<pcg_state_mcg_128_xsl_rr_64>(); bench<pcg_state_64_rxs_m_xs_64>(); bench<pcg_state_32_simd<1> >(); bench<pcg_state_32_simd<2> >(); bench<pcg_state_32_simd<4> >(); bench<pcg_state_32_simd<8> >(); bench<pcg_state_32_simd<16> >(); bench<pcg_state_32<true>>(); return 0; }
run
|
edit
|
history
|
help
0
replace_if-30-Seconds-of-C++
teste
Scemo
FindMedian
graph representation Adjacency List
11aa11
Complete Over-Use of Functor Templates (Academic Experiment)
GET ALL PRIME FACTORS OF A NUMBER
SubprogramModificat
Amisha