/* find the bits set in a word (64-bit only) */

#include <stdio.h>
#include "../cputimestamps/getcycles.h"

#define GOS 100000000UL

/* (current) gcc method for generic implementation of popcount */
const int count_table[256] =
{
    0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
    1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
    1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
    2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
    1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
    2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
    2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
    3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8
};

static inline unsigned long countit_table(unsigned long x) {
  int i, ret = 0;

  for (i = 0; i < 64; i += 8)
    ret += count_table[(x >> i) & 0xff];

  return ret;
}

/* Suggested by "Algorithms for programmers" Jorg Arndt
   http://www.jjj.de/fxt
   Similar thing seen in the AMD performance tuning handbook */
static inline unsigned long countit_shift(unsigned long x)
{
	x = ((x>>1) & 0x5555555555555555) + (x & 0x5555555555555555);
	x = ((x>>2) & 0x3333333333333333) + (x & 0x3333333333333333);
	x = ((x>>4) + x) & 0x0f0f0f0f0f0f0f0f;
	x += x>> 8;
	x += x>>16;
	x += x>>32;
	return x & 0xff;
}

int main(void)
{
	cycle_t cycles = 0;
	unsigned long r=0,i;

	for(i=0; i < GOS; i++)
	{
		cycle_t start = get_cycles();
		/* falls through to instructions where it can */
		r += __builtin_popcountl(i);
		cycles += (get_cycles() - start);
	}
	printf("built-in average is %ld (%ld)\n", cycles/GOS, r);
	r = 0;

	for(i=0; i < GOS; i++)
	{
		cycle_t start = get_cycles();
		r += countit_table(i);
		cycles += (get_cycles() - start);
	}
	printf("table average is %ld (%ld)\n", cycles/GOS, r);
	r = 0;

	for(i=0; i < GOS; i++)
	{
		cycle_t start = get_cycles();
		r += countit_shift(i);
		cycles += (get_cycles() - start);
	}
	printf("shift average is %ld (%ld)\n", cycles/GOS, r);

	return 0;
}
