/* * Copyright (C) 2007 Michael Lewis * Author: Mike Lewis * * This work is provide AS IS, and has no warranty. * The author is NOT responsible for anything that happens * due to use of this code, either using it or running it on * your system */ #ifdef __SSE2__ #include #include #endif #include #include #include #include "types.h" #include "helpers.h" #include "block.h" void cycle_block( llife_block block, llife_block tl, llife_block t, llife_block tr, llife_block l, llife_block r, llife_block bl, llife_block b, llife_block br ) { int i,j; //const __m128i onemask = _mm_set1_epi8( 0x01 ); const __m128i populate = _mm_set1_epi8( 3 ); __m128i top_left, top, top_right; __m128i left, self, right; __m128i bot_left, bot, bot_right; __m128i itemp; top_left = top = top_right = self = _mm_setzero_si128(); if( t ) { self = _mm_load_si128( (*t->current) + 127 ); left = shift_pack_right( self ); right = shift_pack_left( self ); } if( tl ) { itemp = _mm_load_si128( (*tl->current) + 127 ); left = _mm_or_si128( shift_pack_left127(itemp ), left ); } if( tr ) { itemp = _mm_load_si128( (*tr->current) + 127 ); right = _mm_or_si128( shift_pack_right127( itemp ), right ); } bot = _mm_load_si128( (*block->current) + 0 ); bot_left = shift_pack_right( bot ); bot_right = shift_pack_left( bot ); if( l ) { itemp = _mm_load_si128( (*l->current) + 0 ); bot_left = _mm_or_si128( shift_pack_left127( itemp ), bot_left ); } if( r ) { itemp = _mm_load_si128( (*r->current) + 0 ); bot_right = _mm_or_si128( shift_pack_right127( itemp ), bot_right ); } for( i = 0; i < 128; i++ ) { top_left = left; top = self; top_right = right; left = bot_left; self = bot; right = bot_right; if( i < 127 ) { bot = _mm_load_si128( (*block->current) + i + 1 ); bot_left = shift_pack_right( bot ); bot_right = shift_pack_left( bot ); if( l ) { itemp = _mm_load_si128( (*l->current) + i + 1 ); bot_left = _mm_or_si128( shift_pack_left127( itemp ), bot_left ); } if( r ) { itemp = _mm_load_si128( (*r->current) + i + 1 ); bot_right = _mm_or_si128( shift_pack_right127( itemp ), bot_right ); } } else { bot = _mm_setzero_si128(); if( b ) { bot = (*b->current)[0]; bot_left = shift_pack_right( bot ); bot_right = shift_pack_left( bot ); } if( bl ) { bot_left = _mm_or_si128( shift_pack_left127( (*bl->current)[0] ), bot_left ); } if( br ) { bot_right = _mm_or_si128( shift_pack_right127( (*br->current)[0] ), bot_right ); } } __m128i temp = _mm_setzero_si128(); for( j = 0; j < 8; j++ ) { __m128i _top_left = unpack(top_left, j); __m128i _top = unpack( top, j ); __m128i sum1 = _mm_add_epi8( _top_left, _top ); __m128i _top_right = unpack( top_right, j ); __m128i _right = unpack ( right, j ); __m128i sum2 = _mm_add_epi8( _top_right, _right ); sum1 = _mm_add_epi8( sum1, sum2 ); __m128i _bot = unpack( bot, j ); __m128i _bot_right = unpack( bot_right, j ); __m128i sum4 = _mm_add_epi8( _bot, _bot_right ); __m128i _bot_left = unpack( bot_left, j ); __m128i _left = unpack( left, j ); __m128i sum3 = _mm_add_epi8( _left, _bot_left ); sum3 = _mm_add_epi8( sum3, sum4 ); sum3 = _mm_add_epi8( sum1, sum3 ); sum1 = _mm_cmpeq_epi8( sum3, populate ); __m128i _self = unpack( self, j ); sum3 = _mm_add_epi8( _self, sum3 ); sum3 = _mm_cmpeq_epi8( sum3, populate ); sum3 = _mm_or_si128( sum1, sum3 ); temp = _mm_or_si128( temp, pack( sum3, j ) ); //temp = _mm_or_si128( pack( unpack( bot_right, j ), j ), temp ); } _mm_store_si128( (*block->working) + i, temp ); } } void worldCycle( llife_world world ) { int i, j; //#pragma omp parallel private(i) //#pragma omp for schedule(static) for( i = 0; i < world->width; i++ ) { for( j = 0; j < world->height; j++ ) { llife_block center = get_life_block( world, j, i); if( center ) { llife_block tl, t, tr, l, r, bl, b, br; tl = i > 0 && j > 0 ? get_life_block( world, j-1, i-1 ) : NULL; tr = i < world->width-1 && j > 0 ? get_life_block( world, j-1, i+1 ) : NULL; t = j > 0 ? get_life_block( world, j-1, i ) : NULL; l = i > 0 ? get_life_block( world, j, i-1 ) : NULL; r = i < world->width-1 ? get_life_block( world, j, i+1 ) : NULL; bl = i > 0 && j < world->height-1 ? get_life_block( world, j+1, i-1 ) : NULL; br = i < world->width-1 && j < world->height-1 ? get_life_block( world, j+1, i+1 ) : NULL; b = j < world->height-1 ? get_life_block( world, j+1, i ) : NULL; cycle_block( center, tl, t, tr, l, r, bl, b, br ); } } } for( i = 0; i < world->width * world->height; i++ ) { llife_block center = world->blocks[i]; data_block * mtemp = center->current; center->current = center->working; center->working = mtemp; } } //returns NULL if fail; life_block *allocate_block() { //We're going to do two allocs so our data is alligned llife_block lb = calloc( 1, sizeof( life_block ) ); if( !lb ) { return 0; } //Allocate both of the buffers in 1 chunk. It will make life easier lb->allocated = (data_block*)calloc( 1, sizeof( data_block ) * 2 + 0x7F );// calloc( 2, sizeof( __m128i ) * 128 ); if( !lb->allocated ) { free( lb ); return 0; } lb->current = (data_block*)((((unsigned long)lb->allocated) + 0x7F) & ~0x7F); lb->working = &(lb->current)[1]; return lb; } void free_block( life_block *block ) { if( !block ) { return; } free( block->allocated ); free( block ); } llife_world allocate_world( int height, int width ) { llife_world world = malloc( sizeof( life_world ) ); world->blocks = calloc( width * height, sizeof( llife_block ) ); int i; for( i = 0; i < width * height; i++ ) { llife_block blk = allocate_block(); if( !blk ) { free_world( world ); return 0; } world->blocks[i] = blk; } world->width = width; world->height = height; world->origin_x = world->origin_y = 0; return world; } void free_world( llife_world world ) { if( !world ) { return; } int i; for( i = 0; i < world->height * world->width; i++ ) { free_block( world->blocks[i] ); } free( world->blocks ); } void set_bit_in_world( llife_world world, int row, int column ) { int block_column = column / 128; int block_row = row / BLOCK_HEIGHT; int inner_column = column % 128; int inner_row = row % BLOCK_HEIGHT; llife_block blk = get_life_block( world, block_row, block_column ); set_bit_in_block( blk, inner_row, inner_column ); } // XX d3 d2 a3 a2 a1 b3 b2 b1 c3 c2 c1 // a3 a2 a1 b3 b2 b1 c3 c2 d1 d3 d2 d1 // b3 b2 b1 c3 c2 c1 d3 d2 d1 a2 a1 XX