[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [computer-go] Designing faster, better influence functions
> I don't think I would want to get into using CPU-specific instructions.
But
> I must say I've sometimes thought about what could be useful to have in
> terms of Go-specific hardware. In Tsume-Goliath I think I found that the
> single most expensive function was getNlib(), which would retrieve the
> number of liberties of a chain at a certain coordinate and put the
> coordinates of those liberties in a little list, with a maximum of 'n'
> liberties. A very simple and basic function, used in a lot of different
> places. The total time spent in this one function was something like 20 or
> 30%.
I noticed that those functions take a lot of time too so I designed a
(perhaps slighty buggy :) MMX implementation:
It takes a chain and a map of empty board points and returns a liberty map.
It takes around 175 machine instructions.
I find this unacceptably slow but have not had the time to think about this
further.
(*
function GetLiberties(stringLibs {eax}, stringStones {edx}, EmptyPoints
{ecx}: PS64): S32 {eax}; register;
{ TODO : optimize better for pipelining & pairing & prefetching & cache
alignment (32 bytes) & multithreading} // prefetchnta instruction, nop's
when not paired // buggy on some OSes!?
const
const_0f: S64 = $0f0f0f0f0f0f0f0f;
const_33: S64 = $3333333333333333;
const_55: S64 = $5555555555555555;
asm
movq mm0, [edx] // stones 1
movq mm1, [edx + 8] // stones 2
movq mm2, [edx + 48] // stones 7
movq mm3, mm0
movq mm4, mm0
psllq mm3, 1
psrlq mm4, 1
psllq mm2, 20
por mm3, mm1
movq mm7, [ecx] // empty 1
por mm3, mm4
movq mm4, mm1
por mm2, mm3
movq mm3, mm1
pand mm7, mm2
movq mm2, [edx + 16] // stones 3
movq [eax], mm7 // libs 1
psllq mm3, 1
psrlq mm4, 1
por mm3, mm4
movq mm7, [ecx + 8] // empty 2
por mm3, mm0
movq mm5, [edx + 24] // stones 4
por mm3, mm2
movq mm4, mm2
pand mm7, mm3
movq mm3, mm2
movq [eax + 8], mm7 // libs 2
psllq mm3, 1
psrlq mm4, 1
movq mm7, [ecx + 16] // empty 3
por mm3, mm4
movq mm0, [edx + 32] // stones 5
por mm3, mm5
movq mm4, mm5
por mm3, mm1
psrlq mm4, 1
pand mm7, mm3
movq mm3, mm5
movq [eax + 16], mm7 // libs 3
psllq mm3, 1
movq mm7, [ecx + 24] // empty 4
por mm3, mm4
movq mm1, [edx + 40] // stones 6
por mm3, mm0
movq mm4, mm0
por mm2, mm3
movq mm3, mm0
pand mm7, mm2
psllq mm3, 1
psrlq mm4, 1
movq [eax + 24], mm7 // libs 4
por mm3, mm4
movq mm7, [ecx + 32] // empty 5
por mm3, mm1
movq mm2, [edx + 48] // stones 7
por mm5, mm3
movq mm3, mm1
pand mm7, mm5
movq mm4, mm1
movq [eax + 32], mm7 // libs 5
psllq mm3, 1
psrlq mm4, 1
movq mm7, [ecx + 40] // empty 6
por mm3, mm4
movq mm5, [edx]
por mm3, mm2
movq mm4, mm2
por mm0, mm3
movq mm3, mm2
pand mm7, mm0
psllq mm3, 1
psrlq mm4, 1
movq [eax + 40], mm7 // libs 6
por mm3, mm4
movq mm7, [ecx + 48] // empty 7
por mm3, mm1
psrlq mm5, 20;
movq mm0, [eax]
por mm3, mm5
movq mm1, [eax + 8]
pand mm7, mm3
movq mm2, [eax + 16]
movq [eax + 48], mm7 // libs 7
movq mm3, [eax + 24]
movq mm4, [eax + 32]
movq mm5, [eax + 40]
movq mm6, mm0
pand mm0, mm1
pxor mm1, mm6
movq mm6, mm1
pand mm1, mm2
pxor mm2, mm6
por mm0, mm1
movq mm6, mm3
pand mm3, mm4
pxor mm4, mm6
movq mm6, mm4
pand mm4, mm5
pxor mm5, mm6
por mm3, mm4
movq mm6, mm2
pand mm2, mm5
pxor mm5, mm6
movq mm6, mm5
pand mm5, mm7
pxor mm7, mm6
por mm2, mm5
movq mm6, mm0
pand mm0, mm3
pxor mm3, mm6
movq mm6, mm3
pand mm3, mm2
pxor mm2, mm6
por mm0, mm3
movq mm6, mm7
psrld mm7, 1
pand mm7, const_55
psubd mm6, mm7
movq mm7, mm6
psrld mm6, 2
pand mm7, const_33
pand mm6, const_33
paddd mm7, mm6
movq mm6, mm7
psrld mm7, 4
paddd mm7, mm6
pand mm7, const_0f
pxor mm6, mm6
psadbw mm7, mm6
movq mm6, mm2
psrld mm2, 1
pand mm2, const_55
psubd mm6, mm2
movq mm2, mm6
psrld mm6, 2
pand mm2, const_33
pand mm6, const_33
paddd mm2, mm6
movq mm6, mm2
psrld mm2, 4
paddd mm2, mm6
pand mm2, const_0f
pxor mm6, mm6
psadbw mm2, mm6
paddd mm2, mm2
movq mm6, mm0
psrld mm0, 1
pand mm0, const_55
psubd mm6, mm0
movq mm0, mm6
psrld mm6, 2
pand mm0, const_33
pand mm6, const_33
paddd mm0, mm6
movq mm6, mm0
psrld mm0, 4
paddd mm0, mm6
pand mm0, const_0f
pxor mm6, mm6
psadbw mm0, mm6
psllq mm0, 2
paddd mm0, mm7
paddd mm0, mm2
movd eax, mm0
emms { TODO : speedup }
end;
*)
I am still not happy with it and hope that newer CPU's will have a
population count instruction (count the nr. of bits in a register).
_______________________________________________
computer-go mailing list
computer-go@xxxxxxxxxxxxxxxxx
http://www.computer-go.org/mailman/listinfo/computer-go/