[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [computer-go] Designing faster, better influence functions



> I don't think I would want to get into using CPU-specific instructions.
But
> I must say I've sometimes thought about what could be useful to have in
> terms of Go-specific hardware. In Tsume-Goliath I think I found that the
> single most expensive function was getNlib(), which would retrieve the
> number of liberties of a chain at a certain coordinate and put the
> coordinates of those liberties in a little list, with a maximum of 'n'
> liberties. A very simple and basic function, used in a lot of different
> places. The total time spent in this one function was something like 20 or
> 30%.

I noticed that those functions take a lot of time too so I designed a
(perhaps slighty buggy :) MMX implementation:
It takes a chain and a map of empty board points and returns a liberty map.
It takes around 175 machine instructions.
I find this unacceptably slow but have not had the time to think about this
further.

(*
function GetLiberties(stringLibs {eax}, stringStones {edx}, EmptyPoints
{ecx}: PS64): S32 {eax}; register;
  { TODO : optimize better for pipelining & pairing & prefetching & cache
alignment (32 bytes) & multithreading}  // prefetchnta instruction, nop's
when not paired  // buggy on some OSes!?
const
  const_0f: S64 = $0f0f0f0f0f0f0f0f;
  const_33: S64 = $3333333333333333;
  const_55: S64 = $5555555555555555;

asm
  movq   mm0, [edx]         // stones 1
  movq   mm1, [edx + 8]     // stones 2
  movq   mm2, [edx + 48]    // stones 7
  movq   mm3, mm0
  movq   mm4, mm0
  psllq  mm3, 1
  psrlq  mm4, 1
  psllq  mm2, 20
  por    mm3, mm1
  movq   mm7, [ecx]         // empty 1
  por    mm3, mm4
  movq   mm4, mm1
  por    mm2, mm3
  movq   mm3, mm1
  pand   mm7, mm2
  movq   mm2, [edx + 16]    // stones 3
  movq   [eax], mm7         // libs 1
  psllq  mm3, 1
  psrlq  mm4, 1
  por    mm3, mm4
  movq   mm7, [ecx + 8]     // empty 2
  por    mm3, mm0
  movq   mm5, [edx + 24]    // stones 4
  por    mm3, mm2
  movq   mm4, mm2
  pand   mm7, mm3
  movq   mm3, mm2
  movq   [eax + 8], mm7     // libs 2
  psllq  mm3, 1
  psrlq  mm4, 1
  movq   mm7, [ecx + 16]    // empty 3
  por    mm3, mm4
  movq   mm0, [edx + 32]    // stones 5
  por    mm3, mm5
  movq   mm4, mm5
  por    mm3, mm1
  psrlq  mm4, 1
  pand   mm7, mm3
  movq   mm3, mm5
  movq   [eax + 16], mm7    // libs 3
  psllq  mm3, 1
  movq   mm7, [ecx + 24]    // empty 4
  por    mm3, mm4
  movq   mm1, [edx + 40]    // stones 6
  por    mm3, mm0
  movq   mm4, mm0
  por    mm2, mm3
  movq   mm3, mm0
  pand   mm7, mm2
  psllq  mm3, 1
  psrlq  mm4, 1
  movq   [eax + 24], mm7    // libs 4
  por    mm3, mm4
  movq   mm7, [ecx + 32]    // empty 5
  por    mm3, mm1
  movq   mm2, [edx + 48]    // stones 7
  por    mm5, mm3
  movq   mm3, mm1
  pand   mm7, mm5
  movq   mm4, mm1
  movq   [eax + 32], mm7    // libs 5
  psllq  mm3, 1
  psrlq  mm4, 1
  movq   mm7, [ecx + 40]    // empty 6
  por    mm3, mm4
  movq   mm5, [edx]
  por    mm3, mm2
  movq   mm4, mm2
  por    mm0, mm3
  movq   mm3, mm2
  pand   mm7, mm0
  psllq  mm3, 1
  psrlq  mm4, 1
  movq   [eax + 40], mm7    // libs 6
  por    mm3, mm4
  movq   mm7, [ecx + 48]    // empty 7
  por    mm3, mm1
  psrlq  mm5, 20;
  movq   mm0, [eax]
  por    mm3, mm5
  movq   mm1, [eax + 8]
  pand   mm7, mm3
  movq   mm2, [eax + 16]
  movq   [eax + 48], mm7    // libs 7
  movq   mm3, [eax + 24]
  movq   mm4, [eax + 32]
  movq   mm5, [eax + 40]
  movq   mm6, mm0
  pand   mm0, mm1
  pxor   mm1, mm6
  movq   mm6, mm1
  pand   mm1, mm2
  pxor   mm2, mm6
  por    mm0, mm1
  movq   mm6, mm3
  pand   mm3, mm4
  pxor   mm4, mm6
  movq   mm6, mm4
  pand   mm4, mm5
  pxor   mm5, mm6
  por    mm3, mm4
  movq   mm6, mm2
  pand   mm2, mm5
  pxor   mm5, mm6
  movq   mm6, mm5
  pand   mm5, mm7
  pxor   mm7, mm6
  por    mm2, mm5
  movq   mm6, mm0
  pand   mm0, mm3
  pxor   mm3, mm6
  movq   mm6, mm3
  pand   mm3, mm2
  pxor   mm2, mm6
  por    mm0, mm3
  movq   mm6, mm7
  psrld  mm7, 1
  pand   mm7, const_55
  psubd  mm6, mm7
  movq   mm7, mm6
  psrld  mm6, 2
  pand   mm7, const_33
  pand   mm6, const_33
  paddd  mm7, mm6
  movq   mm6, mm7
  psrld  mm7, 4
  paddd  mm7, mm6
  pand   mm7, const_0f
  pxor   mm6, mm6
  psadbw mm7, mm6
  movq   mm6, mm2
  psrld  mm2, 1
  pand   mm2, const_55
  psubd  mm6, mm2
  movq   mm2, mm6
  psrld  mm6, 2
  pand   mm2, const_33
  pand   mm6, const_33
  paddd  mm2, mm6
  movq   mm6, mm2
  psrld  mm2, 4
  paddd  mm2, mm6
  pand   mm2, const_0f
  pxor   mm6, mm6
  psadbw mm2, mm6
  paddd  mm2, mm2
  movq   mm6, mm0
  psrld  mm0, 1
  pand   mm0, const_55
  psubd  mm6, mm0
  movq   mm0, mm6
  psrld  mm6, 2
  pand   mm0, const_33
  pand   mm6, const_33
  paddd  mm0, mm6
  movq   mm6, mm0
  psrld  mm0, 4
  paddd  mm0, mm6
  pand   mm0, const_0f
  pxor   mm6, mm6
  psadbw mm0, mm6
  psllq  mm0, 2
  paddd  mm0, mm7
  paddd  mm0, mm2
  movd   eax, mm0
  emms  { TODO : speedup }
end;
*)


I am still not happy with it and hope that newer CPU's will have a
population count instruction (count the nr. of bits in a register).

_______________________________________________
computer-go mailing list
computer-go@xxxxxxxxxxxxxxxxx
http://www.computer-go.org/mailman/listinfo/computer-go/