Slide 39
Slide 39 text
@chewxy
on
Twi-er
//
func
scale64(s
float64,
a
[]float64)
TEXT
·∙scale64(SB),
7,
$0
MOVSD
s+0(FP),
X0
MOVQ
x_data+8(FP),
SI
//
move
the
first
element
to
the
indexing
register
MOVQ
x_len+24(FP),
BP
//
move
the
length
into
BP
//
check
that
there
is
more
than
4
values
in
the
slice
SUBQ
$4,
BP
JL
remainders
//
jump
to
remainder
//
manually
broadcast
s
into
X0's
lower
bits,
which
is
used
for
SIMD
MOVLHPS
X0,
X0
loop:
//
load
the
first
two
elements,
and
then
multiply
by
s
(which
is
in
X0)
MOVUPD
(SI),
X2
MULPD
X0,
X2
//
load
second
two
elements,
then
multiply
by
s
MOVUPD
16(SI),
X4
MULPD
X0,
X4
//
move
the
values
back
to
the
original
positions
MOVUPD
X2,
(SI)
MOVUPD
X4,
16(SI)
//
now
that
we
are
done
with
the
first
4
elements,
update
the
pointers
to
the
top
of
the
next
4
elements
ADDQ
$32,
SI
SUBQ
$4,
BP
//
the
len
is
now
less
4
JGE
loop
remainders:
ADDQ
$4,
BP
//
we
subtracted
earlier,
remember??!!!
JEQ
done
//
if
there
are
nothing
left
to
process,
we'll
go
to
done
remainingloop:
//
load
element
from
x,
and
then
multiply
by
s
(which
in
X0)
//
TODO:
check
if
there
is
a
performance
penalty
on
using
SSE
registers
MOVSD
(SI),
X2
MULSD
X0,
X2
//
save
it
back
to
the
array
MOVSD
X2,
(SI)
//
update
the
pointer
ADDQ
$8,
SI
DECQ
BP
JNE
remainingloop
done:
RET