Collect the directory to move

This commit is contained in:
Travis E. Oliphant 2012-11-10 18:43:19 -06:00
commit 4bc9f00d95
12 changed files with 1108 additions and 0 deletions

View file

View file

@ -0,0 +1,117 @@
'''
Based on test_pthread.py, extended to use atomic instructions.
'''
from llvm.core import *
from llvm.passes import *
from llvm.ee import *
from llvm_cbuilder import *
import llvm_cbuilder.shortnames as C
import unittest, logging
import sys
# logging.basicConfig(level=logging.DEBUG)
# Number of pthreads the generated 'manager' function creates; it also calls
# the worker once itself, so NUM_OF_THREAD + 1 workers run in total.
NUM_OF_THREAD = 4
# Number of loop iterations (atomic increments) each worker performs.
REPEAT = 10000
def gen_test_worker(mod):
    """Emit the LLVM function ``worker(int*)`` into *mod* and return it.

    The generated loop runs REPEAT times; every iteration atomically adds 1
    (ordering 'acq_rel') to the integer the argument points to.
    """
    builder = CBuilder.new_function(mod, 'worker', C.void, [C.pointer(C.int)])
    target = builder.args[0]
    step = builder.constant(target.type.pointee, 1)

    counter = builder.var(C.int, 0)
    bound = builder.constant(C.int, REPEAT)
    with builder.loop() as loop:
        with loop.condition() as setcond:
            setcond(counter < bound)
        with loop.body():
            builder.atomic_add(target, step, 'acq_rel')
            counter += step

    builder.ret()
    builder.close()
    return builder.function
def gen_test_pthread(mod):
    """Emit the LLVM function ``manager(int) -> int`` into *mod* and return it.

    The generated code emits NUM_OF_THREAD ``pthread_create`` calls that run
    'worker' on the address of the argument, calls 'worker' once directly,
    joins every thread and finally returns the argument's value.
    'worker', 'pthread_create' and 'pthread_join' must already be in *mod*.
    """
    builder = CBuilder.new_function(mod, 'manager', C.int, [C.int])
    shared = builder.args[0]

    worker = builder.get_function_named('worker')
    create = builder.get_function_named('pthread_create')
    join = builder.get_function_named('pthread_join')

    null = builder.constant_null(C.void_p)

    def as_void_p(value):
        # The pthread prototypes are declared with void* everywhere.
        return value.cast(C.void_p)

    handles = builder.array(C.void_p, NUM_OF_THREAD)

    for tid in range(NUM_OF_THREAD):
        create_args = [handles[tid].reference(),
                       null,
                       worker,
                       shared.reference()]
        create(*[as_void_p(item) for item in create_args])

    # The manager thread does one share of the work itself.
    worker(shared.reference())

    for tid in range(NUM_OF_THREAD):
        join(*[as_void_p(item) for item in (handles[tid], null)])

    builder.ret(shared)
    builder.close()
    return builder.function
class TestAtomicAdd(unittest.TestCase):
    """Checks that concurrent atomic adds never lose an update."""

    @unittest.skipIf(sys.platform == 'win32', "test uses pthreads, not supported on Windows")
    def test_atomic_add(self):
        mod = Module.new(__name__)

        # Declare the pthread entry points; every parameter is a void*.
        create_type = Type.function(C.int, [C.void_p] * 4)
        mod.add_function(create_type, 'pthread_create')
        join_type = Type.function(C.int, [C.void_p] * 2)
        mod.add_function(join_type, 'pthread_join')

        worker_fn = gen_test_worker(mod)
        manager_fn = gen_test_pthread(mod)
        logging.debug(mod)
        mod.verify()

        # Optimize at level 3, function passes first and module passes after.
        fpm = FunctionPassManager.new(mod)
        mpm = PassManager.new()
        pmb = PassManagerBuilder.new()
        pmb.vectorize = True
        pmb.opt_level = 3
        pmb.populate(fpm)
        pmb.populate(mpm)

        for lfunc in (worker_fn, manager_fn):
            fpm.run(lfunc)
        mpm.run(mod)
        logging.debug(mod)
        mod.verify()

        # Run: request the address of 'worker' from the engine before
        # calling the manager.
        exe = CExecutor(mod)
        exe.engine.get_pointer_to_function(mod.get_function_named('worker'))
        manager = exe.get_ctype_function(manager_fn, 'int, int')

        inarg = 1234
        expected = inarg + (NUM_OF_THREAD + 1) * REPEAT
        # Repeat many times to give a race condition a chance to show up.
        for _ in range(1000):
            self.assertEqual(manager(inarg), expected, "Unexpected race condition")
# Run the tests in this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()

View file

@ -0,0 +1,124 @@
'''
Based on test_pthread.py, extended to use atomic instructions.
'''
from llvm.core import *
from llvm.passes import *
from llvm.ee import *
from llvm_cbuilder import *
import llvm_cbuilder.shortnames as C
import unittest, logging
import sys
# logging.basicConfig(level=logging.DEBUG)
# Number of pthreads the generated 'manager' function creates; it also calls
# the worker once itself, so NUM_OF_THREAD + 1 workers run in total.
NUM_OF_THREAD = 4
# Number of successful increments each worker must complete.
REPEAT = 10000
def gen_test_worker(mod):
    """Emit the LLVM function ``worker(int*)`` into *mod* and return it.

    Each loop iteration atomically loads the pointed-to integer ('acquire'),
    then tries to replace it with value + 1 through an atomic compare-and-
    exchange ('release').  The iteration counter only advances when the
    value returned by the exchange equals the value that was loaded, so the
    loop ends after REPEAT such iterations.
    """
    builder = CBuilder.new_function(mod, 'worker', C.void, [C.pointer(C.int)])
    target = builder.args[0]
    step = builder.constant(target.type.pointee, 1)

    done = builder.var(C.int, 0)
    bound = builder.constant(C.int, REPEAT)
    with builder.loop() as loop:
        with loop.condition() as setcond:
            setcond(done < bound)
        with loop.body():
            seen = target.atomic_load('acquire')
            wanted = seen + step
            previous = target.atomic_cmpxchg(seen, wanted, 'release')
            with builder.ifelse(previous == seen) as branch:
                with branch.then():
                    done += step

    builder.ret()
    builder.close()
    return builder.function
def gen_test_pthread(mod):
    """Emit the LLVM function ``manager(int) -> int`` into *mod* and return it.

    The generated code emits NUM_OF_THREAD ``pthread_create`` calls that run
    'worker' on the address of the argument, calls 'worker' once directly,
    joins every thread and finally returns the argument's value.
    'worker', 'pthread_create' and 'pthread_join' must already be in *mod*.
    """
    builder = CBuilder.new_function(mod, 'manager', C.int, [C.int])
    shared = builder.args[0]

    worker = builder.get_function_named('worker')
    create = builder.get_function_named('pthread_create')
    join = builder.get_function_named('pthread_join')

    null = builder.constant_null(C.void_p)

    def as_void_p(value):
        # The pthread prototypes are declared with void* everywhere.
        return value.cast(C.void_p)

    handles = builder.array(C.void_p, NUM_OF_THREAD)

    for tid in range(NUM_OF_THREAD):
        create_args = [handles[tid].reference(),
                       null,
                       worker,
                       shared.reference()]
        create(*[as_void_p(item) for item in create_args])

    # The manager thread does one share of the work itself.
    worker(shared.reference())

    for tid in range(NUM_OF_THREAD):
        join(*[as_void_p(item) for item in (handles[tid], null)])

    builder.ret(shared)
    builder.close()
    return builder.function
class TestAtomicCmpXchg(unittest.TestCase):
    """Checks that a compare-and-exchange retry loop never loses an update."""

    @unittest.skipIf(sys.platform == 'win32', "test uses pthreads, not supported on Windows")
    def test_atomic_cmpxchg(self):
        mod = Module.new(__name__)

        # Declare the pthread entry points; every parameter is a void*.
        create_type = Type.function(C.int, [C.void_p] * 4)
        mod.add_function(create_type, 'pthread_create')
        join_type = Type.function(C.int, [C.void_p] * 2)
        mod.add_function(join_type, 'pthread_join')

        worker_fn = gen_test_worker(mod)
        manager_fn = gen_test_pthread(mod)
        logging.debug(mod)
        mod.verify()

        # Optimize at level 3, function passes first and module passes after.
        fpm = FunctionPassManager.new(mod)
        mpm = PassManager.new()
        pmb = PassManagerBuilder.new()
        pmb.vectorize = True
        pmb.opt_level = 3
        pmb.populate(fpm)
        pmb.populate(mpm)

        for lfunc in (worker_fn, manager_fn):
            fpm.run(lfunc)
        mpm.run(mod)
        logging.debug(mod)
        mod.verify()

        # Run: request the address of 'worker' from the engine before
        # calling the manager.
        exe = CExecutor(mod)
        exe.engine.get_pointer_to_function(mod.get_function_named('worker'))
        manager = exe.get_ctype_function(manager_fn, 'int, int')

        inarg = 1234
        expected = inarg + (NUM_OF_THREAD + 1) * REPEAT
        # Repeat many times to give a race condition a chance to show up.
        for _ in range(1000):
            res = manager(inarg)
            self.assertEqual(res, expected,
                             "Unexpected race condition: res = %d" % res)
# Run the tests in this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()

View file

@ -0,0 +1,17 @@
from llvm.core import *
from llvm_cbuilder import *
from llvm_cbuilder import shortnames as C
import unittest
class TestCstrCollide(unittest.TestCase):
    """Checks that requesting the same string constant twice is consistent."""

    def test_same_string(self):
        mod = Module.new(__name__)
        builder = CBuilder.new_function(mod, 'test_cstr_collide', C.void, [])
        # Two requests for identical text must expose equal ``value``s.
        first = builder.constant_string("hello")
        second = builder.constant_string("hello")
        self.assertEqual(first.value, second.value)
# Run the tests in this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()

View file

@ -0,0 +1,125 @@
from llvm.core import *
from llvm.passes import *
from llvm.ee import *
from llvm_cbuilder import *
import llvm_cbuilder.shortnames as C
import unittest, logging
def is_prime(x):
    """Return True when the integer *x* is a prime number.

    This is the pure-Python reference the generated LLVM functions are
    compared against.  Values below 2 (0, 1 and negatives) are not prime;
    the previous ``x <= 2`` shortcut reported all of them as prime.

    NOTE(review): the generated ``isprime``/``isprime_fast`` functions in
    this file still return true for every argument <= 2; the tests only
    compare the range 2..999, where both agree.
    """
    if x < 2:
        return False
    if x == 2:
        return True
    if (x % 2) == 0:
        return False
    # Only odd divisors up to sqrt(x) need checking once 2 is ruled out.
    for y in range(3, int(x ** 0.5) + 1, 2):
        if (x % y) == 0:
            return False
    return True
def gen_is_prime(mod):
    """Add the LLVM function ``isprime(int) -> int`` to *mod* and return it.

    The generated code returns 1 for arguments <= 2, 0 for even arguments,
    and otherwise trial-divides by every odd number from 3 up to (but not
    including) the argument, returning 0 on the first exact division and 1
    when none is found.
    """
    func = mod.add_function(Type.function(C.int, [C.int]), 'isprime')
    cb = CBuilder(func)

    n = cb.args[0]
    two = cb.constant(C.int, 2)
    true = cb.constant(C.int, 1)
    zero = cb.constant(C.int, 0)
    false = zero

    with cb.ifelse(n <= two) as small:
        with small.then():
            cb.ret(true)

    with cb.ifelse((n % two) == zero) as even:
        with even.then():
            cb.ret(false)

    divisor = cb.var(C.int, 3, name='idx')
    with cb.loop() as loop:
        with loop.condition() as setcond:
            setcond(divisor < n)
        with loop.body():
            with cb.ifelse((n % divisor) == zero) as divides:
                with divides.then():
                    cb.ret(false)
            # Step to the next odd candidate.
            divisor += two

    cb.ret(true)
    cb.close()
    return func
def gen_is_prime_fast(mod):
    """Add the LLVM function ``isprime_fast(int) -> int`` to *mod* and return it.

    Same checks as ``isprime``, but the trial division stops at
    1 + sqrt(argument), computed with the float sqrt intrinsic and cast
    back to int.
    """
    func = mod.add_function(Type.function(C.int, [C.int]), 'isprime_fast')
    cb = CBuilder(func)

    n = cb.args[0]
    two = cb.constant(C.int, 2)
    one = cb.constant(C.int, 1)
    true = one
    zero = cb.constant(C.int, 0)
    false = zero

    with cb.ifelse(n <= two) as small:
        with small.then():
            cb.ret(true)

    with cb.ifelse((n % two) == zero) as even:
        with even.then():
            cb.ret(false)

    divisor = cb.var(C.int, 3, name='idx')

    sqrt = cb.get_intrinsic(INTR_SQRT, [C.float])
    root = sqrt(n.cast(C.float)).cast(C.int)
    looplimit = one + root

    with cb.loop() as loop:
        with loop.condition() as setcond:
            setcond(divisor < looplimit)
        with loop.body():
            with cb.ifelse((n % divisor) == zero) as divides:
                with divides.then():
                    cb.ret(false)
            # Step to the next odd candidate.
            divisor += two

    cb.ret(true)
    cb.close()
    return func
class TestIsPrime(unittest.TestCase):
    """Compares the generated primality functions with ``is_prime``."""

    def _check_against_reference(self, generate):
        # Build, verify and JIT the function produced by *generate*, then
        # compare it with the Python reference for 2 <= x < 1000.
        mod = Module.new(__name__)
        lf_isprime = generate(mod)
        logging.debug(mod)
        mod.verify()

        exe = CExecutor(mod)
        func = exe.get_ctype_function(lf_isprime, 'bool, int')
        for x in range(2, 1000):
            self.assertEqual(func(x), is_prime(x), "Failed at x = %d" % x)

    def test_isprime(self):
        self._check_against_reference(gen_is_prime)

    def test_isprime_fast(self):
        self._check_against_reference(gen_is_prime_fast)
# Run the tests in this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()

View file

@ -0,0 +1,135 @@
from llvm.core import *
from llvm.passes import *
from llvm.ee import *
from llvm_cbuilder import *
import llvm_cbuilder.shortnames as C
import unittest, logging
def loopbreak(d):
    """Pure-Python reference for the generated ``loopbreak`` function.

    For each of 100 outer steps, the inner loop accumulates ``x + y`` and
    stops early as soon as the running total exceeds 50; *d* is subtracted
    from the total after every inner loop.
    """
    total = 0
    for outer in range(100):
        inner = 0
        while inner < 100:
            total += outer + inner
            if total > 50:
                break
            inner += 1
        total -= d
    return total
def gen_loopbreak(mod):
    """Add the LLVM function ``loopbreak(int) -> int`` to *mod* and return it.

    The generated code mirrors the Python ``loopbreak`` reference: two
    nested 0..99 loops, an early exit from the inner loop once the total
    exceeds 50, and a subtraction of the argument after each inner loop.
    """
    func = mod.add_function(Type.function(C.int, [C.int]), 'loopbreak')
    cb = CBuilder(func)

    d = cb.args[0]
    x = cb.var(C.int)
    y = cb.var(C.int)
    total = cb.var(C.int)

    one = cb.constant(C.int, 1)
    zero = cb.constant(C.int, 0)
    limit = cb.constant(C.int, 100)
    fifty = cb.constant(C.int, 50)

    total.assign(zero)
    x.assign(zero)

    with cb.loop() as outer:
        with outer.condition() as outer_cond:
            outer_cond(x < limit)

        with outer.body():
            y.assign(zero)
            with cb.loop() as inner:
                with inner.condition() as inner_cond:
                    inner_cond(y < limit)

                with inner.body():
                    total += x + y
                    with cb.ifelse(total > fifty) as over:
                        with over.then():
                            inner.break_loop()
                    y += one
            total -= d
            x += one

    cb.ret(total)
    cb.close()
    return func
def loopcontinue(d):
    """Pure-Python reference for the generated ``loopcontinue`` function.

    Accumulates ``x + y`` over a 100 x 100 grid; *d* is added as well on
    every step where the running total has not exceeded 50.
    """
    total = 0
    for outer in range(100):
        for inner in range(100):
            total += outer + inner
            if not (total > 50):
                total += d
    return total
def gen_loopcontinue(mod):
    """Add the LLVM function ``loopcontinue(int) -> int`` to *mod* and return it.

    The generated code mirrors the Python ``loopcontinue`` reference.  The
    inner counter is incremented before the ``continue`` test so that
    skipping the rest of the body still advances the loop.
    """
    func = mod.add_function(Type.function(C.int, [C.int]), 'loopcontinue')
    cb = CBuilder(func)

    d = cb.args[0]
    x = cb.var(C.int)
    y = cb.var(C.int)
    total = cb.var(C.int)

    one = cb.constant(C.int, 1)
    zero = cb.constant(C.int, 0)
    limit = cb.constant(C.int, 100)
    fifty = cb.constant(C.int, 50)

    total.assign(zero)
    x.assign(zero)

    with cb.loop() as outer:
        with outer.condition() as outer_cond:
            outer_cond(x < limit)

        with outer.body():
            y.assign(zero)
            with cb.loop() as inner:
                with inner.condition() as inner_cond:
                    inner_cond(y < limit)

                with inner.body():
                    total += x + y
                    y += one
                    with cb.ifelse(total > fifty) as over:
                        with over.then():
                            inner.continue_loop()
                    total += d
            x += one

    cb.ret(total)
    cb.close()
    return func
class TestLoopControl(unittest.TestCase):
    """Compares generated break/continue loops with their Python references."""

    def _compare(self, generate, reference):
        # Build, verify and JIT the function from *generate*, then compare
        # it with *reference* for every argument in 0..99.
        mod = Module.new(__name__)
        lfunc = generate(mod)
        logging.debug(mod)
        mod.verify()

        exe = CExecutor(mod)
        func = exe.get_ctype_function(lfunc, 'int, int')
        for x in range(100):
            self.assertEqual(func(x), reference(x))

    def test_loopbreak(self):
        self._compare(gen_loopbreak, loopbreak)

    def test_loopcontinue(self):
        self._compare(gen_loopcontinue, loopcontinue)
# Run the tests in this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()

View file

@ -0,0 +1,128 @@
from llvm.core import *
from llvm.passes import *
from llvm.ee import *
from llvm_cbuilder import *
import llvm_cbuilder.shortnames as C
import unittest, logging
def nestedloop1(d):
    """Pure-Python reference for the generated ``nestedloop1`` function.

    Sums ``x * d + int(y / d)`` over x, y in 0..99.  ``int(y / d)``
    truncates the quotient; *d* must be non-zero.
    """
    return sum(x * d + int(y / d)
               for x in range(100)
               for y in range(100))
def gen_nestedloop1(mod):
    """Add the LLVM function ``nestedloop1(int) -> int`` to *mod* and return it.

    The generated code runs two nested 0..99 loops and accumulates
    ``x * d + y / d`` on int operands, where d is the argument.
    """
    func = mod.add_function(Type.function(C.int, [C.int]), 'nestedloop1')
    cb = CBuilder(func)

    d = cb.args[0]
    x = cb.var(C.int)
    y = cb.var(C.int)
    total = cb.var(C.int)

    one = cb.constant(C.int, 1)
    zero = cb.constant(C.int, 0)
    limit = cb.constant(C.int, 100)

    total.assign(zero)
    x.assign(zero)

    with cb.loop() as outer:
        with outer.condition() as outer_cond:
            outer_cond(x < limit)

        with outer.body():
            y.assign(zero)
            with cb.loop() as inner:
                with inner.condition() as inner_cond:
                    inner_cond(y < limit)

                with inner.body():
                    total += x * d + y / d
                    y += one
            x += one

    cb.ret(total)
    cb.close()
    return func
def nestedloop2(d):
    """Pure-Python reference for the generated ``nestedloop2`` function.

    For x, y in 1..99, adds the truncated quotient of the larger value by
    the smaller one, multiplied by *d*.
    """
    total = 0
    for x in range(1, 100):
        for y in range(1, 100):
            ratio = int(x / y) if x > y else int(y / x)
            total += ratio * d
    return total
def gen_nestedloop2(mod):
    """Add the LLVM function ``nestedloop2(int) -> int`` to *mod* and return it.

    The generated code runs two nested 1..99 loops; each step adds
    ``x / y * d`` when x > y and ``y / x * d`` otherwise, where d is the
    argument.
    """
    func = mod.add_function(Type.function(C.int, [C.int]), 'nestedloop2')
    cb = CBuilder(func)

    d = cb.args[0]
    x = cb.var(C.int)
    y = cb.var(C.int)
    total = cb.var(C.int)

    one = cb.constant(C.int, 1)
    zero = cb.constant(C.int, 0)
    limit = cb.constant(C.int, 100)

    total.assign(zero)
    x.assign(one)

    with cb.loop() as outer:
        with outer.condition() as outer_cond:
            outer_cond(x < limit)

        with outer.body():
            y.assign(one)
            with cb.loop() as inner:
                with inner.condition() as inner_cond:
                    inner_cond(y < limit)

                with inner.body():
                    with cb.ifelse(x > y) as branch:
                        with branch.then():
                            total += x / y * d
                        with branch.otherwise():
                            total += y / x * d
                    y += one
            x += one

    cb.ret(total)
    cb.close()
    return func
class TestNestedLoop(unittest.TestCase):
    """Compares the generated nested loops with their Python references."""

    def _compare(self, generate, reference):
        # Build, verify and JIT the function from *generate*, then compare
        # it with *reference* for every argument in 1..99 (0 is skipped:
        # nestedloop1 divides by the argument).
        mod = Module.new(__name__)
        lfunc = generate(mod)
        logging.debug(mod)
        mod.verify()

        exe = CExecutor(mod)
        func = exe.get_ctype_function(lfunc, 'int, int')
        for x in range(1, 100):
            self.assertEqual(func(x), int(reference(x)))

    def test_nestedloop1(self):
        self._compare(gen_nestedloop1, nestedloop1)

    def test_nestedloop2(self):
        self._compare(gen_nestedloop2, nestedloop2)
# Run the tests in this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()

View file

@ -0,0 +1,68 @@
from llvm.core import *
from llvm.passes import *
from llvm.ee import *
from llvm_cbuilder import *
import llvm_cbuilder.shortnames as C
import sys, unittest, logging
from subprocess import Popen, PIPE
def gen_debugprint(mod):
    """Add the LLVM function ``debugprint()`` to *mod* and return it.

    The generated code prints one line through ``cb.printf`` and a second
    one through ``cb.debug``, both showing an int and two double constants.
    """
    func = mod.add_function(Type.function(C.void, []), 'debugprint')
    cb = CBuilder(func)

    fmt = cb.constant_string("Show %d %.3f %.3e\n")
    int_value = cb.constant(C.int, 123)
    float_value = cb.constant(C.double, 1.234)
    double_value = cb.constant(C.double, 1e-31)

    cb.printf(fmt, int_value, float_value, double_value)
    cb.debug('an_int =', int_value,
             'a_float =', float_value,
             'a_double =', double_value)

    cb.ret()
    cb.close()
    return func
def main_debugprint():
    """Generate, verify and execute ``debugprint`` in the current process."""
    # Generate code.
    module = Module.new(__name__)
    llvm_func = gen_debugprint(module)
    logging.debug(module)
    module.verify()

    # Run it; the output goes to this process's stdout.
    executor = CExecutor(module)
    debugprint = executor.get_ctype_function(llvm_func, 'void')
    debugprint()
class TestPrint(unittest.TestCase):
    """Runs ``debugprint`` in a child process and checks what it prints."""

    def test_debugprint(self):
        # The generated code writes to the process's stdout, so it is run in
        # a child interpreter (this file with "-child") whose stdout is piped.
        child = Popen([sys.executable, __file__, "-child"], stdout=PIPE)
        # communicate() drains the pipe while waiting and closes it after;
        # calling wait() before reading can deadlock on a full pipe.
        raw, _ = child.communicate()
        # The pipe yields bytes on Python 3, which have no encode(); decode
        # works on both Python 2 and 3.
        lines = raw.decode('utf-8').splitlines(False)
        self.assertTrue(lines, "child process produced no output")

        # Try to account for variations in the system printf
        # (some print three-digit exponents).
        if 'e-031' in lines[0]:
            expect = [
                'Show 123 1.234 1.000e-031',
                'an_int = 123 a_float = 1.234000e+000 a_double = 1.000000e-031',
            ]
        else:
            expect = [
                'Show 123 1.234 1.000e-31',
                'an_int = 123 a_float = 1.234000e+00 a_double = 1.000000e-31',
            ]
        self.assertEqual(expect, lines)
if __name__ == '__main__':
    # TestPrint.test_debugprint re-runs this file with "-child" to capture
    # the generated code's output; any other invocation runs the tests.
    # The argument is tested explicitly rather than with try/except so that
    # an IndexError raised inside main_debugprint() is not mistaken for a
    # missing argument, and so other arguments reach unittest.
    if len(sys.argv) > 1 and sys.argv[1] == '-child':
        main_debugprint()
    else:
        unittest.main()

View file

@ -0,0 +1,92 @@
from llvm.core import *
from llvm.passes import *
from llvm.ee import *
from llvm_cbuilder import *
import llvm_cbuilder.shortnames as C
import unittest, logging
import sys
# logging.basicConfig(level=logging.DEBUG)
# Number of pthreads the generated 'manager' function creates; it also calls
# the worker once itself, so NUM_OF_THREAD + 1 workers run in total.
NUM_OF_THREAD = 4
def gen_test_worker(mod):
    """Emit the LLVM function ``worker(int*)`` into *mod*.

    The generated code increments the pointed-to integer with a plain
    load / add / store sequence, i.e. without any atomic operation.
    Nothing is returned.
    """
    builder = CBuilder.new_function(mod, 'worker', C.void, [C.pointer(C.int)])
    target = builder.args[0]
    current = target.load()
    step = builder.constant(current.type, 1)
    target.store(current + step)
    builder.ret()
    builder.close()
def gen_test_pthread(mod):
    """Emit the LLVM function ``manager(int) -> int`` into *mod* and return it.

    The generated code emits NUM_OF_THREAD ``pthread_create`` calls that run
    'worker' on the address of the argument, calls 'worker' once directly,
    joins every thread and finally returns the argument's value.
    'worker', 'pthread_create' and 'pthread_join' must already be in *mod*.
    """
    builder = CBuilder.new_function(mod, 'manager', C.int, [C.int])
    shared = builder.args[0]

    worker = builder.get_function_named('worker')
    create = builder.get_function_named('pthread_create')
    join = builder.get_function_named('pthread_join')

    null = builder.constant_null(C.void_p)

    def as_void_p(value):
        # The pthread prototypes are declared with void* everywhere.
        return value.cast(C.void_p)

    handles = builder.array(C.void_p, NUM_OF_THREAD)

    for tid in range(NUM_OF_THREAD):
        create_args = [handles[tid].reference(),
                       null,
                       worker,
                       shared.reference()]
        create(*[as_void_p(item) for item in create_args])

    # The manager thread does one share of the work itself.
    worker(shared.reference())

    for tid in range(NUM_OF_THREAD):
        join(*[as_void_p(item) for item in (handles[tid], null)])

    builder.ret(shared)
    builder.close()
    return builder.function
class TestPThread(unittest.TestCase):
    """Runs the unsynchronized multi-threaded increment built above."""

    @unittest.skipIf(sys.platform == 'win32', "pthreads not supported on Windows")
    def test_pthread(self):
        mod = Module.new(__name__)

        # Declare the pthread entry points; every parameter is a void*.
        create_type = Type.function(C.int, [C.void_p] * 4)
        mod.add_function(create_type, 'pthread_create')
        join_type = Type.function(C.int, [C.void_p] * 2)
        mod.add_function(join_type, 'pthread_join')

        gen_test_worker(mod)
        manager_fn = gen_test_pthread(mod)
        logging.debug(mod)
        mod.verify()

        exe = CExecutor(mod)
        exe.engine.get_pointer_to_function(mod.get_function_named('worker'))
        manager = exe.get_ctype_function(manager_fn, 'int, int')

        inarg = 1234
        gold = inarg + NUM_OF_THREAD + 1

        # The increments are not synchronized, so the exact return value
        # cannot be pinned down; only an upper bound is asserted.
        self.assertLessEqual(manager(inarg), gold)

        # Count the runs that lost an update, for information only.
        count_race = sum(1 for _ in range(2**12) if manager(inarg) != gold)
        if count_race > 0:
            logging.info("Race condition occured %d times.", count_race)
            logging.info("Race condition is expected.")
# Run the tests in this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()

View file

@ -0,0 +1,52 @@
from llvm.core import *
from llvm_cbuilder import *
import llvm_cbuilder.shortnames as C
import unittest, ctypes
class Vector2D(CStruct):
    """CBuilder struct with two C.float members, ``x`` and ``y``."""

    _fields_ = [('x', C.float), ('y', C.float)]
class Vector2DCtype(ctypes.Structure):
    """ctypes counterpart of ``Vector2D``: two ``c_float`` fields, x and y."""

    _fields_ = [(axis, ctypes.c_float) for axis in ('x', 'y')]
def gen_vector2d_dist(mod):
    """Add ``vector2d_dist(Vector2D*) -> float`` to *mod* and return it.

    Despite the name, the generated code returns ``x*x + y*y`` of the
    pointed-to vector; no square root is taken.
    """
    vector_ptr = C.pointer(Vector2D.llvm_type())
    func = mod.add_function(Type.function(C.float, [vector_ptr]), 'vector2d_dist')

    cb = CBuilder(func)
    # Copy the struct the argument points to into a local variable.
    vec = cb.var(Vector2D, cb.args[0].load())
    cb.ret(vec.x * vec.x + vec.y * vec.y)
    cb.close()
    return func
class TestStruct(unittest.TestCase):
    """Calls the generated struct function through ctypes."""

    def test_vector2d_dist(self):
        from random import random

        # Prepare the module.
        mod = Module.new('mod')
        lfunc = gen_vector2d_dist(mod)
        mod.verify()

        # Run.
        exe = CExecutor(mod)
        func = exe.get_ctype_function(lfunc, ctypes.c_float,
                                      ctypes.POINTER(Vector2DCtype))

        for _ in range(100):
            x, y = random(), random()
            vec = Vector2DCtype(x=x, y=y)
            ans = func(ctypes.pointer(vec))
            gold = x * x + y * y
            # Compare by relative error against the Python-side result.
            self.assertLess(abs(ans - gold) / gold, 1e-6)
# Run the tests in this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()

View file

@ -0,0 +1,94 @@
from llvm.core import Module
from llvm_cbuilder import *
from llvm_cbuilder.translator import translate
import llvm_cbuilder.shortnames as C
import unittest, logging
#logging.basicConfig(level=logging.DEBUG)
class FooIf(CDefinition):
    """Definition of ``foo_if(x, y) -> int``: the absolute difference of x and y."""
    _name_ = 'foo_if'
    _retty_ = C.int
    _argtys_ = [('x', C.int),
                ('y', C.int),]

    def body(self, x, y):
        # The nested function is never called here; the @translate decorator
        # consumes it.  NOTE(review): presumably translate converts the
        # Python statements into CBuilder operations on x and y -- confirm
        # in llvm_cbuilder.translator.
        @translate
        def _():
            if x > y:
                return x - y
            else:
                return y - x
class FooWhile(CDefinition):
    """Definition of ``foo_while(x) -> int``, written with a ``while`` loop.

    Starting from y = x, x is counted down to 0 and every decremented value
    is added to y, so the result is x + (x - 1) + ... + 0 for x >= 0.
    """
    _name_ = 'foo_while'
    _retty_ = C.int
    _argtys_ = [('x', C.int)]

    def body(self, x):
        # Accumulator initialized from x through var_copy.
        y = self.var_copy(x)

        # The nested function is never called here; the @translate decorator
        # consumes it.  NOTE(review): presumably translate converts the
        # Python statements into CBuilder operations -- confirm in
        # llvm_cbuilder.translator.
        @translate
        def _():
            while x > 0:
                x -= 1
                y += x
            return y
class FooForRange(CDefinition):
    """Definition of ``foo_for_range(x) -> int``: the sum of 0..x via ``for``/``range``."""
    _name_ = 'foo_for_range'
    _retty_ = C.int
    _argtys_ = [('x', C.int)]

    def body(self, x):
        # Accumulator of the same type as x, starting at 0.
        y = self.var(x.type, 0)

        # The nested function is never called here; the @translate decorator
        # consumes it.  NOTE(review): presumably translate converts the
        # Python statements into CBuilder operations -- confirm in
        # llvm_cbuilder.translator.
        @translate
        def _():
            for i in range(x + 1):
                y += i
            return y
class TestTranslate(unittest.TestCase):
    """Compiles each translated definition and checks its results.

    The signature strings passed to ``get_ctype_function`` list the return
    type first and then one type per argument.  The previous strings left
    the argument types out ('int, int' for the two-argument ``foo_if`` and
    'int' for the one-argument functions), which only worked because ctypes
    lets cdecl functions take more arguments than were declared.
    The module dump goes to ``logging.debug`` rather than ``print`` so that
    test runs stay quiet unless debug logging is enabled.
    """

    def test_if(self):
        mod = Module.new(__name__)
        lfoo = FooIf()(mod)

        logging.debug(mod)
        mod.verify()

        exe = CExecutor(mod)
        # int foo_if(int, int)
        foo = exe.get_ctype_function(lfoo, 'int, int, int')
        self.assertEqual(foo(10, 20), 20 - 10)
        self.assertEqual(foo(23, 17), 23 - 17)

    def test_whileloop(self):
        mod = Module.new(__name__)
        lfoo = FooWhile()(mod)

        logging.debug(mod)
        mod.verify()

        exe = CExecutor(mod)
        # int foo_while(int)
        foo = exe.get_ctype_function(lfoo, 'int, int')
        self.assertEqual(foo(10), sum(range(10+1)))
        self.assertEqual(foo(1324), sum(range(1324+1)))

    def test_forloop(self):
        mod = Module.new(__name__)
        lfoo = FooForRange()(mod)

        logging.debug(mod)
        mod.verify()

        exe = CExecutor(mod)
        # int foo_for_range(int)
        foo = exe.get_ctype_function(lfoo, 'int, int')
        self.assertEqual(foo(10), sum(range(10+1)))
        self.assertEqual(foo(1324), sum(range(1324+1)))
# Run the tests in this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()

View file

@ -0,0 +1,156 @@
from llvm_cbuilder import *
from llvm_cbuilder import shortnames as C
from llvm_cbuilder.translator import translate
from ctypes import *
from llvm.core import *
from llvm.passes import *
import numpy as np
import unittest
import logging
# Vector type of four C.float lanes: the argument and return type of
# VectorArith, and the source of the per-iteration element count
# (floatv4.count) in the driver definitions.
floatv4 = C.vector(C.float, 4)
class VectorArith(CDefinition):
    """Definition of ``vector_arith(a, b, c)``: returns ``a * b + c`` on floatv4 vectors."""
    _name_ = 'vector_arith'
    _argtys_ = [('a', floatv4),
                ('b', floatv4),
                ('c', floatv4),]
    _retty_ = floatv4

    def body(self, a, b, c):
        '''
        Build the function body.

        Arguments
        ---------
        a, b, c -- must be vectors
        '''
        # The nested function is never called here; the @translate decorator
        # consumes it.  NOTE(review): presumably translate converts the
        # Python expression into CBuilder operations -- confirm in
        # llvm_cbuilder.translator.
        @translate
        def _(): # write like python in here
            return a * b + c
class VectorArithDriver1(CDefinition):
    """Definition of ``vector_arith_driver_1(A, B, C, D, n)``.

    Computes D = A * B + C over float arrays, four elements per step.
    """
    _name_ = 'vector_arith_driver_1'
    _argtys_ = [('A', C.pointer(C.float)),
                ('B', C.pointer(C.float)),
                ('C', C.pointer(C.float)),
                ('D', C.pointer(C.float)),
                ('n', C.int),]

    def body(self, Aary, Bary, Cary, Dary, n):
        '''
        Fetch the array elements as whole vectors (vector loads), apply
        ``vector_arith`` and write the result back with a vector store.
        '''
        vecarith = self.depends(VectorArith())
        lanes = self.constant(C.int, floatv4.count)
        with self.for_range(0, n, lanes) as (loop, i):
            # ary[i:] offsets the array at i; align=1 requests an
            # unaligned vector access.
            vec_a = Aary[i:].vector_load(4, align=1)
            vec_b = Bary[i:].vector_load(4, align=1)
            vec_c = Cary[i:].vector_load(4, align=1)
            result = vecarith(vec_a, vec_b, vec_c)
            Dary[i:].vector_store(result, align=1)
        self.ret()
class VectorArithDriver2(CDefinition):
    """Definition of ``vector_arith_driver_2(A, B, C, D, n)``.

    Computes D = A * B + C over float arrays, four elements per step.
    """
    _name_ = 'vector_arith_driver_2'
    _argtys_ = [('A', C.pointer(C.float)),
                ('B', C.pointer(C.float)),
                ('C', C.pointer(C.float)),
                ('D', C.pointer(C.float)),
                ('n', C.int),]

    def body(self, Aary, Bary, Cary, Dary, n):
        '''
        Fill each operand vector one element at a time before applying
        ``vector_arith``.
        This style generates scalar ld/st instead of vector ld/st.
        '''
        vecarith = self.depends(VectorArith())
        vec_a = self.var(floatv4)
        vec_b = self.var(floatv4)
        vec_c = self.var(floatv4)
        lanes = self.constant(C.int, floatv4.count)
        with self.for_range(0, n, lanes) as (outer, i):
            # Gather the four lanes of every operand individually.
            with self.for_range(lanes) as (inner, j):
                vec_a[j] = Aary[i + j]
                vec_b[j] = Bary[i + j]
                vec_c[j] = Cary[i + j]
            result = vecarith(vec_a, vec_b, vec_c)
            Dary[i:].vector_store(result, align=1)
        self.ret()
def aligned_zeros(shape, boundary=16, dtype=float, order='C'):
    """Return a zero-filled array whose data address is a multiple of *boundary*.

    The array is a view into a larger byte buffer: ``boundary`` spare bytes
    are allocated so the view can start at the first suitably aligned
    address.  (Open question from the original author: is there a better
    way to allocate aligned memory?)

    Parameters
    ----------
    shape : int or sequence of ints
        Shape of the returned array; ``()`` yields a 0-d array.
    boundary : int
        Required alignment of the data address, in bytes.
    dtype : data-type
        Element type of the returned array.
    order : {'C', 'F'}
        Memory layout passed to ``reshape``.
    """
    # np.prod(()) is the float 1.0, which np.zeros rejects as a size, so the
    # element count is computed with an integer dtype and made a Python int.
    count = int(np.prod(shape, dtype=np.intp))
    item = np.dtype(dtype)
    nbytes = count * item.itemsize
    raw = np.zeros(nbytes + boundary, dtype=np.uint8)
    address = raw.__array_interface__['data'][0]
    # Bytes to skip to reach the next multiple of boundary (0 if aligned).
    offset = (boundary - address % boundary) % boundary
    viewed = raw[offset:offset + nbytes].view(dtype=item)
    return viewed.reshape(shape, order=order)
class TestVectorArith(unittest.TestCase):
    """Runs both vector drivers on arrays with different alignments."""

    def test_vector_arith_1(self):
        self.run_and_test_udt(VectorArithDriver1(), 16) # aligned for SSE
        self.run_and_test_udt(VectorArithDriver1(), 20) # misaligned for SSE

    def test_vector_arith_2(self):
        self.run_and_test_udt(VectorArithDriver2(), 16) # aligned for SSE
        self.run_and_test_udt(VectorArithDriver2(), 20) # misaligned for SSE

    def run_and_test_udt(self, udt, align):
        """Compile *udt*, run it on arrays aligned to *align* bytes and
        compare the output with the NumPy result of ``A * B + C``."""
        module = Module.new('mod.test.vectoriarith')

        ldriver = udt(module)

        pm = PassManager.new()
        pmb = PassManagerBuilder.new()
        # The optimization level attribute is ``opt_level``; assigning to
        # ``opt`` only created an unrelated attribute and left the level
        # at its default.
        pmb.opt_level = 3
        pmb.vectorize = True
        pmb.populate(pm)
        pm.run(module)

        # Still generates the native assembly, but only shows it when debug
        # logging is enabled.
        logging.debug(module.to_native_assembly())

        exe = CExecutor(module)

        float_p = POINTER(c_float)

        # void driver(float*, float*, float*, float*, int)
        driver = exe.get_ctype_function(ldriver,
                                        None,
                                        float_p, float_p, float_p,
                                        float_p,
                                        c_int)

        # prepare for execution
        n = 4*10

        Aary = aligned_zeros(n, boundary=align, dtype=np.float32)
        Bary = aligned_zeros(n, boundary=align, dtype=np.float32)
        Cary = aligned_zeros(n, boundary=align, dtype=np.float32)
        Dary = aligned_zeros(n, boundary=align, dtype=np.float32)

        Aary[:] = range(n)
        Bary[:] = range(n, 2 * n)
        Cary[:] = range(2 * n, 3 * n)

        golden = Aary * Bary + Cary

        getptr = lambda ary: ary.ctypes.data_as(float_p)

        driver(getptr(Aary), getptr(Bary), getptr(Cary), getptr(Dary), n)

        for x, y in zip(golden, Dary):
            self.assertEqual(x, y)
# Run the tests in this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()