457 lines
16 KiB
Python
457 lines
16 KiB
Python
'''
|
|
This file implements the code-generator for parallel-vectorize.
|
|
|
|
ParallelUFunc is the platform independent base class for generating
|
|
the thread dispatcher. This thread dispatcher launches threads
|
|
that execute the generated function of UFuncCore.
|
|
UFuncCore is subclassed to specialize for the input/output types.
|
|
The actual workload is invoked inside the function generated by UFuncCore.
|
|
UFuncCore also defines a work-stealing mechanism that allows idle threads
|
|
to steal works from other threads.
|
|
'''
|
|
|
|
from llvm.core import *
|
|
from llvm.passes import *
|
|
|
|
from llvm_cbuilder import *
|
|
import llvm_cbuilder.shortnames as C
|
|
|
|
import sys
|
|
|
|
class WorkQueue(CStruct):
|
|
'''structure for workqueue for parallel-ufunc.
|
|
'''
|
|
|
|
_fields_ = [
|
|
('next', C.intp), # next index of work item
|
|
('last', C.intp), # last index of work item (exlusive)
|
|
('lock', C.int), # for locking the workqueue
|
|
]
|
|
|
|
|
|
def Lock(self):
|
|
'''inline the lock procedure.
|
|
'''
|
|
with self.parent.loop() as loop:
|
|
with loop.condition() as setcond:
|
|
unlocked = self.parent.constant(self.lock.type, 0)
|
|
locked = self.parent.constant(self.lock.type, 1)
|
|
|
|
res = self.lock.reference().atomic_cmpxchg(unlocked, locked,
|
|
ordering='acquire')
|
|
setcond( res != unlocked )
|
|
|
|
with loop.body():
|
|
pass
|
|
|
|
def Unlock(self):
|
|
'''inline the unlock procedure.
|
|
'''
|
|
unlocked = self.parent.constant(self.lock.type, 0)
|
|
locked = self.parent.constant(self.lock.type, 1)
|
|
|
|
res = self.lock.reference().atomic_cmpxchg(locked, unlocked,
|
|
ordering='release')
|
|
|
|
with self.parent.ifelse( res != locked ) as ifelse:
|
|
with ifelse.then():
|
|
# This shall kill the program
|
|
self.parent.unreachable()
|
|
|
|
|
|
class ContextCommon(CStruct):
|
|
'''structure for thread-shared context information in parallel-ufunc.
|
|
'''
|
|
_fields_ = [
|
|
# loop ufunc args
|
|
('args', C.pointer(C.char_p)),
|
|
('dimensions', C.pointer(C.intp)),
|
|
('steps', C.pointer(C.intp)),
|
|
('data', C.void_p),
|
|
# specifics for work queues
|
|
('func', C.void_p),
|
|
('num_thread', C.int),
|
|
('workqueues', C.pointer(WorkQueue.llvm_type())),
|
|
]
|
|
|
|
class Context(CStruct):
|
|
'''structure for thread-specific context information in parallel-ufunc.
|
|
'''
|
|
_fields_ = [
|
|
('common', C.pointer(ContextCommon.llvm_type())),
|
|
('id', C.int),
|
|
('completed', C.intp),
|
|
]
|
|
|
|
class ParallelUFunc(CDefinition):
|
|
'''the generic parallel vectorize mechanism
|
|
|
|
Can be specialized to the maximum number of threads on the platform.
|
|
|
|
|
|
Platform dependent threading function is implemented in
|
|
|
|
def _dispatch_worker(self, worker, contexts, num_thread):
|
|
...
|
|
|
|
which should be implemented in subclass or mixin.
|
|
'''
|
|
|
|
_argtys_ = [
|
|
('func', C.void_p),
|
|
('worker', C.void_p),
|
|
('args', C.pointer(C.char_p)),
|
|
('dimensions', C.pointer(C.intp)),
|
|
('steps', C.pointer(C.intp)),
|
|
('data', C.void_p),
|
|
]
|
|
|
|
@classmethod
|
|
def specialize(cls, num_thread):
|
|
'''specialize to the maximum # of thread
|
|
'''
|
|
cls._name_ = 'parallel_ufunc_%d' % num_thread
|
|
cls.ThreadCount = num_thread
|
|
|
|
def body(self, func, worker, args, dimensions, steps, data):
|
|
# Setup variables
|
|
ThreadCount = self.ThreadCount
|
|
common = self.var(ContextCommon, name='common')
|
|
workqueues = self.array(WorkQueue, ThreadCount, name='workqueues')
|
|
contexts = self.array(Context, ThreadCount, name='contexts')
|
|
|
|
num_thread = self.var(C.int, ThreadCount, name='num_thread')
|
|
|
|
# Initialize ContextCommon
|
|
common.args.assign(args)
|
|
common.dimensions.assign(dimensions)
|
|
common.steps.assign(steps)
|
|
common.data.assign(data)
|
|
common.func.assign(func)
|
|
common.num_thread.assign(num_thread.cast(C.int))
|
|
common.workqueues.assign(workqueues.reference())
|
|
|
|
# Determine chunksize, initial count of work-items per thread.
|
|
# If total_work >= num_thread, equally divide the works.
|
|
# If total_work % num_thread != 0, the last thread does all remaining works.
|
|
# If total_work < num_thread, each thread does one work,
|
|
# and set num_thread to total_work
|
|
N = dimensions[0]
|
|
ChunkSize = self.var_copy(N / num_thread.cast(N.type))
|
|
ChunkSize_NULL = self.constant_null(ChunkSize.type)
|
|
with self.ifelse(ChunkSize == ChunkSize_NULL) as ifelse:
|
|
with ifelse.then():
|
|
ChunkSize.assign(self.constant(ChunkSize.type, 1))
|
|
num_thread.assign(N.cast(num_thread.type))
|
|
|
|
# Populate workqueue for all threads
|
|
self._populate_workqueues(workqueues, N, ChunkSize, num_thread)
|
|
|
|
# Populate contexts for all threads
|
|
self._populate_context(contexts, common, num_thread)
|
|
|
|
# Dispatch worker threads
|
|
self._dispatch_worker(worker, contexts, num_thread)
|
|
|
|
## DEBUG ONLY ##
|
|
# Check for race condition
|
|
if True:
|
|
total_completed = self.var(C.intp, 0, name='total_completed')
|
|
for t in range(ThreadCount):
|
|
cur_ctxt = contexts[t].as_struct(Context)
|
|
total_completed += cur_ctxt.completed
|
|
# self.debug(cur_ctxt.id, 'completed', cur_ctxt.completed)
|
|
|
|
with self.ifelse( total_completed == N ) as ifelse:
|
|
with ifelse.then():
|
|
# self.debug("All is well!")
|
|
pass # keep quite if all is well
|
|
with ifelse.otherwise():
|
|
self.debug("ERROR: race occurred! Trigger segfault")
|
|
self.unreachable()
|
|
|
|
# Return
|
|
self.ret()
|
|
|
|
def _populate_workqueues(self, workqueues, N, ChunkSize, num_thread):
|
|
'''loop over all threads and populate the workqueue for each of them.
|
|
'''
|
|
ONE = self.constant(num_thread.type, 1)
|
|
with self.for_range(num_thread) as (loop, i):
|
|
cur_wq = workqueues[i].as_struct(WorkQueue)
|
|
cur_wq.next.assign(i.cast(ChunkSize.type) * ChunkSize)
|
|
cur_wq.last.assign((i + ONE).cast(ChunkSize.type) * ChunkSize)
|
|
cur_wq.lock.assign(self.constant(C.int, 0))
|
|
# end loop
|
|
last_wq = workqueues[num_thread - ONE].as_struct(WorkQueue)
|
|
last_wq.last.assign(N)
|
|
|
|
def _populate_context(self, contexts, common, num_thread):
|
|
'''loop over all threads and populate contexts for each of them.
|
|
'''
|
|
ONE = self.constant(num_thread.type, 1)
|
|
with self.for_range(num_thread) as (loop, i):
|
|
cur_ctxt = contexts[i].as_struct(Context)
|
|
cur_ctxt.common.assign(common.reference())
|
|
cur_ctxt.id.assign(i)
|
|
cur_ctxt.completed.assign(
|
|
self.constant_null(cur_ctxt.completed.type))
|
|
|
|
class ParallelUFuncPosixMixin(object):
|
|
'''ParallelUFunc mixin that implements _dispatch_worker to use pthread.
|
|
'''
|
|
def _dispatch_worker(self, worker, contexts, num_thread):
|
|
api = PThreadAPI(self)
|
|
NULL = self.constant_null(C.void_p)
|
|
|
|
threads = self.array(api.pthread_t, num_thread, name='threads')
|
|
|
|
# self.debug("launch threads")
|
|
# TODO error handling
|
|
|
|
ONE = self.constant(num_thread.type, 1)
|
|
with self.for_range(num_thread) as (loop, i):
|
|
api.pthread_create(threads[i].reference(), NULL, worker,
|
|
contexts[i].reference().cast(C.void_p))
|
|
|
|
with self.for_range(num_thread) as (loop, i):
|
|
api.pthread_join(threads[i], NULL)
|
|
|
|
class UFuncCore(CDefinition):
|
|
'''core work of a ufunc worker thread
|
|
|
|
Subclass to implement UFuncCore._do_work
|
|
|
|
Generates the workqueue handling and work stealing and invoke
|
|
the work function for each work item.
|
|
'''
|
|
_name_ = 'ufunc_worker'
|
|
_argtys_ = [
|
|
('context', C.pointer(Context.llvm_type())),
|
|
]
|
|
|
|
def body(self, context):
|
|
context = context.as_struct(Context)
|
|
common = context.common.as_struct(ContextCommon)
|
|
tid = context.id
|
|
|
|
# self.debug("start thread", tid, "/", common.num_thread)
|
|
workqueue = common.workqueues[tid].as_struct(WorkQueue)
|
|
|
|
self._do_workqueue(common, workqueue, tid, context.completed)
|
|
self._do_work_stealing(common, tid, context.completed) # optional
|
|
|
|
self.ret()
|
|
|
|
def _do_workqueue(self, common, workqueue, tid, completed):
|
|
'''process local workqueue.
|
|
'''
|
|
ZERO = self.constant_null(C.int)
|
|
|
|
with self.forever() as loop:
|
|
workqueue.Lock()
|
|
# Critical section
|
|
item = self.var_copy(workqueue.next, name='item')
|
|
workqueue.next += self.constant(item.type, 1)
|
|
last = self.var_copy(workqueue.last, name='last')
|
|
# Release
|
|
workqueue.Unlock()
|
|
|
|
with self.ifelse( item >= last ) as ifelse:
|
|
with ifelse.then():
|
|
loop.break_loop()
|
|
|
|
self._do_work(common, item, tid)
|
|
completed += self.constant(completed.type, 1)
|
|
|
|
def _do_work_stealing(self, common, tid, completed):
|
|
'''steal work from other workqueues.
|
|
'''
|
|
# self.debug("start work stealing", tid)
|
|
steal_continue = self.var(C.int, 1)
|
|
STEAL_STOP = self.constant_null(steal_continue.type)
|
|
|
|
# Loop until all workqueues are done.
|
|
with self.loop() as loop:
|
|
with loop.condition() as setcond:
|
|
setcond( steal_continue != STEAL_STOP )
|
|
|
|
with loop.body():
|
|
steal_continue.assign(STEAL_STOP)
|
|
self._do_work_stealing_innerloop(common, steal_continue, tid,
|
|
completed)
|
|
|
|
def _do_work_stealing_innerloop(self, common, steal_continue, tid,
|
|
completed):
|
|
'''loop over all other threads and try to steal work.
|
|
'''
|
|
with self.for_range(common.num_thread) as (loop, i):
|
|
with self.ifelse( i != tid ) as ifelse:
|
|
with ifelse.then():
|
|
otherqueue = common.workqueues[i].as_struct(WorkQueue)
|
|
self._do_work_stealing_check(common, otherqueue,
|
|
steal_continue, tid,
|
|
completed)
|
|
|
|
def _do_work_stealing_check(self, common, otherqueue, steal_continue, tid,
|
|
completed):
|
|
'''check the workqueue for any remaining work and steal it.
|
|
'''
|
|
otherqueue.Lock()
|
|
# Acquired
|
|
ONE = self.constant(otherqueue.last.type, 1)
|
|
STEAL_CONTINUE = self.constant(steal_continue.type, 1)
|
|
with self.ifelse(otherqueue.next < otherqueue.last) as ifelse:
|
|
with ifelse.then():
|
|
otherqueue.last -= ONE
|
|
item = self.var_copy(otherqueue.last)
|
|
|
|
otherqueue.Unlock()
|
|
# Released
|
|
|
|
self._do_work(common, item, tid)
|
|
completed += self.constant(completed.type, 1)
|
|
|
|
# Mark incomplete thread
|
|
steal_continue.assign(STEAL_CONTINUE)
|
|
|
|
with ifelse.otherwise():
|
|
otherqueue.Unlock()
|
|
# Released
|
|
|
|
def _do_work(self, common, item, tid):
|
|
'''prepare to call the actual work function
|
|
|
|
Implementation depends on number and type of arguments.
|
|
'''
|
|
raise NotImplementedError
|
|
|
|
class SpecializedParallelUFunc(CDefinition):
|
|
'''a generic ufunc that wraps ParallelUFunc, UFuncCore and the workload
|
|
'''
|
|
_argtys_ = [
|
|
('args', C.pointer(C.char_p)),
|
|
('dimensions', C.pointer(C.intp)),
|
|
('steps', C.pointer(C.intp)),
|
|
('data', C.void_p),
|
|
]
|
|
|
|
def body(self, args, dimensions, steps, data,):
|
|
pufunc = self.depends(self.PUFuncDef)
|
|
core = self.depends(self.CoreDef)
|
|
func = self.depends(self.FuncDef)
|
|
to_void_p = lambda x: x.cast(C.void_p)
|
|
pufunc(to_void_p(func), to_void_p(core), args, dimensions, steps, data)
|
|
self.ret()
|
|
|
|
@classmethod
|
|
def specialize(cls, pufunc_def, core_def, func_def):
|
|
'''specialize to a combination of ParallelUFunc, UFuncCore and workload
|
|
'''
|
|
cls._name_ = 'specialized_%s_%s_%s'% (pufunc_def, core_def, func_def)
|
|
cls.PUFuncDef = pufunc_def
|
|
cls.CoreDef = core_def
|
|
cls.FuncDef = func_def
|
|
|
|
class PThreadAPI(CExternal):
|
|
'''external declaration of pthread API
|
|
'''
|
|
pthread_t = C.void_p
|
|
|
|
pthread_create = Type.function(C.int,
|
|
[C.pointer(pthread_t), # thread_t
|
|
C.void_p, # thread attr
|
|
C.void_p, # function
|
|
C.void_p]) # arg
|
|
|
|
pthread_join = Type.function(C.int, [C.void_p, C.void_p])
|
|
|
|
|
|
class UFuncCoreGeneric(UFuncCore):
|
|
'''A generic ufunc core worker from LLVM function type
|
|
'''
|
|
def _do_work(self, common, item, tid):
|
|
ufunc_type = Type.function(self.RETTY, self.ARGTYS)
|
|
ufunc_ptr = CFunc(self, common.func.cast(C.pointer(ufunc_type)).value)
|
|
|
|
get_offset = lambda B, S, T: B[item * S].reference().cast(C.pointer(T))
|
|
|
|
indata = []
|
|
for i, argty in enumerate(self.ARGTYS):
|
|
ptr = get_offset(common.args[i], common.steps[i], argty)
|
|
indata.append(ptr.load())
|
|
|
|
out_index = len(self.ARGTYS)
|
|
outptr = get_offset(common.args[out_index], common.steps[out_index],
|
|
self.RETTY)
|
|
|
|
res = ufunc_ptr(*indata)
|
|
outptr.store(res)
|
|
|
|
@classmethod
|
|
def specialize(cls, fntype):
|
|
'''specialize to a LLVM function type
|
|
|
|
fntype : a LLVM function type (llvm.core.FunctionType)
|
|
'''
|
|
cls._name_ = '.'.join([cls._name_] +
|
|
map(str, [fntype.return_type] + fntype.args))
|
|
|
|
cls.RETTY = fntype.return_type
|
|
cls.ARGTYS = tuple(fntype.args)
|
|
|
|
|
|
if sys.platform not in ['win32']:
|
|
class ParallelUFuncPlatform(ParallelUFunc, ParallelUFuncPosixMixin):
|
|
pass
|
|
else:
|
|
raise NotImplementedError("Threading for %s" % sys.platform)
|
|
|
|
|
|
|
|
def parallel_vectorize_from_func(lfunc, engine=None):
|
|
'''create ufunc from a llvm.core.Function
|
|
|
|
If engine is given, return a function object which can be called
|
|
from python. (This needs Jay's numpy.fromfunc).
|
|
Otherwise, return the specialized ufunc as a llvm.core.Function
|
|
'''
|
|
fntype = lfunc.type.pointee
|
|
def_spuf = SpecializedParallelUFunc(ParallelUFuncPlatform(num_thread=2),
|
|
UFuncCoreGeneric(fntype),
|
|
CFuncRef(lfunc))
|
|
spuf = def_spuf(lfunc.module)
|
|
if engine is None:
|
|
return spuf
|
|
else:
|
|
import numpy as np
|
|
|
|
fptr = engine.get_pointer_to_function(spuf)
|
|
inct = len(fntype.args)
|
|
outct = 1
|
|
|
|
# TODO refactor
|
|
typemap = {
|
|
'i8' : np.uint8,
|
|
'i16' : np.uint16,
|
|
'i32' : np.uint32,
|
|
'i64' : np.uint64,
|
|
'float' : np.float32,
|
|
'double' : np.float64,
|
|
}
|
|
try:
|
|
ptr_t = long
|
|
except:
|
|
ptr_t = int
|
|
assert False, "Having check this yet"
|
|
|
|
get_typenum = lambda T:np.dtype(typemap[str(T)]).num
|
|
assert fntype.return_type != C.void
|
|
tys = list(map(get_typenum, list(fntype.args) + [fntype.return_type]))
|
|
# Becareful that fromfunc does not provide full error checking yet.
|
|
# If typenum is out-of-bound, we have nasty memory corruptions.
|
|
# For instance, -1 for typenum will cause segfault.
|
|
# If elements of type-list (2nd arg) is tuple instead,
|
|
# there will also memory corruption. (Seems like code rewrite.)
|
|
return np.fromfunc([ptr_t(fptr)], [tys], inct, outct, [None])
|
|
|