diff --git a/notebook/parallel_vectorize.ipynb b/notebook/parallel_vectorize.ipynb
new file mode 100644
index 0000000..b0bdaff
--- /dev/null
+++ b/notebook/parallel_vectorize.ipynb
@@ -0,0 +1,160 @@
+{
+ "metadata": {
+  "name": "parallel_vectorize"
+ }, 
+ "nbformat": 2, 
+ "worksheets": [
+  {
+   "cells": [
+    {
+     "cell_type": "markdown", 
+     "source": [
+      "Parallel Vectorize", 
+      "------------------", 
+      "", 
+      "The `parallel_vectorize.py` module contains a set of llvmpy code generators", 
+      "for creating multithreaded _ufunc_. ", 
+      "It depends on the new `numpy.fromfunc` for turning arbitrary function pointers into _ufunc_.", 
+      "", 
+      "From LLVM Function", 
+      "------------------", 
+      "", 
+      "The `parallel_vectorize_from_func` function generates a multithreaded _ufunc_ from an LLVM function.", 
+      "", 
+      "First, we will implement a workload function:"
+     ]
+    }, 
+    {
+     "cell_type": "code", 
+     "collapsed": false, 
+     "input": [
+      "# Explicit imports (instead of wildcard imports) so it is clear where", 
+      "# each name used in this notebook comes from.", 
+      "from llvm_cbuilder import CDefinition", 
+      "from llvm_cbuilder import shortnames as C", 
+      "from llvm.core import Module", 
+      "", 
+      "# Implement a workload: square(x) = x * x on doubles", 
+      "class Square(CDefinition):", 
+      "    _name_ = 'square'", 
+      "    _retty_ = C.double             # 1 output: double", 
+      "    _argtys_ = [('x', C.double)]   # 1 input: double", 
+      "", 
+      "    def body(self, x):", 
+      "        self.ret(x * x)", 
+      "", 
+      "m = Module.new('my_module')", 
+      "llvm_square = Square()(m)  # Generate a llvm function", 
+      "print(llvm_square)"
+     ], 
+     "language": "python", 
+     "outputs": [
+      {
+       "output_type": "stream", 
+       "stream": "stdout", 
+       "text": [
+        "", 
+        "define double @square(double %x) {", 
+        "decl:", 
+        "  %x1 = alloca double", 
+        "  br label %body", 
+        "", 
+        "body:                                             ; preds = %decl", 
+        "  store double %x, double* %x1", 
+        "  %0 = load double* %x1", 
+        "  %1 = load double* %x1", 
+        "  %2 = fmul double %0, %1", 
+        "  ret double %2", 
+        "}", 
+        ""
+       ]
+      }
+     ], 
+     "prompt_number": 1
+    }, 
+    {
+     "cell_type": "markdown", 
+     "source": [
+      "Then, we will generate a _ufunc_ from `llvm_square`:"
+     ]
+    }, 
+    {
+     "cell_type": "code", 
+     "collapsed": false, 
+     "input": [
+      "# Import only the names this cell needs rather than everything in llvm.ee.", 
+      "from llvm.ee import EngineBuilder", 
+      "from parallel_vectorize import parallel_vectorize_from_func", 
+      "", 
+      "engine = EngineBuilder.new(m).create()         # Generate JIT engine", 
+      "", 
+      "# Passing the engine makes parallel_vectorize_from_func return a ufunc", 
+      "# callable from Python instead of a bare LLVM function.", 
+      "ufunc_square = parallel_vectorize_from_func(llvm_square, engine)  # Generate UFunc"
+     ], 
+     "language": "python", 
+     "outputs": [], 
+     "prompt_number": 2
+    }, 
+    {
+     "cell_type": "markdown", 
+     "source": [
+      "We are ready to use `ufunc_square` as a regular _ufunc_."
+     ]
+    }, 
+    {
+     "cell_type": "code", 
+     "collapsed": false, 
+     "input": [
+      "import numpy as np", 
+      "", 
+      "# Ten doubles (0.0 .. 9.0), matching the C.double signature of `square`.", 
+      "A = np.arange(10, dtype=np.float64)", 
+      "ufunc_square(A)"
+     ], 
+     "language": "python", 
+     "outputs": [
+      {
+       "output_type": "pyout", 
+       "prompt_number": 3, 
+       "text": [
+        "array([  0.,   1.,   4.,   9.,  16.,  25.,  36.,  49.,  64.,  81.])"
+       ]
+      }
+     ], 
+     "prompt_number": 3
+    }, 
+    {
+     "cell_type": "markdown", 
+     "source": [
+      "Here's another example that uses three inputs:"
+     ]
+    }, 
+    {
+     "cell_type": "code", 
+     "collapsed": false, 
+     "input": [
+      "# Workload with three int inputs and one int output: x + y + z", 
+      "class SumOfThree(CDefinition):", 
+      "    _name_ = 'sum.of.three'", 
+      "    _retty_ = C.int                # 1 output: int", 
+      "    _argtys_ = [('x', C.int),      # 3 inputs: int", 
+      "                ('y', C.int),", 
+      "                ('z', C.int)]", 
+      "", 
+      "    def body(self, x, y, z):", 
+      "        self.ret(x + y + z)", 
+      "", 
+      "llvm_sum3 = SumOfThree()(m)", 
+      "ufunc_sum3 = parallel_vectorize_from_func(llvm_sum3, engine)", 
+      "", 
+      "# Do not name an array `C`: that would rebind the `shortnames as C` alias", 
+      "# and make `C.int` above fail if this cell is run again.", 
+      "ones = np.arange(10, dtype=np.int32)", 
+      "tens = ones * 10", 
+      "hundreds = ones * 100", 
+      "ufunc_sum3(ones, tens, hundreds)"
+     ], 
+     "language": "python", 
+     "outputs": [
+      {
+       "output_type": "pyout", 
+       "prompt_number": 4, 
+       "text": [
+        "array([  0, 111, 222, 333, 444, 555, 666, 777, 888, 999], dtype=int32)"
+       ]
+      }
+     ], 
+     "prompt_number": 4
+    }
+   ]
+  }
+ ]
+}
\ No newline at end of file
diff --git a/parallel_vectorize.py b/parallel_vectorize.py
index 210dea8..a3fc743 100644
--- a/parallel_vectorize.py
+++ b/parallel_vectorize.py
@@ -416,10 +416,14 @@ def parallel_vectorize_from_func(lfunc, engine=None):
     from python. (This needs Jay's numpy.fromfunc).
     Otherwise, return the specialized ufunc as a llvm.core.Function
     '''
+    import multiprocessing
+    NUM_CPU = multiprocessing.cpu_count()
+
     fntype = lfunc.type.pointee
-    def_spuf = SpecializedParallelUFunc(ParallelUFuncPlatform(num_thread=2),
-                                        UFuncCoreGeneric(fntype),
-                                        CFuncRef(lfunc))
+    def_spuf = SpecializedParallelUFunc(
+                                ParallelUFuncPlatform(num_thread=NUM_CPU),
+                                UFuncCoreGeneric(fntype),
+                                CFuncRef(lfunc))
     spuf = def_spuf(lfunc.module)
     if engine is None:
         return spuf
@@ -432,10 +436,10 @@ def parallel_vectorize_from_func(lfunc, engine=None):
 
         # TODO refactor
         typemap = {
-            'i8'     : np.uint8,
-            'i16'    : np.uint16,
-            'i32'    : np.uint32,
-            'i64'    : np.uint64,
+            'i8'     : np.int8,
+            'i16'    : np.int16,
+            'i32'    : np.int32,
+            'i64'    : np.int64,
             'float'  : np.float32,
             'double' : np.float64,
         }