From d0f0bae1c5093d0e307b45c974993308728bcea7 Mon Sep 17 00:00:00 2001 From: "Travis E. Oliphant" Date: Wed, 8 Aug 2012 17:49:24 -0500 Subject: [PATCH] Add Sphinx documentation. --- docs/Makefile | 157 ++ docs/gh-pages.py | 138 ++ docs/source/conf.py | 242 +++ docs/source/doc/comparision.md | 119 ++ docs/source/doc/comparision.rst | 147 ++ docs/source/doc/examples.md | 154 ++ docs/source/doc/examples/JITTutorial1.md | 38 + docs/source/doc/examples/JITTutorial2.md | 57 + docs/source/doc/functions.md | 198 ++ .../doc/kaleidoscope/PythonLangImpl1.md | 330 +++ .../doc/kaleidoscope/PythonLangImpl2.md | 998 +++++++++ .../doc/kaleidoscope/PythonLangImpl3.md | 1062 ++++++++++ .../doc/kaleidoscope/PythonLangImpl4.md | 941 +++++++++ .../doc/kaleidoscope/PythonLangImpl5.md | 1464 ++++++++++++++ .../doc/kaleidoscope/PythonLangImpl6.md | 1535 ++++++++++++++ .../doc/kaleidoscope/PythonLangImpl7.md | 1794 +++++++++++++++++ .../doc/kaleidoscope/PythonLangImpl8.md | 275 +++ docs/source/doc/llvm-py_package.md | 92 + docs/source/doc/llvm.core.Argument.md | 57 + docs/source/doc/llvm.core.ArrayType.md | 26 + docs/source/doc/llvm.core.BasicBlock.md | 40 + docs/source/doc/llvm.core.Builder.md | 326 +++ docs/source/doc/llvm.core.Constant.md | 296 +++ docs/source/doc/llvm.core.Function.md | 129 ++ docs/source/doc/llvm.core.FunctionType.md | 53 + docs/source/doc/llvm.core.GlobalValue.md | 99 + docs/source/doc/llvm.core.GlobalVariable.md | 94 + docs/source/doc/llvm.core.Instruction.md | 215 ++ docs/source/doc/llvm.core.IntegerType.md | 20 + docs/source/doc/llvm.core.Module.md | 219 ++ docs/source/doc/llvm.core.PointerType.md | 26 + docs/source/doc/llvm.core.StructType.md | 70 + docs/source/doc/llvm.core.Type.md | 148 ++ docs/source/doc/llvm.core.User.md | 34 + docs/source/doc/llvm.core.Value.md | 50 + docs/source/doc/llvm.core.VectorType.md | 26 + docs/source/doc/llvm.ee.EngineBuilder.md | 49 + docs/source/doc/llvm.ee.ExecutionEngine.md | 52 + docs/source/doc/llvm.ee.GenericValue.md | 55 + 
docs/source/doc/llvm.ee.TargetData.md | 48 + .../doc/llvm.passes.FunctionPassManager.md | 35 + docs/source/doc/llvm.passes.PassManager.md | 23 + .../doc/llvm.passes.PassManagerBuilder.md | 59 + docs/source/doc/llvm_concepts.md | 242 +++ docs/source/doc/types.md | 146 ++ docs/source/doc/userguide.md | 159 ++ docs/source/doc/values.md | 88 + docs/source/index.rst | 22 + 48 files changed, 12647 insertions(+) create mode 100644 docs/Makefile create mode 100755 docs/gh-pages.py create mode 100644 docs/source/conf.py create mode 100644 docs/source/doc/comparision.md create mode 100644 docs/source/doc/comparision.rst create mode 100644 docs/source/doc/examples.md create mode 100644 docs/source/doc/examples/JITTutorial1.md create mode 100644 docs/source/doc/examples/JITTutorial2.md create mode 100644 docs/source/doc/functions.md create mode 100644 docs/source/doc/kaleidoscope/PythonLangImpl1.md create mode 100644 docs/source/doc/kaleidoscope/PythonLangImpl2.md create mode 100644 docs/source/doc/kaleidoscope/PythonLangImpl3.md create mode 100644 docs/source/doc/kaleidoscope/PythonLangImpl4.md create mode 100644 docs/source/doc/kaleidoscope/PythonLangImpl5.md create mode 100644 docs/source/doc/kaleidoscope/PythonLangImpl6.md create mode 100644 docs/source/doc/kaleidoscope/PythonLangImpl7.md create mode 100644 docs/source/doc/kaleidoscope/PythonLangImpl8.md create mode 100644 docs/source/doc/llvm-py_package.md create mode 100644 docs/source/doc/llvm.core.Argument.md create mode 100644 docs/source/doc/llvm.core.ArrayType.md create mode 100644 docs/source/doc/llvm.core.BasicBlock.md create mode 100644 docs/source/doc/llvm.core.Builder.md create mode 100644 docs/source/doc/llvm.core.Constant.md create mode 100644 docs/source/doc/llvm.core.Function.md create mode 100644 docs/source/doc/llvm.core.FunctionType.md create mode 100644 docs/source/doc/llvm.core.GlobalValue.md create mode 100644 docs/source/doc/llvm.core.GlobalVariable.md create mode 100644 
docs/source/doc/llvm.core.Instruction.md create mode 100644 docs/source/doc/llvm.core.IntegerType.md create mode 100644 docs/source/doc/llvm.core.Module.md create mode 100644 docs/source/doc/llvm.core.PointerType.md create mode 100644 docs/source/doc/llvm.core.StructType.md create mode 100644 docs/source/doc/llvm.core.Type.md create mode 100644 docs/source/doc/llvm.core.User.md create mode 100644 docs/source/doc/llvm.core.Value.md create mode 100644 docs/source/doc/llvm.core.VectorType.md create mode 100644 docs/source/doc/llvm.ee.EngineBuilder.md create mode 100644 docs/source/doc/llvm.ee.ExecutionEngine.md create mode 100644 docs/source/doc/llvm.ee.GenericValue.md create mode 100644 docs/source/doc/llvm.ee.TargetData.md create mode 100644 docs/source/doc/llvm.passes.FunctionPassManager.md create mode 100644 docs/source/doc/llvm.passes.PassManager.md create mode 100644 docs/source/doc/llvm.passes.PassManagerBuilder.md create mode 100644 docs/source/doc/llvm_concepts.md create mode 100644 docs/source/doc/types.md create mode 100644 docs/source/doc/userguide.md create mode 100644 docs/source/doc/values.md create mode 100644 docs/source/index.rst diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..8e563cc --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,157 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build +SRCDIR = source + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) $(SRCDIR) +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
+ +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext + +help: + @echo "Please use \`make <target>' where <target> is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + -rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." 
+ +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/llvmpy.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/llvmpy.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/llvmpy" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/llvmpy" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 
+ +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +gh-pages: clean html + python gh-pages.py diff --git a/docs/gh-pages.py b/docs/gh-pages.py new file mode 100755 index 0000000..d9c8614 --- /dev/null +++ b/docs/gh-pages.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python +"""Script to commit the doc build outputs into the github-pages repo. + +Use: + + gh-pages.py [tag] + +If no tag is given, the current output of 'git describe' is used. If given, +that is how the resulting directory will be named. 
+ +In practice, you should use either actual clean tags from a current build or +something like 'current' as a stable URL for the most current version of the """ + +#----------------------------------------------------------------------------- +# Imports +#----------------------------------------------------------------------------- +import os +import re +import shutil +import sys +from os import chdir as cd +from os.path import join as pjoin + +from subprocess import Popen, PIPE, CalledProcessError, check_call + +#----------------------------------------------------------------------------- +# Globals +#----------------------------------------------------------------------------- + +pages_dir = 'gh-pages' +html_dir = '_build/html' +pdf_dir = '_build/latex' +pages_repo = 'git@github.com:llvmpy/llvmpy-doc.git' + +#----------------------------------------------------------------------------- +# Functions +#----------------------------------------------------------------------------- +def sh(cmd): + """Execute command in a subshell, return status code.""" + return check_call(cmd, shell=True) + + +def sh2(cmd): + """Execute command in a subshell, return stdout. 
+ + Stderr is unbuffered from the subshell.x""" + p = Popen(cmd, stdout=PIPE, shell=True) + out = p.communicate()[0] + retcode = p.returncode + if retcode: + raise CalledProcessError(retcode, cmd) + else: + return out.rstrip() + + +def sh3(cmd): + """Execute command in a subshell, return stdout, stderr + + If anything appears in stderr, print it out to sys.stderr""" + p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True) + out, err = p.communicate() + retcode = p.returncode + if retcode: + raise CalledProcessError(retcode, cmd) + else: + return out.rstrip(), err.rstrip() + + +def init_repo(path): + """clone the gh-pages repo if we haven't already.""" + sh("git clone %s %s"%(pages_repo, path)) + here = os.getcwdu() + cd(path) + sh('git checkout gh-pages') + cd(here) + +#----------------------------------------------------------------------------- +# Script starts +#----------------------------------------------------------------------------- +if __name__ == '__main__': + # The tag can be given as a positional argument + try: + tag = sys.argv[1] + except IndexError: + try: + tag = sh2('git describe --exact-match') + except CalledProcessError: + tag = "dev" # Fallback + + startdir = os.getcwdu() + if not os.path.exists(pages_dir): + # init the repo + init_repo(pages_dir) + else: + # ensure up-to-date before operating + cd(pages_dir) + sh('git checkout gh-pages') + sh('git pull') + cd(startdir) + + dest = pjoin(pages_dir, tag) + + # don't `make html` here, because gh-pages already depends on html in Makefile + # sh('make html') + if tag != 'dev': + # only build pdf for non-dev targets + #sh2('make pdf') + pass + + # This is pretty unforgiving: we unconditionally nuke the destination + # directory, and then copy the html tree in there + shutil.rmtree(dest, ignore_errors=True) + shutil.copytree(html_dir, dest) + if tag != 'dev': + #shutil.copy(pjoin(pdf_dir, 'ipython.pdf'), pjoin(dest, 'ipython.pdf')) + pass + + try: + cd(pages_dir) + status = sh2('git status | head -1') 
+ branch = re.match('\# On branch (.*)$', status).group(1) + if branch != 'gh-pages': + e = 'On %r, git branch is %r, MUST be "gh-pages"' % (pages_dir, + branch) + raise RuntimeError(e) + + sh('git add -A %s' % tag) + sh('git commit -m"Updated doc release: %s"' % tag) + print + print 'Most recent 3 commits:' + sys.stdout.flush() + sh('git --no-pager log --oneline HEAD~3..') + finally: + cd(startdir) + + print + print 'Now verify the build in: %r' % dest + print "If everything looks good, 'git push'" diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..b7a7342 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,242 @@ +# -*- coding: utf-8 -*- +# +# llvmpy documentation build configuration file, created by +# sphinx-quickstart on Wed Aug 8 17:33:58 2012. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys, os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ----------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = ['sphinx.ext.mathjax'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. 
+#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'llvmpy' +copyright = u'2012, Mahadevan R (2008-2010), Continuum Analytics (2012)' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0.8' +# The full version, including alpha/beta/rc tags. +release = '0.8.2' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. 
+html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# "<project> v<release> documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. 
+#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a <link> tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'llvmpydoc' + + +# -- Options for LaTeX output -------------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', 'llvmpy.tex', u'llvmpy Documentation', + u'Mahadevan R (2008-2010), Continuum Analytics (2012)', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output -------------------------------------------- + +# One entry per manual page. 
List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'llvmpy', u'llvmpy Documentation', + [u'Mahadevan R (2008-2010), Continuum Analytics (2012)'], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------------ + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'llvmpy', u'llvmpy Documentation', + u'Mahadevan R (2008-2010), Continuum Analytics (2012)', 'llvmpy', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' diff --git a/docs/source/doc/comparision.md b/docs/source/doc/comparision.md new file mode 100644 index 0000000..43073c7 --- /dev/null +++ b/docs/source/doc/comparision.md @@ -0,0 +1,119 @@ +--- +layout: page +title: Comparison Operations +--- + +# Integer Comparison # {#icmp} + +Predicates for use with `icmp` instruction are listed below. All +of these are integer constants defined in the `llvm.core` module. + +##`ICMP_EQ` + +Equality + +##`ICMP_NE` + +Inequality + +##`ICMP_UGT` + +Unsigned greater than + +##`ICMP_UGE` + +Unsigned greater than or equal + +##`ICMP_ULT` + +Unsigned less than + +##`ICMP_ULE` + +Unsigned less than or equal + +##`ICMP_SGT` + +Signed greater than + +##`ICMP_SGE` + +Signed greater than or equal + +##`ICMP_SLT` + +Signed less than + +##`ICMP_SLE` + +Signed less than or equal + +# Float Comparison # {#fcmp} + +Predicates for use with `fcmp` instruction are listed below. All +of these are integer constants defined in the `llvm.core` module. 
+ +##`FCMP_FALSE` + +Always false + +##`FCMP_OEQ` + +True if ordered and equal + +##`FCMP_OGT` + +True if ordered and greater than + +##`FCMP_OGE` + +True if ordered and greater than or equal + +##`FCMP_OLT` + +True if ordered and less than + +##`FCMP_OLE` + +True if ordered and less than or equal + +##`FCMP_ONE` + +True if ordered and operands are unequal + +##`FCMP_ORD` + +True if ordered (no NaNs) + +##`FCMP_UNO` + +True if unordered: `isnan(X) | isnan(Y)` + +##`FCMP_UEQ` + +True if unordered or equal + +##`FCMP_UGT` + +True if unordered or greater than + +##`FCMP_UGE` + +True if unordered, greater than or equal + +##`FCMP_ULT` + +True if unordered, or less than + +##`FCMP_ULE` + +True if unordered, less than or equal + +##`FCMP_UNE` + +True if unordered or not equal + +##`FCMP_TRUE` + +Always true + diff --git a/docs/source/doc/comparision.rst b/docs/source/doc/comparision.rst new file mode 100644 index 0000000..2f1b87b --- /dev/null +++ b/docs/source/doc/comparision.rst @@ -0,0 +1,147 @@ ++--------------------------------+ +| layout: page | ++--------------------------------+ +| title: Comparison Operations | ++--------------------------------+ + +Integer Comparison # {#icmp} +============================= + +Predicates for use with ``icmp`` instruction are listed below. All of +these are integer constants defined in the ``llvm.core`` module. 
+ +``ICMP_EQ`` +----------- + +Equality + +``ICMP_NE`` +----------- + +Inequality + +``ICMP_UGT`` +------------ + +Unsigned greater than + +``ICMP_UGE`` +------------ + +Unsigned greater than or equal + +``ICMP_ULT`` +------------ + +Unsigned less than + +``ICMP_ULE`` +------------ + +Unsigned less than or equal + +``ICMP_SGT`` +------------ + +Signed greater than + +``ICMP_SGE`` +------------ + +Signed greater than or equal + +``ICMP_SLT`` +------------ + +Signed less than + +``ICMP_SLE`` +------------ + +Signed less than or equal + +Float Comparison # {#fcmp} +=========================== + +Predicates for use with ``fcmp`` instruction are listed below. All of +these are integer constants defined in the ``llvm.core`` module. + +``FCMP_FALSE`` +-------------- + +Always false + +``FCMP_OEQ`` +------------ + +True if ordered and equal + +``FCMP_OGT`` +------------ + +True if ordered and greater than + +``FCMP_OGE`` +------------ + +True if ordered and greater than or equal + +``FCMP_OLT`` +------------ + +True if ordered and less than + +``FCMP_OLE`` +------------ + +True if ordered and less than or equal + +``FCMP_ONE`` +------------ + +True if ordered and operands are unequal + +``FCMP_ORD`` +------------ + +True if ordered (no NaNs) + +``FCMP_UNO`` +------------ + +True if unordered: ``isnan(X) | isnan(Y)`` + +``FCMP_UEQ`` +------------ + +True if unordered or equal + +``FCMP_UGT`` +------------ + +True if unordered or greater than + +``FCMP_UGE`` +------------ + +True if unordered, greater than or equal + +``FCMP_ULT`` +------------ + +True if unordered, or less than + +``FCMP_ULE`` +------------ + +True if unordered, less than or equal + +``FCMP_UNE`` +------------ + +True if unordered or not equal + +``FCMP_TRUE`` +------------- + +Always true diff --git a/docs/source/doc/examples.md b/docs/source/doc/examples.md new file mode 100644 index 0000000..337f09a --- /dev/null +++ b/docs/source/doc/examples.md @@ -0,0 +1,154 @@ +--- +layout: page +title: Examples and 
LLVM Tutorials +--- + +* This will become a table of contents (this text will be scraped). +{:toc} + +# Examples + +## A Simple Function + +Let's create an (LLVM) module containing a single function, corresponding +to the `C` function: + +{% highlight c %} +int sum(int a, int b) +{ + return a + b; +} +{% endhighlight %} + +Here's what it looks like: + +{% highlight python %} +#!/usr/bin/env python + +# Import the llvm-py modules. +from llvm import * +from llvm.core import * + +# Create an (empty) module. +my_module = Module.new('my_module') + +# All the types involved here are "int"s. This type is represented +# by an object of the llvm.core.Type class: +ty_int = Type.int() # by default 32 bits + +# We need to represent the class of functions that accept two integers +# and return an integer. This is represented by an object of the +# function type (llvm.core.FunctionType): +ty_func = Type.function(ty_int, [ty_int, ty_int]) + +# Now we need a function named 'sum' of this type. Functions are not +# free-standing (in llvm-py); they need to be contained in a module. +f_sum = my_module.add_function(ty_func, "sum") + +# Let's name the function arguments as 'a' and 'b'. +f_sum.args[0].name = "a" +f_sum.args[1].name = "b" + +# Our function needs a "basic block" -- a set of instructions that +# end with a terminator (like return, branch etc.). By convention +# the first block is called "entry". +bb = f_sum.append_basic_block("entry") + +# Let's add instructions into the block. For this, we need an +# instruction builder: +builder = Builder.new(bb) + +# OK, now for the instructions themselves. We'll create an add +# instruction that returns the sum as a value, which we'll use +# a ret instruction to return. +tmp = builder.add(f_sum.args[0], f_sum.args[1], "tmp") +builder.ret(tmp) + +# We've completed the definition now! 
Let's see the LLVM assembly +# language representation of what we've created: +print my_module +{% endhighlight %} + +Here is the output: + +{% highlight llvm %} +; ModuleID = 'my_module' + +define i32 @sum(i32 %a, i32 %b) { +entry: + %tmp = add i32 %a, %b ; [#uses=1] + ret i32 %tmp +} +{% endhighlight %} + + +## Adding JIT Compilation + +Let's compile this function in-memory and run it. + +{% highlight python %} +#!/usr/bin/env python + +# Import the llvm-py modules. +from llvm import * +from llvm.core import * +from llvm.ee import * # new import: ee = Execution Engine + +# Create a module, as in the previous example. +my_module = Module.new('my_module') +ty_int = Type.int() # by default 32 bits +ty_func = Type.function(ty_int, [ty_int, ty_int]) +f_sum = my_module.add_function(ty_func, "sum") +f_sum.args[0].name = "a" +f_sum.args[1].name = "b" +bb = f_sum.append_basic_block("entry") +builder = Builder.new(bb) +tmp = builder.add(f_sum.args[0], f_sum.args[1], "tmp") +builder.ret(tmp) + +# Create an execution engine object. This will create a JIT compiler +# on platforms that support it, or an interpreter otherwise. +ee = ExecutionEngine.new(my_module) + +# The arguments need to be passed as "GenericValue" objects. +arg1 = GenericValue.int(ty_int, 100) +arg2 = GenericValue.int(ty_int, 42) + +# Now let's compile and run! +retval = ee.run_function(f_sum, [arg1, arg2]) + +# The return value is also a GenericValue. Let's print it. +print "returned", retval.as_int() +{% endhighlight %} + +And here's the output: + + returned 142 + +* * * + +# LLVM Tutorials + +## Simple JIT Tutorials + +The following JIT tutorials were contributed by Sebastien Binet. + +1. [A First Function](examples/JITTutorial1.html) +2. [A More Complicated Function](examples/JITTutorial2.html) + +## Kaleidoscope ## {#kaleidoscope} + +Implementing a Language with LLVM + +The LLVM [Kaleidoscope](http://www.llvm.org/docs/tutorial/) tutorial +has been ported to llvm-py by Max Shawabkeh. + +1. 
[Tutorial Introduction and the Lexer](kaleidoscope/PythonLangImpl1.html) +2. [Implementing a Parser and AST](kaleidoscope/PythonLangImpl2.html) +3. [Implementing Code Generation to LLVM IR](kaleidoscope/PythonLangImpl3.html) +4. [Adding JIT and Optimizer Support](kaleidoscope/PythonLangImpl4.html) +5. [Extending the language: control flow](kaleidoscope/PythonLangImpl5.html) +6. [Extending the language: user-defined operators](kaleidoscope/PythonLangImpl6.html) +7. [Extending the language: mutable variables / SSA construction](kaleidoscope/PythonLangImpl7.html) +8. [Conclusion and other useful LLVM tidbits](kaleidoscope/PythonLangImpl8.html) + diff --git a/docs/source/doc/examples/JITTutorial1.md b/docs/source/doc/examples/JITTutorial1.md new file mode 100644 index 0000000..32186de --- /dev/null +++ b/docs/source/doc/examples/JITTutorial1.md @@ -0,0 +1,38 @@ +--- +layout: page +title: JIT Tutorial 1 +--- + +{% highlight python %} +#!/usr/bin/env python + +from llvm.core import * + +# create a module +module = Module.new ("tut1") + +# create a function type taking 3 32-bit integers, return a 32-bit integer +ty_int = Type.int (32) +func_type = Type.function (ty_int, (ty_int,)*3) + +# create a function of that type +mul_add = Function.new (module, func_type, "mul_add") +mul_add.calling_convention = CC_C +x = mul_add.args[0]; x.name = "x" +y = mul_add.args[1]; y.name = "y" +z = mul_add.args[2]; z.name = "z" + +# implement the function + +# new block +blk = mul_add.append_basic_block ("entry") + +# IR builder +bldr = Builder.new (blk) +tmp_1 = bldr.mul (x, y, "tmp_1") +tmp_2 = bldr.add (tmp_1, z, "tmp_2") + +bldr.ret (tmp_2) + +print(module) +{% endhighlight %} diff --git a/docs/source/doc/examples/JITTutorial2.md b/docs/source/doc/examples/JITTutorial2.md new file mode 100644 index 0000000..7ce36d7 --- /dev/null +++ b/docs/source/doc/examples/JITTutorial2.md @@ -0,0 +1,57 @@ +--- +layout: page +title: JIT Tutorial 2 +--- + +{% highlight python %} +#!/usr/bin/env python 
+ +from llvm.core import * + +# create a module +module = Module.new ("tut2") + +# create a function type taking 2 integers, return a 32-bit integer +ty_int = Type.int (32) +func_type = Type.function (ty_int, (ty_int, ty_int)) + +# create a function of that type +gcd = Function.new (module, func_type, "gcd") + +# name function args +x = gcd.args[0]; x.name = "x" +y = gcd.args[1]; y.name = "y" + +# implement the function + +# blocks... +entry = gcd.append_basic_block ("entry") +ret = gcd.append_basic_block ("return") +cond_false = gcd.append_basic_block ("cond_false") +cond_true = gcd.append_basic_block ("cond_true") +cond_false_2 = gcd.append_basic_block ("cond_false_2") + +# create a llvm::IRBuilder +bldr = Builder.new (entry) +x_eq_y = bldr.icmp (IPRED_EQ, x, y, "tmp") +bldr.cbranch (x_eq_y, ret, cond_false) + +bldr.position_at_end (ret) +bldr.ret(x) + +bldr.position_at_end (cond_false) +x_lt_y = bldr.icmp (IPRED_ULT, x, y, "tmp") +bldr.cbranch (x_lt_y, cond_true, cond_false_2) + +bldr.position_at_end (cond_true) +y_sub_x = bldr.sub (y, x, "tmp") +recur_1 = bldr.call (gcd, (x, y_sub_x,), "tmp") +bldr.ret (recur_1) + +bldr.position_at_end (cond_false_2) +x_sub_y = bldr.sub (x, y, "x_sub_y") +recur_2 = bldr.call (gcd, (x_sub_y, y,), "tmp") +bldr.ret (recur_2) + +print(module) +{% endhighlight %} diff --git a/docs/source/doc/functions.md b/docs/source/doc/functions.md new file mode 100644 index 0000000..0f88262 --- /dev/null +++ b/docs/source/doc/functions.md @@ -0,0 +1,198 @@ +--- +layout: page +title: Functions +--- + +Functions are represented by [llvm.core.Function][] objects. +They are contained within modules, and can be created either with the method +`module_obj.add_function` or the static constructor `Function.new`. +References to functions already present in a module can be retrieved via +`module.get_function_named` or by the static constructor method +`Function.get`. All functions in a module can be enumerated by iterating +over `module_obj.functions`. 
+ +{% highlight python %} +# create a type, representing functions that take an integer and return +# a floating point value. +ft = Type.function( Type.float(), [ Type.int() ] ) + +# create a function of this type +f1 = module_obj.add_function(ft, "func1") + +# or equivalently, like this: +f2 = Function.new(module_obj, ft, "func2") + +# get a reference to an existing function +f3 = module_obj.get_function_named("func3") + +# or like this: +f4 = Function.get(module_obj, "func4") + +# list all function names in a module +for f in module_obj.functions: + print f.name +{% endhighlight %} + + +# Intrinsic + +References to intrinsic functions can be got via the static constructor +`intrinsic`. This returns a `Function` object, calling which is +equivalent to invoking the intrinsic. The `intrinsic` method has to be +called with a module object, an intrinsic ID (which is a numeric +constant) and a list of the types of arguments (which LLVM uses to +resolve overloaded intrinsic functions). + +{% highlight python %} +# get a reference to the llvm.bswap intrinsic +bswap = Function.intrinsic(mod, INTR_BSWAP, [Type.int()]) + +# call it +builder.call(bswap, [value]) +{% endhighlight %} + +Here, the constant `INTR_BSWAP`, available from `llvm.core`, represents the +LLVM intrinsic [llvm.bswap](http://www.llvm.org/docs/LangRef.html#int_bswap). +The `[Type.int()]` selects the version of `llvm.bswap` that has a single 32-bit +integer argument. The intrinsic IDs are defined as integer constants +in `llvm.core`. These are: + +{% include intrinsics.csv %} + +There are also target-specific intrinsics (which correspond to that +target's CPU instructions) available, but are omitted here for brevity. +Full list can be seen from +[_intrinsic_ids.py](https://github.com/numba/llvm-py/blob/master/llvm/_intrinsic_ids.py). 
+See the [LLVM Language Reference](http://www.llvm.org/docs/LangRef.html) +for more information on the intrinsics, and the +[test](https://github.com/numba/llvm-py/blob/master/test/intrinsic.py) +directory in the source distribution for more examples. The intrinsic ID +can be retrieved from a function object with the read-only property +`intrinsic_id`. + +> **Auto-generation of Intrinsic IDs** +> +> +> A script (tool/intrgen.py in source tree) generates the intrinsic IDs +> automatically. This is necessary when compiling llvm-py with a different +> version of LLVM. + +# Calling Convention # {#callconv} +The function's calling convention can be set using the +`calling_convention` property. The following (integer) constants defined +in `llvm.core` can be used as values: + +Value | Equivalent LLVM Assembly Keyword | +------|----------------------------------| +`CC_C` | `ccc` | +`CC_FASTCALL` | `fastcc` | +`CC_COLDCALL` | `coldcc` | +`CC_X86_STDCALL` | `x86_stdcallcc` | +`CC_X86_FASTCALL` | `x86_fastcallcc` | + +See the [LLVM docs](http://www.llvm.org/docs/LangRef.html#callingconv) for +more information on each. Backend-specific numbered conventions can be +directly passed as integers. + +An arbitrary string identifying which garbage collector to use can be +set or got with the property `collector`. + +The value objects corresponding to the arguments of a function can be +got using the read-only property `args`. These can be iterated over, and +also be indexed via integers. An example: + +{% highlight python %} +# list all argument names and types +for arg in fn.args: + print arg.name, "of type", arg.type + +# change the name of the first argument +fn.args[0].name = "objptr" +{% endhighlight %} + +Basic blocks (see later) are contained within functions. When newly +created, a function has no basic blocks. They have to be added +explicitly, using the `append_basic_block` method, which adds a new, +empty basic block as the last one in the function. 
The first basic block +of the function can be retrieved using the `get_entry_basic_block` +method. The existing basic blocks can be enumerated by iterating over +the read-only property `basic_blocks`. The number of basic blocks +can be got via the `basic_block_count` method. Note that +`get_entry_basic_block` is slightly faster than `basic_blocks[0]` and so +is `basic_block_count`, over `len(f.basic_blocks)`. + +{% highlight python %} +# add a basic block +b1 = fn.append_basic_block("entry") + +# get the first one +b2 = fn.get_entry_basic_block() +b2 = fn.basic_blocks[0] # slower than previous method + +# print names of all basic blocks +for b in fn.basic_blocks: + print b.name + +# get number of basic blocks +n = fn.basic_block_count +n = len(fn.basic_blocks) # slower than previous method +{% endhighlight %} + +Functions can be deleted using the method `delete`. This deletes them +from their containing module. All references to the function object +should be dropped after `delete` has been called. + +Functions can be verified with the `verify` method. Note that this may +not work properly (aborts on errors). + +# Function Attributes # {#fnattr} +Function attributes, as documented +[here](http://www.llvm.org/docs/LangRef.html#fnattrs), can be +set on functions using the methods `add_attribute` and +`remove_attribute`. 
The following values may be used to refer to the +LLVM attributes: + + +Value | Equivalent LLVM Assembly Keyword | +------|----------------------------------| +`ATTR_ALWAYS_INLINE`|`alwaysinline` | +`ATTR_INLINE_HINT`|`inlinehint` | +`ATTR_NO_INLINE`|`noinline` | +`ATTR_OPTIMIZE_FOR_SIZE`|`optsize` | +`ATTR_NO_RETURN`|`noreturn` | +`ATTR_NO_UNWIND`|`nounwind` | +`ATTR_READ_NONE`|`readnone` | +`ATTR_READONLY`|`readonly` | +`ATTR_STACK_PROTECT`|`ssp` | +`ATTR_STACK_PROTECT_REQ`|`sspreq` | +`ATTR_NO_REDZONE`|`noredzone` | +`ATTR_NO_IMPLICIT_FLOAT`|`noimplicitfloat` | +`ATTR_NAKED`|`naked` | + + +Here is how attributes can be set and removed: + +{% highlight python %} +# create a function +ti = Type.int(32) +tf = Type.function(ti, [ti, ti]) +m = Module.new('mod') +f = m.add_function(tf, 'sum') +print f +# declare i32 @sum(i32, i32) + +# add a couple of attributes +f.add_attribute(ATTR_NO_UNWIND) +f.add_attribute(ATTR_READONLY) +print f +# declare i32 @sum(i32, i32) nounwind readonly +{% endhighlight %} + +**Related Links** + +[llvm.core.Function][], +[llvm.core.Argument][] + +[llvm.core.Function]: llvm.core.Function.html +[llvm.core.Argument]: llvm.core.Argument.html + diff --git a/docs/source/doc/kaleidoscope/PythonLangImpl1.md b/docs/source/doc/kaleidoscope/PythonLangImpl1.md new file mode 100644 index 0000000..4502432 --- /dev/null +++ b/docs/source/doc/kaleidoscope/PythonLangImpl1.md @@ -0,0 +1,330 @@ +--- +layout: page +title: "Kaleidoscope: Chapter 1" +--- + +# Tutorial Introduction and the Lexer + +Written by [Chris Lattner](mailto:sabre@nondot.org) +and [Max Shawabkeh](http://max99x.com) + +**Chapter 1** + + +* This will become a table of contents (this text will be scraped). +{:toc} + +[**Chapter 2: Implementing a Parser and AST**](PythonLangImpl2.html) + +# Introduction + +Welcome to the "Implementing a language with LLVM" tutorial. This tutorial +runs through the implementation of a simple language, showing how fun and +easy it can be. 
This tutorial will get you up and started as well as help to +build a framework you can extend to other languages. The code in this +tutorial can also be used as a playground to hack on other LLVM specific +things. + +It is useful to point out ahead of time that this tutorial is really about +teaching compiler techniques and LLVM specifically, *not* about teaching +modern and sane software engineering principles. In practice, this means that +we'll take a number of shortcuts to simplify the exposition. If you dig in and +use the code as a basis for future projects, fixing its deficiencies shouldn't +be hard. + + +We've tried to put this tutorial together in a way that makes chapters easy +to skip over if you are already familiar with or are uninterested in the +various pieces. The structure of the tutorial is: + +* **[Chapter 1](#language): Introduction to the Kaleidoscope language, +and the definition of its Lexer** -- This shows where we are going +and the basic functionality that we want it to do. In order to make this +tutorial maximally understandable and hackable, we choose to implement +everything in Python instead of using lexer and parser generators. LLVM +obviously works just fine with such tools, feel free to use one if you prefer. + +* **[Chapter 2](PythonLangImpl2.html): Implementing a Parser and AST** -- +With the lexer in place, we can talk about parsing techniques and +basic AST construction. This tutorial describes recursive descent parsing and +operator precedence parsing. Nothing in Chapters 1 or 2 is LLVM-specific, +the code doesn't even import the LLVM modules at this point. :) + +* **[Chapter 3](PythonLangImpl3.html): Code generation to LLVM IR** -- With +the AST ready, we can show off how easy generation of LLVM IR really is. 
+ +* **[Chapter 4](PythonLangImpl4.html): Adding JIT and Optimizer support** -- +Because a lot of people are interested in using LLVM as a JIT, +we'll dive right into it and show you the 3 lines it takes to add JIT support. +LLVM is also useful in many other ways, but this is one simple and "sexy" way +to show off its power. :) + +* **[Chapter 5](PythonLangImpl5.html): Extending the Language: Control Flow** + -- With the language up and running, we show how to extend it +with control flow operations (if/then/else and a 'for' loop). This gives us a +chance to talk about simple SSA construction and control flow. + +* **[Chapter 6](PythonLangImpl6.html): Extending the Language: +User-defined Operators** -- This is a silly but fun chapter that talks about +extending the language to let the user program define their own arbitrary +unary and binary operators (with assignable precedence!). This lets us build +a significant piece of the "language" as library routines. + +* **[Chapter 7](PythonLangImpl7.html): Extending the Language: +Mutable Variables** -- This chapter talks about adding user-defined local +variables along with an assignment operator. The interesting part about this +is how easy and trivial it is to construct SSA form in LLVM: no, LLVM does +*not* require your front-end to construct SSA form! + +* **[Chapter 8](PythonLangImpl8.html): Conclusion and other +useful LLVM tidbits** -- This chapter wraps up the series by talking about +potential ways to extend the language, but also includes a bunch of pointers to +info about "special topics" like adding garbage collection support, exceptions, +debugging, support for "spaghetti stacks", and a bunch of other tips and +tricks. + +By the end of the tutorial, we'll have written a bit less than 540 lines of +non-comment, non-blank, lines of code. 
With this small amount of code, we'll +have built up a very reasonable compiler for a non-trivial language including +a hand-written lexer, parser, AST, as well as code generation support with a JIT +compiler. While other systems may have interesting "hello world" tutorials, +I think the breadth of this tutorial is a great testament to the strengths of +LLVM and why you should consider it if you're interested in language or compiler +design. + +A note about this tutorial: we expect you to extend the language and play +with it on your own. Take the code and go crazy hacking away at it, compilers +don't need to be scary creatures - it can be a lot of fun to play with +languages! + +* * * + +# The Basic Language # {#language} + +This tutorial will be illustrated with a toy language that we'll call +"[Kaleidoscope](http://en.wikipedia.org/wiki/Kaleidoscope)" (derived +from "meaning beautiful, form, and view"). +Kaleidoscope is a procedural language that allows you to define functions, use +conditionals, math, etc. Over the course of the tutorial, we'll extend +Kaleidoscope to support the if/then/else construct, a for loop, user defined +operators, JIT compilation with a simple command line interface, etc. + +Because we want to keep things simple, the only datatype in Kaleidoscope is a +64-bit floating point type. As such, all values are implicitly double precision +and the language doesn't require type declarations. This gives the language a +very nice and simple syntax. For example, the following simple example computes +[Fibonacci numbers](http://en.wikipedia.org/wiki/Fibonacci_number): + +{% highlight python %} +# Compute the x'th fibonacci number. +def fib(x) + if x < 3 then + 1 + else + fib(x-1)+fib(x-2) + +# This expression will compute the 40th number. +fib(40) +{% endhighlight %} + + +We also allow Kaleidoscope to call into standard library functions (the LLVM +JIT makes this completely trivial). 
This means that you can use the 'extern' +keyword to define a function before you use it (this is also useful for mutually +recursive functions). For example: + +{% highlight python %} +extern sin(arg); +extern cos(arg); +extern atan2(arg1 arg2); + +atan2(sin(0.4), cos(42)) +{% endhighlight %} + + +A more interesting example is included in Chapter 6 where we write a little +Kaleidoscope application that [displays](PythonLangImpl6.html#example) +a Mandelbrot Set at various levels of magnification. + +Let's dive into the implementation of this language! + +* * * + +# The Lexer # {#lexer} + + +When it comes to implementing a language, the first thing needed is +the ability to process a text file and recognize what it says. +The traditional way to do this is to use a +[lexer](http://en.wikipedia.org/wiki/Lexical_analysis) (aka 'scanner') +to break the input up into "tokens". Each token returned by the lexer includes +a token type and potentially some metadata (e.g. the numeric value of a number). +First, we define the possibilities: + + +{% highlight python %} +# The lexer yields one of these types for each token. +class EOFToken(object): + pass + +class DefToken(object): + pass + +class ExternToken(object): + pass + +class IdentifierToken(object): + def __init__(self, name): self.name = name + +class NumberToken(object): + def __init__(self, value): self.value = value + +class CharacterToken(object): + def __init__(self, char): self.char = char + def __eq__(self, other): + return isinstance(other, CharacterToken) and self.char == other.char + def __ne__(self, other): return not self == other +{% endhighlight %} + + +Each token yielded by our lexer will be of one of the above types. For simple +tokens that are always the same, like the "def" keyword, the lexer will yield +`DefToken()`. Identifiers, numbers and characters, on the other +hand, have extra data, so when the lexer encounters the number 123.45, it will +emit it as `NumberToken(123.45)`. 
An identifier `foo` will be +emitted as `IdentifierToken('foo')`. And finally, an unknown character +like '+' will be returned as `CharacterToken('+')`. You may notice that +we overload the equality and inequality operators for the characters; this will +later simplify character comparisons in the parser code. + +The actual implementation of the lexer is a single function called `Tokenize`, +which takes a string and +[yields](http://docs.python.org/reference/simple_stmts.html#the-yield-statement) +tokens. For simplicity, we will use +[regular expressions](http://docs.python.org/library/re.html) +to parse out the tokens. This is terribly inefficient, but +perfectly sufficient for our needs. + +First, we define the regular expressions for our tokens. Numbers are strings +of digits, optionally followed by a period and another string of digits. +Identifiers (and keywords) are alphanumeric strings starting with a letter and +comments are anything between a hash (`#`) and the end of the line. + + +{% highlight python %} +import re + +... + +# Regular expressions that match tokens and comments of our language. +REGEX_NUMBER = re.compile('[0-9]+(?:\.[0-9]+)?') +REGEX_IDENTIFIER = re.compile('[a-zA-Z][a-zA-Z0-9]*') +REGEX_COMMENT = re.compile('#.*') + +{% endhighlight %} + +Next, let's start defining the `Tokenize` function itself. The first +thing we need to do is set up a loop that scans the string, while ignoring +whitespace between tokens: + + +{% highlight python %} +def Tokenize(string): + while string: + # Skip whitespace. + if string[0].isspace(): + string = string[1:] + continue + + ... + +{% endhighlight %} + +Next we want to find out what the next token is. For this we run the regexes +we defined above on the remainder of the string. To simplify the rest of the +code, we run all three regexes each time. As mentioned above, inefficiencies are +ignored for the purpose of this tutorial: + + +{% highlight python %} + # Run regexes. 
+ comment_match = REGEX_COMMENT.match(string) + number_match = REGEX_NUMBER.match(string) + identifier_match = REGEX_IDENTIFIER.match(string) +{% endhighlight %} + + +Now we check if any of the regexes matched. For comments, we simply +ignore the captured match: + + +{% highlight python %} + # Check if any of the regexes matched and yield the appropriate result. + if comment_match: + comment = comment_match.group(0) + string = string[len(comment):] +{% endhighlight %} + +For numbers, we yield the captured match, converted to a float and tagged +with the appropriate token type: + + + +{% highlight python %} + elif number_match: + number = number_match.group(0) + yield NumberToken(float(number)) + string = string[len(number):] +{% endhighlight %} + +The identifier case is a little more complex. We have to check for keywords +to decide whether we have captured an identifier or a keyword: + + +{% highlight python %} + elif identifier_match: + identifier = identifier_match.group(0) + # Check if we matched a keyword. + if identifier == 'def': + yield DefToken() + elif identifier == 'extern': + yield ExternToken() + else: + yield IdentifierToken(identifier) + string = string[len(identifier):] +{% endhighlight %} + +Finally, if we haven't recognized a comment, a number or an identifier, we +yield the current character as an "unknown character" token. This is used, for +example, for operators like `+` or `*`: + + +{% highlight python %} + else: + # Yield the unknown character. + yield CharacterToken(string[0]) + string = string[1:] +{% endhighlight %} + +Once we're done with the loop, we return a final end-of-file token: + + +{% highlight python %} + yield EOFToken() +{% endhighlight %} + + +With this, we have the complete lexer for the basic Kaleidoscope language +(the [full code listing](PythonLangImpl2.html#code) for the Lexer is +available in the [next chapter](PythonLangImpl2.html) of the +tutorial). 
Next we'll [build a simple parser that +uses this to build an Abstract Syntax Tree](PythonLangImpl2.html). +When we have that, we'll +include a driver so that you can use the lexer and parser together. + +* * * + +**[Next: Implementing a Parser and AST](PythonLangImpl2.html)** + + + diff --git a/docs/source/doc/kaleidoscope/PythonLangImpl2.md b/docs/source/doc/kaleidoscope/PythonLangImpl2.md new file mode 100644 index 0000000..143894c --- /dev/null +++ b/docs/source/doc/kaleidoscope/PythonLangImpl2.md @@ -0,0 +1,998 @@ +--- +layout: page +title: "Kaleidoscope: Chapter 2" +--- + +# Implementing a Parser and AST + +Written by [Chris Lattner](mailto:sabre@nondot.org) +and [Max Shawabkeh](http://max99x.com) + + +**Chapter 2** + + +* This will become a table of contents (this text will be scraped). +{:toc} + + +**[Chapter 3: Code generation to LLVM IR](PythonLangImpl3.html)** + + +# Introduction # {#intro} + +Welcome to Chapter 2 of the +[Implementing a language with LLVM](http://www.llvm.org/docs/tutorial/index.html) +tutorial. +This chapter shows you how to use the lexer, built in +[Chapter 1](PythonLangImpl1.html), to build a full +[parser](http://en.wikipedia.org/wiki/Parsing) for +our Kaleidoscope language. Once we have a parser, we'll define and build an +[Abstract Syntax Tree](http://en.wikipedia.org/wiki/Abstract_syntax_tree) +(AST). + +The parser we will build uses a combination of [Recursive Descent +Parsing](http://en.wikipedia.org/wiki/Recursive_descent_parser) and +[Operator-Precedence Parsing](http://en.wikipedia.org/wiki/Operator-precedence_parser) +to parse the Kaleidoscope language (the latter for +binary expressions and the former for everything else). Before we get to +parsing though, lets talk about the output of the parser: the Abstract Syntax +Tree. + +* * * + + +# The Abstract Syntax Tree (AST) # {#ast} + +The AST for a program captures its behavior in such a way that it is easy for +later stages of the compiler (e.g. 
code generation) to interpret. We basically +want one object for each construct in the language, and the AST should closely +model the language. In Kaleidoscope, we have expressions, a prototype, and a +function object. We'll start with expressions first: + +{% highlight python %} +# Base class for all expression nodes. +class ExpressionNode(object): + pass + +# Expression class for numeric literals like "1.0". +class NumberExpressionNode(ExpressionNode): + def __init__(self, value): + self.value = value + +{% endhighlight %} + +The code above shows the definition of the base ExpressionNode class and one +subclass which we use for numeric literals. The important thing to note about +this code is that the NumberExpressionNode class captures the numeric value of +the literal as an instance variable. This allows later phases of the compiler to +know what the stored numeric value is. + +Right now we only create the AST, so there are no useful methods on them. +It would be very easy to add a virtual method to pretty print the code, for +example. Here are the other expression AST node definitions that we'll use +in the basic form of the Kaleidoscope language: + +{% highlight python %} +# Expression class for referencing a variable, like "a". +class VariableExpressionNode(ExpressionNode): + def __init__(self, name): + self.name = name + +# Expression class for a binary operator. +class BinaryOperatorExpressionNode(ExpressionNode): + def __init__(self, operator, left, right): + self.operator = operator + self.left = left + self.right = right + +# Expression class for function calls. +class CallExpressionNode(ExpressionNode): + def __init__(self, callee, args): + self.callee = callee + self.args = args +{% endhighlight %} + +This is all (intentionally) rather straight-forward: variables capture the +variable name, binary operators capture their opcode (e.g. '+'), and calls +capture a function name as well as a list of any argument expressions. 
One thing +that is nice about our AST is that it captures the language features without +talking about the syntax of the language. Note that there is no discussion about +precedence of binary operators, lexical structure, etc. + +For our basic language, these are all of the expression nodes we'll define. +Because it doesn't have conditional control flow, it isn't Turing-complete; +we'll fix that in a later installment. The two things we need next are a way +to talk about the interface to a function, and a way to talk about functions +themselves: + + +{% highlight python %} +# This class represents the "prototype" for a function, which captures its name, +# and its argument names (thus implicitly the number of arguments the function +# takes). +class PrototypeNode(object): + def __init__(self, name, args): + self.name = name + self.args = args + +# This class represents a function definition itself. +class FunctionNode(object): + def __init__(self, prototype, body): + self.prototype = prototype + self.body = body +{% endhighlight %} + +In Kaleidoscope, functions are typed with just a count of their arguments. +Since all values are double precision floating point, the type of each argument +doesn't need to be stored anywhere. In a more aggressive and realistic +language, the `ExpressionNode` class would probably have a type field. + +With this scaffolding, we can now talk about parsing expressions and function +bodies in Kaleidoscope. + +* * * + + +# Parser Basics # {#parserbasics} + +Now that we have an AST to build, we need to define the parser code to build +it. 
The idea here is that we want to parse something like `x + y` (which +is returned as three tokens by the lexer) into an AST that could be generated +with calls like this: + +{% highlight python %} + x = VariableExpressionNode('x') + y = VariableExpressionNode('y') + result = BinaryOperatorExpressionNode('+', x, y) +{% endhighlight %} + +In order to do this, we'll start by defining a lightweight `Parser` +class with some basic helper routines: + +{% highlight python %} +class Parser(object): + + def __init__(self, tokens, binop_precedence): + self.tokens = tokens + self.binop_precedence = binop_precedence + self.Next() + + # Provide a simple token buffer. Parser.current is the current token the + # parser is looking at. Parser.Next() reads another token from the lexer and + # updates Parser.current with its results. + def Next(self): + self.current = self.tokens.next() +{% endhighlight %} + + +This implements a simple token buffer around the lexer. This allows +us to look one token ahead at what the lexer is returning. Every function in +our parser will assume that `self.current` is the current token that +needs to be parsed. Note that the first token is read as soon as the parser is +instantiated. Let us ignore the `binop_precedence` parameter for now. It +will be explained when we start [parsing binary operators](#parserbinops). + +With these basic helper functions, we can implement the first +piece of our grammar: numeric literals. + +* * * + +# Basic Expression Parsing # {#parserprimexprs} + +We start with numeric literals, because they are the simplest to process. +For each production in our grammar, we'll define a function which parses that +production. For numeric literals, we have: + +{% highlight python %} + # numberexpr ::= number + def ParseNumberExpr(self): + result = NumberExpressionNode(self.current.value) + self.Next() # consume the number. 
+ return result +{% endhighlight %} + + +This method is very simple: it expects to be called when the current token +is a `NumberToken`. It takes the current number value, creates a +`NumberExpressionNode`, advances to the next token, and finally returns. + + +There are some interesting aspects to this. The most important one is that +this routine eats all of the tokens that correspond to the production and +returns the lexer buffer with the next token (which is not part of the grammar +production) ready to go. This is a fairly standard way to go for recursive +descent parsers. For a better example, the parenthesis operator is defined like +this: + +{% highlight python %} + # parenexpr ::= '(' expression ')' + def ParseParenExpr(self): + self.Next() # eat '('. + + contents = self.ParseExpression() + + if self.current != CharacterToken(')'): + raise RuntimeError('Expected ")".') + self.Next() # eat ')'. + + return contents +{% endhighlight %} + + +This function illustrates an interesting aspect of the parser. The function +uses recursion by calling `ParseExpression` (we will soon see that +`ParseExpression` can call `ParseParenExpr`). This is powerful +because it allows us to handle recursive grammars, and keeps each production +very simple. Note that parentheses do not cause construction of AST nodes +themselves. While we could do it this way, the most important role of +parentheses are to guide the parser and provide grouping. Once the parser +constructs the AST, parentheses are not needed. + +The next simple production is for handling variable references and function +calls: + +{% highlight python %} + # identifierexpr ::= identifier | identifier '(' expression* ')' + def ParseIdentifierExpr(self): + identifier_name = self.current.name + self.Next() # eat identifier. + + if self.current != CharacterToken('('): # Simple variable reference. + return VariableExpressionNode(identifier_name); + + # Call. + self.Next() # eat '('. 
+ args = [] + if self.current != CharacterToken(')'): + while True: + args.append(self.ParseExpression()) + if self.current == CharacterToken(')'): + break + elif self.current != CharacterToken(','): + raise RuntimeError('Expected ")" or "," in argument list.') + self.Next() + + self.Next() # eat ')'. + return CallExpressionNode(identifier_name, args) +{% endhighlight %} + + +This routine follows the same style as the other routines. It expects to be +called if the current token is an `IdentifierToken`. It also has +recursion and error handling. One interesting aspect of this is that it uses +*look-ahead* to determine if the current identifier is a stand alone +variable reference or if it is a function call expression. It handles this by +checking to see if the token after the identifier is a '(' token, constructing +either a `VariableExpressionNode` or `CallExpressionNode` as +appropriate. + +Now that we have all of our simple expression-parsing logic in place, we can +define a helper function to wrap it together into one entry point. We call this +class of expressions "primary" expressions, for reasons that will become more +clear [later in the tutorial](PythonLangImpl6.html#unary). In order +to parse an arbitrary primary expression, we need to determine what sort of +expression it is: + +{% highlight python %} + # primary ::= identifierexpr | numberexpr | parenexpr + def ParsePrimary(self): + if isinstance(self.current, IdentifierToken): + return self.ParseIdentifierExpr() + elif isinstance(self.current, NumberToken): + return self.ParseNumberExpr(); + elif self.current == CharacterToken('('): + return self.ParseParenExpr() + else: + raise RuntimeError('Unknown token when expecting an expression.') +{% endhighlight %} + + +Now that you see the definition of this function, it is more obvious why we +can assume the state of `Parser.current` in the various functions. 
This +uses look-ahead to determine which sort of expression is being inspected, and +then parses it with a function call. + +Now that basic expressions are handled, we need to handle binary expressions. +They are a bit more complex. + + + + +* * * + +# Binary Expression Parsing # {#parserbinops} + +Binary expressions are significantly harder to parse because they are often +ambiguous. For example, when given the string `x+y*z`, the parser can choose +to parse it as either `(x+y)*z` or `x+(y*z)`. With common definitions from +mathematics, we expect the latter parse, because `*` (multiplication) has +higher *precedence* than `+` (addition). + +There are many ways to handle this, but an elegant and efficient way is +to use [Operator-Precedence Parsing](http://en.wikipedia.org/wiki/Operator-precedence_parser). +This parsing technique uses the precedence of binary operators to +guide recursion. To start with, we need a table of precedences. Remember the +`binop_precedence` parameter we passed to the `Parser` +constructor? Now is the time to use it: + +{% highlight python %} +def main(): + # Install standard binary operators. + # 1 is lowest possible precedence. 40 is the highest. + operator_precedence = { + '<': 10, + '+': 20, + '-': 20, + '*': 40 + } + + # Run the main `interpreter loop`. + while True: + + ... + + parser = Parser(Tokenize(raw), operator_precedence) + +{% endhighlight %} + +For the basic form of Kaleidoscope, we will only support 4 binary operators +(this can obviously be extended by you, our brave and intrepid reader). Having a +dictionary makes it easy to add new operators and makes it clear that the +algorithm doesn't depend on the specific operators involved, but it would be +easy enough to eliminate the map and hardcode the comparisons. 
+ +We also define a helper function to get the precedence of the current token, +or -1 if the token is not a binary operator: + + +{% highlight python %} + # Gets the precedence of the current token, or -1 if the token is not a binary + # operator. + def GetCurrentTokenPrecedence(self): + if isinstance(self.current, CharacterToken): + return self.binop_precedence.get(self.current.char, -1) + else: + return -1 +{% endhighlight %} + +With the helper above defined, we can now start parsing binary expressions. +The basic idea of operator precedence parsing is to break down an expression +with potentially ambiguous binary operators into pieces. Consider, for example, +the expression `a+b+(c+d)*e*f+g`. Operator precedence parsing considers this +as a stream of primary expressions separated by binary operators. As such, +it will first parse the leading primary expression `a`, then it will see the +pairs `[+, b]`, `[+, (c+d)]`, `[*, e]`, `[*, f]` and `[+, g]`. Note that because parentheses +are primary expressions, the binary expression parser doesn't need to worry +about nested subexpressions like `(c+d)` at all. + + + +To start, an expression is a primary expression potentially followed by a +sequence of `[binop,primaryexpr]` pairs: + +{% highlight python %} + # expression ::= primary binoprhs + def ParseExpression(self): + left = self.ParsePrimary() + return self.ParseBinOpRHS(left, 0) +{% endhighlight %} + +`ParseBinOpRHS` is the function that parses the sequence of pairs for +us. It takes a precedence and a pointer to an expression for the part that has +been parsed so far. Note that `x` is a perfectly valid expression: As such, +`binoprhs` is allowed to be empty, in which case it returns the expression that +is passed into it. In our example above, the code passes the expression for `a` +into `ParseBinOpRHS` and the current token is `+`. + +The precedence value passed into `ParseBinOpRHS` indicates the +*minimal operator precedence* that the function is allowed to eat.
For +example, if the current pair stream is `[+, x]` and `ParseBinOpRHS` is +passed in a precedence of 40, it will not consume any tokens (because the +precedence of '+' is only 20). With this in mind, `ParseBinOpRHS` starts +with: + +{% highlight python %} + # binoprhs ::= (operator primary)* + def ParseBinOpRHS(self, left, left_precedence): + # If this is a binary operator, find its precedence. + while True: + precedence = self.GetCurrentTokenPrecedence() + + # If this is a binary operator that binds at least as tightly as the + # current one, consume it; otherwise we are done. + if precedence < left_precedence: + return left +{% endhighlight %} + +This code gets the precedence of the current token and checks to see if it is +too low. Because we defined invalid tokens to have a precedence of -1, this +check implicitly knows that the pair-stream ends when the token stream runs out +of binary operators. If this check succeeds, we know that the token is a binary +operator and that it will be included in this expression: + +{% highlight python %} + binary_operator = self.current.char + self.Next() # eat the operator. + + # Parse the primary expression after the binary operator. + right = self.ParsePrimary() +{% endhighlight %} + +As such, this code eats (and remembers) the binary operator and then parses +the primary expression that follows. This builds up the whole pair, the first of +which is `[+, b]` for the running example. + +Now that we parsed the left-hand side of an expression and one pair of the +RHS sequence, we have to decide which way the expression associates. In +particular, we could have `(a+b) binop unparsed` or `a + (b binop unparsed)`. +To determine this, we look ahead at `binop` to determine its precedence and +compare it to `binary_operator`'s precedence (which is '+' in this case): + + +{% highlight python %} + # If binary_operator binds less tightly with right than the operator after + # right, let the pending operator take right as its left.
+ next_precedence = self.GetCurrentTokenPrecedence() + if precedence < next_precedence: +{% endhighlight %} + +If the precedence of the binop to the right of `RHS` is lower or equal to the +precedence of our current operator, then we know that the parentheses associate +as `(a+b) binop ...`. In our example, the current operator is `+` and the next +operator is `+`, we know that they have the same precedence. In this case we'll +create the AST node for `a+b`, and then continue parsing: + +{% highlight python %} + if precedence < next_precedence: + ... if body omitted ... + + # Merge left/right. + left = BinaryOperatorExpressionNode(binary_operator, left, right); +{% endhighlight %} + +In our example above, this will turn `a+b+` into `(a+b)` and execute the next +iteration of the loop, with `+` as the current token. The code above will eat, +remember, and parse `(c+d)` as the primary expression, which makes the +current pair equal to `[+, (c+d)]`. It will then evaluate the 'if' conditional +above with `*` as the binop to the right of the primary. In this case, the +precedence of `*` is higher than the precedence of `+` so the if condition will +be entered. + +The critical question left here is `how can the if condition parse the right +hand side in full`? In particular, to build the AST correctly for our example, +it needs to get all of ` ( c + d ) * e * f` as the RHS expression variable. The code to +do this is surprisingly simple (code from the above two blocks duplicated for +context): + +{% highlight python %} + # If binary_operator binds less tightly with right than the operator after + # right, let the pending operator take right as its left. + next_precedence = self.GetCurrentTokenPrecedence() + if precedence < next_precedence: + right = self.ParseBinOpRHS(right, precedence + 1) + + # Merge left/right. 
+ left = BinaryOperatorExpressionNode(binary_operator, left, right) +{% endhighlight %} + +At this point, we know that the binary operator to the RHS of our primary +has higher precedence than the binop we are currently parsing. As such, we know +that any sequence of pairs whose operators are all higher precedence than `+` +should be parsed together and returned as `RHS`. To do this, we recursively +invoke the `ParseBinOpRHS` function specifying `precedence + 1` as the +minimum precedence required for it to continue. In our example above, this +will cause it to return the AST node for `(c+d)*e*f` as RHS, which is then set +as the RHS of the '+' expression. + +Finally, on the next iteration of the while loop, the `+g` piece is parsed +and added to the AST. With this little bit of code (11 non-trivial lines), we +correctly handle fully general binary expression parsing in a very elegant way. +This was a whirlwind tour of this code, and it is somewhat subtle. I recommend +running through it with a few tough examples to see how it works. + + +This wraps up handling of expressions. At this point, we can point the +parser at an arbitrary token stream and build an expression from it, stopping +at the first token that is not part of the expression. Next up we need to +handle function definitions, etc. + + +* * * + +# Parsing the Rest # {#parsertop} + + +The next thing missing is handling of function prototypes. In Kaleidoscope, +these are used both for 'extern' function declarations as well as function body +definitions. The code to do this is straight-forward and not very interesting +(once you've survived expressions): + +{% highlight python %} + # prototype ::= id '(' id* ')' + def ParsePrototype(self): + if not isinstance(self.current, IdentifierToken): + raise RuntimeError('Expected function name in prototype.') + + function_name = self.current.name + self.Next() # eat function name. 
+ + if self.current != CharacterToken('('): + raise RuntimeError('Expected "(" in prototype.') + self.Next() # eat '('. + + arg_names = [] + while isinstance(self.current, IdentifierToken): + arg_names.append(self.current.name) + self.Next() + + if self.current != CharacterToken(')'): + raise RuntimeError('Expected ")" in prototype.') + + # Success. + self.Next() # eat ')'. + + return PrototypeNode(function_name, arg_names) +{% endhighlight %} + + +Given this, a function definition is very simple, just a prototype plus +an expression to implement the body: + + +{% highlight python %} + # definition ::= 'def' prototype expression + def ParseDefinition(self): + self.Next() # eat def. + proto = self.ParsePrototype() + body = self.ParseExpression() + return FunctionNode(proto, body) +{% endhighlight %} + + +In addition, we support 'extern' to declare functions like 'sin' and 'cos' as +well as to support forward declaration of user functions. These 'extern's are +just prototypes with no body: + + +{% highlight python %} + # external ::= 'extern' prototype + def ParseExtern(self): + self.Next() # eat extern. + return self.ParsePrototype() +{% endhighlight %} + + +Finally, we'll also let the user type in arbitrary top-level expressions and +evaluate them on the fly. We will handle this by defining anonymous nullary +(zero argument) functions for them: + + +{% highlight python %} + # toplevelexpr ::= expression + def ParseTopLevelExpr(self): + proto = PrototypeNode('', []) + return FunctionNode(proto, self.ParseExpression()) +{% endhighlight %} + + +Now that we have all the pieces, let's build a little driver that will let us +actually *execute* this code we've built! + + + +* * * + +# The Driver # {#driver} + + +The driver for this simply invokes all of the parsing pieces with a top-level +dispatch loop. There isn't much interesting here, so I'll just include the +top-level loop. See [below](#code) for full code. 
+ + +{% highlight python %} + # Run the main "interpreter loop". + while True: + print 'ready>', + try: + raw = raw_input() + except KeyboardInterrupt: + return + + parser = Parser(Tokenize(raw), operator_precedence) + while True: + # top ::= definition | external | expression | EOF + if isinstance(parser.current, EOFToken): + break + if isinstance(parser.current, DefToken): + parser.HandleDefinition() + elif isinstance(parser.current, ExternToken): + parser.HandleExtern() + else: + parser.HandleTopLevelExpression() +{% endhighlight %} + + +Here we create a new `Parser` for each line read, and try to parse out +all the expressions, declarations and definitions in the line. We also allow the +user to quit using Ctrl+C. + + + +* * * + +# Conclusions # {#conclusions} + + +With just under 330 lines of commented code (200 lines of non-comment, +non-blank code), we fully defined our minimal language, including a lexer, +parser, and AST builder. With this done, the executable will validate +Kaleidoscope code and tell us if it is grammatically invalid. For +example, here is a sample interaction: + + +{% highlight python %} +$ python kaleidoscope.py +ready> def foo(x y) x+foo(y, 4.0) +Parsed a function definition. +ready> def foo(x y) x+y y +Parsed a function definition. +Parsed a top-level expression. +ready> def foo(x y) x+y ) +Parsed a function definition. +Error: Unknown token when expecting an expression. +ready> extern sin(a); +Parsed an extern. +ready> ^C +$ +{% endhighlight %} + +There is a lot of room for extension here. You can define new AST nodes, +extend the language in many ways, etc. In the +[next installment](PythonLangImpl3.html), we will describe how to +generate LLVM Intermediate Representation (IR) from the AST. + + +* * * + +# Full Code Listing # {#code} + + +Here is the complete code listing for this and the previous chapter. +Note that it is fully self-contained: you don't need LLVM or any external +libraries at all for this. 
+ + +{% highlight python %} +#!/usr/bin/env python + +import re + +################################################################################ +## Lexer +################################################################################ + +# The lexer yields one of these types for each token. +class EOFToken(object): + pass + +class DefToken(object): + pass + +class ExternToken(object): + pass + +class IdentifierToken(object): + def __init__(self, name): self.name = name + +class NumberToken(object): + def __init__(self, value): self.value = value + +class CharacterToken(object): + def __init__(self, char): self.char = char + def __eq__(self, other): + return isinstance(other, CharacterToken) and self.char == other.char + def __ne__(self, other): return not self == other + +# Regular expressions that match the tokens and comments of our language. +REGEX_NUMBER = re.compile('[0-9]+(?:\.[0-9]+)?') +REGEX_IDENTIFIER = re.compile('[a-zA-Z][a-zA-Z0-9]*') +REGEX_COMMENT = re.compile('#.*') + +def Tokenize(string): + while string: + # Skip whitespace. + if string[0].isspace(): + string = string[1:] + continue + + # Run regexes. + comment_match = REGEX_COMMENT.match(string) + number_match = REGEX_NUMBER.match(string) + identifier_match = REGEX_IDENTIFIER.match(string) + + # Check if any of the regexes matched and yield the appropriate result. + if comment_match: + comment = comment_match.group(0) + string = string[len(comment):] + elif number_match: + number = number_match.group(0) + yield NumberToken(float(number)) + string = string[len(number):] + elif identifier_match: + identifier = identifier_match.group(0) + # Check if we matched a keyword. + if identifier == 'def': + yield DefToken() + elif identifier == 'extern': + yield ExternToken() + else: + yield IdentifierToken(identifier) + string = string[len(identifier):] + else: + # Yield the unknown character itself, wrapped in a CharacterToken.
+ yield CharacterToken(string[0]) + string = string[1:] + + yield EOFToken() + +################################################################################ +## Abstract Syntax Tree (aka Parse Tree) +################################################################################ + +# Base class for all expression nodes. +class ExpressionNode(object): + pass + +# Expression class for numeric literals like "1.0". +class NumberExpressionNode(ExpressionNode): + def __init__(self, value): + self.value = value + +# Expression class for referencing a variable, like "a". +class VariableExpressionNode(ExpressionNode): + def __init__(self, name): + self.name = name + +# Expression class for a binary operator. +class BinaryOperatorExpressionNode(ExpressionNode): + def __init__(self, operator, left, right): + self.operator = operator + self.left = left + self.right = right + +# Expression class for function calls. +class CallExpressionNode(ExpressionNode): + def __init__(self, callee, args): + self.callee = callee + self.args = args + +# This class represents the "prototype" for a function, which captures its name, +# and its argument names (thus implicitly the number of arguments the function +# takes). +class PrototypeNode(object): + def __init__(self, name, args): + self.name = name + self.args = args + +# This class represents a function definition itself. +class FunctionNode(object): + def __init__(self, prototype, body): + self.prototype = prototype + self.body = body + + +################################################################################ +## Parser +################################################################################ + +class Parser(object): + + def __init__(self, tokens, binop_precedence): + self.tokens = tokens + self.binop_precedence = binop_precedence + self.Next() + + # Provide a simple token buffer. Parser.current is the current token the + # parser is looking at. 
Parser.Next() reads another token from the lexer and + # updates Parser.current with its results. + def Next(self): + self.current = self.tokens.next() + + # Gets the precedence of the current token, or -1 if the token is not a binary + # operator. + def GetCurrentTokenPrecedence(self): + if isinstance(self.current, CharacterToken): + return self.binop_precedence.get(self.current.char, -1) + else: + return -1 + + # identifierexpr ::= identifier | identifier '(' expression* ')' + def ParseIdentifierExpr(self): + identifier_name = self.current.name + self.Next() # eat identifier. + + if self.current != CharacterToken('('): # Simple variable reference. + return VariableExpressionNode(identifier_name) + + # Call. + self.Next() # eat '('. + args = [] + if self.current != CharacterToken(')'): + while True: + args.append(self.ParseExpression()) + if self.current == CharacterToken(')'): + break + elif self.current != CharacterToken(','): + raise RuntimeError('Expected ")" or "," in argument list.') + self.Next() + + self.Next() # eat ')'. + return CallExpressionNode(identifier_name, args) + + # numberexpr ::= number + def ParseNumberExpr(self): + result = NumberExpressionNode(self.current.value) + self.Next() # consume the number. + return result + + # parenexpr ::= '(' expression ')' + def ParseParenExpr(self): + self.Next() # eat '('. + + contents = self.ParseExpression() + + if self.current != CharacterToken(')'): + raise RuntimeError('Expected ")".') + self.Next() # eat ')'. 
+ + return contents + + # primary ::= identifierexpr | numberexpr | parenexpr + def ParsePrimary(self): + if isinstance(self.current, IdentifierToken): + return self.ParseIdentifierExpr() + elif isinstance(self.current, NumberToken): + return self.ParseNumberExpr() + elif self.current == CharacterToken('('): + return self.ParseParenExpr() + else: + raise RuntimeError('Unknown token when expecting an expression.') + + # binoprhs ::= (operator primary)* + def ParseBinOpRHS(self, left, left_precedence): + # If this is a binary operator, find its precedence. + while True: + precedence = self.GetCurrentTokenPrecedence() + + # If this is a binary operator that binds at least as tightly as the + # current one, consume it; otherwise we are done. + if precedence < left_precedence: + return left + + binary_operator = self.current.char + self.Next() # eat the operator. + + # Parse the primary expression after the binary operator. + right = self.ParsePrimary() + + # If binary_operator binds less tightly with right than the operator after + # right, let the pending operator take right as its left. + next_precedence = self.GetCurrentTokenPrecedence() + if precedence < next_precedence: + right = self.ParseBinOpRHS(right, precedence + 1) + + # Merge left/right. + left = BinaryOperatorExpressionNode(binary_operator, left, right) + + # expression ::= primary binoprhs + def ParseExpression(self): + left = self.ParsePrimary() + return self.ParseBinOpRHS(left, 0) + + # prototype ::= id '(' id* ')' + def ParsePrototype(self): + if not isinstance(self.current, IdentifierToken): + raise RuntimeError('Expected function name in prototype.') + + function_name = self.current.name + self.Next() # eat function name. + + if self.current != CharacterToken('('): + raise RuntimeError('Expected "(" in prototype.') + self.Next() # eat '('. 
+ + arg_names = [] + while isinstance(self.current, IdentifierToken): + arg_names.append(self.current.name) + self.Next() + + if self.current != CharacterToken(')'): + raise RuntimeError('Expected ")" in prototype.') + + # Success. + self.Next() # eat ')'. + + return PrototypeNode(function_name, arg_names) + + # definition ::= 'def' prototype expression + def ParseDefinition(self): + self.Next() # eat def. + proto = self.ParsePrototype() + body = self.ParseExpression() + return FunctionNode(proto, body) + + # toplevelexpr ::= expression + def ParseTopLevelExpr(self): + proto = PrototypeNode('', []) + return FunctionNode(proto, self.ParseExpression()) + + # external ::= 'extern' prototype + def ParseExtern(self): + self.Next() # eat extern. + return self.ParsePrototype() + + # Top-Level parsing + def HandleDefinition(self): + self.Handle(self.ParseDefinition, 'Parsed a function definition.') + + def HandleExtern(self): + self.Handle(self.ParseExtern, 'Parsed an extern.') + + def HandleTopLevelExpression(self): + self.Handle(self.ParseTopLevelExpr, 'Parsed a top-level expression.') + + def Handle(self, function, message): + try: + function() + print message + except Exception, e: + print 'Error:', e + try: + self.Next() # Skip for error recovery. + except: + pass + +################################################################################ +## Main driver code. +################################################################################ + +def main(): + # Install standard binary operators. + # 1 is lowest possible precedence. 40 is the highest. + operator_precedence = { + '<': 10, + '+': 20, + '-': 20, + '*': 40 + } + + # Run the main "interpreter loop". 
+ while True: + print 'ready>', + try: + raw = raw_input() + except KeyboardInterrupt: + return + + parser = Parser(Tokenize(raw), operator_precedence) + while True: + # top ::= definition | external | expression | EOF + if isinstance(parser.current, EOFToken): + break + if isinstance(parser.current, DefToken): + parser.HandleDefinition() + elif isinstance(parser.current, ExternToken): + parser.HandleExtern() + else: + parser.HandleTopLevelExpression() + +if __name__ == '__main__': + main() +{% endhighlight %} + + +* * * + +**[Next: Implementing Code Generation to LLVM IR](PythonLangImpl3.html)** diff --git a/docs/source/doc/kaleidoscope/PythonLangImpl3.md b/docs/source/doc/kaleidoscope/PythonLangImpl3.md new file mode 100644 index 0000000..5045592 --- /dev/null +++ b/docs/source/doc/kaleidoscope/PythonLangImpl3.md @@ -0,0 +1,1062 @@ +--- +layout: page +title: "Kaleidoscope: Chapter 3" +--- + +# Code generation to LLVM IR + +Written by [Chris Lattner](mailto:sabre@nondot.org) +and [Max Shawabkeh](http://max99x.com) + + +**Chapter 3** + + +* This will become a table of contents (this text will be scraped). +{:toc} + + +**[Chapter 4: Adding JIT and Optimizer Support](PythonLangImpl4.html)** + + +# Introduction # {#intro} + +Welcome to Chapter 3 of the +[Implementing a language with LLVM](http://www.llvm.org/docs/tutorial/index.html) +tutorial. This chapter shows you how to transform the +[Abstract Syntax Tree](PythonLangImpl2.html), built in Chapter 2, into +LLVM IR. This will teach you a little bit about how LLVM does things, as well as +demonstrate how easy it is to use. It's much more work to build a lexer and +parser than it is to generate LLVM IR code. :) + + +**Please note**: the code in this chapter and later requires llvm-py 0.6 +and LLVM 2.7. Earlier versions will most likely not work with it. 
Also note +that you need to use a version of this tutorial that matches your llvm-py +release: If you are using an official llvm-py release, use the version of the +documentation on the [llvm-py examples page](http://www.mdevan.org/llvm-py/examples.html). + +* * * + +# Code Generation Setup # {#basics} + + +In order to generate LLVM IR, we want some simple setup to get started. First +we define code generation methods in each AST node class: + + +{% highlight python %} +# Expression class for numeric literals like "1.0". +class NumberExpressionNode(ExpressionNode): + + def __init__(self, value): + self.value = value + + def CodeGen(self): + ... + +# Expression class for referencing a variable, like "a". +class VariableExpressionNode(ExpressionNode): + + def __init__(self, name): + self.name = name + + def CodeGen(self): + ... + +... +{% endhighlight %} + + +The `CodeGen` method says to emit IR for that AST node along with all +the things it depends on, and they all return an LLVM Value object. "Value" is +the class used to represent a +"[Static Single Assignment (SSA)](http://en.wikipedia.org/wiki/Static_single_assignment_form) +register" or "SSA value" in LLVM. The most distinct aspect +of SSA values is that their value is computed as the related instruction +executes, and it does not get a new value until (and if) the instruction +re-executes. In other words, there is no way to "change" an SSA value. For +more information, please read up on +[Static Single Assignment](http://en.wikipedia.org/wiki/Static_single_assignment_form) - +the concepts are really quite natural once you grok them. + +We will also need to define some global variables which will be used +during code generation: + + +{% highlight python %} +# The LLVM module, which holds all the IR code. +g_llvm_module = Module.new('my cool jit') + +# The LLVM instruction builder. Created whenever a new function is entered.
+g_llvm_builder = None + +# A dictionary that keeps track of which values are defined in the current scope +# and what their LLVM representation is. +g_named_values = {} +{% endhighlight %} + + +`g_llvm_module` is the LLVM construct that contains all of the +functions and global variables in a chunk of code. In many ways, it is the +top-level structure that the LLVM IR uses to contain code. + +`g_llvm_builder` is a helper object that makes it easy to generate +LLVM instructions. Instances of the +[llvm.core.Builder](llvm.core.Builder.html) class keep track of the current place to insert +instructions and have methods to create new instructions. Note that we do not +initialize this variable; instead, it will be initialized whenever we start +generating code for a function. + +Finally, `g_named_values` is a dictionary that keeps track of which +values are defined in the current scope and what their LLVM representation is. +In other words, it is a symbol table for the code. In this form of +Kaleidoscope, the only things that can be referenced are function parameters. +As such, function parameters will be in this map when generating code for their +function body. + + +With these basics in place, we can start talking about how to generate code for +each expression. Note that this assumes that `g_llvm_builder` has been +set up to generate code *into* something. For now, we'll assume that +this has already been done, and we'll just use it to emit code. + + + + +* * * + + +# Expression Code Generation # {#exprs} + + +Generating LLVM code for expression nodes is very straightforward: less +than 35 lines of commented code for all four of our expression nodes. First +we'll do numeric literals: + + +{% highlight python %} + def CodeGen(self): + return Constant.real(Type.double(), self.value) +{% endhighlight %} + + +In llvm-py, floating point numeric constants are represented with the +`llvm.core.ConstantFP` class. 
To create one, we can use the static +`real()` method in the `llvm.core.Constant` class. This code +basically just creates and returns a `ConstantFP`. Note that in the LLVM +IR constants are all uniqued together and shared. For this reason, we create +the constant through a factory method instead of instantiating one directly. + + +{% highlight python %} + def CodeGen(self): + if self.name in g_named_values: + return g_named_values[self.name] + else: + raise RuntimeError('Unknown variable name: ' + self.name) +{% endhighlight %} + + +References to variables are also quite simple using LLVM. In the simple +version of Kaleidoscope, we assume that the variable has already been emitted +somewhere and its value is available. In practice, the only values that can be +in the `g_named_values` dictionary are function arguments. This code +simply checks to see that the specified name is in the map (if not, an unknown +variable is being referenced) and returns the value for it. In future chapters, +we'll add support for +[loop induction variables](PythonLangImpl5.html#for) in the symbol table, +and for [local variables](PythonLangImpl7.html#localvars). + +{% highlight python %} + def CodeGen(self): + left = self.left.CodeGen() + right = self.right.CodeGen() + + if self.operator == '+': + return g_llvm_builder.fadd(left, right, 'addtmp') + elif self.operator == '-': + return g_llvm_builder.fsub(left, right, 'subtmp') + elif self.operator == '*': + return g_llvm_builder.fmul(left, right, 'multmp') + elif self.operator == '<': + result = g_llvm_builder.fcmp(FCMP_ULT, left, right, 'cmptmp') + # Convert bool 0 or 1 to double 0.0 or 1.0. + return g_llvm_builder.uitofp(result, Type.double(), 'booltmp') + else: + raise RuntimeError('Unknown binary operator.') +{% endhighlight %} + + +Binary operators start to get more interesting. 
The basic idea here is that +we recursively emit code for the left-hand side of the expression, then the +right-hand side, then we compute the result of the binary expression depending +on which operator is being used. + + +In the example above, the LLVM builder class is starting to show its value. +`g_llvm_builder` knows where to insert the newly created instruction; all +you have to do is specify what instruction to create (e.g. with `fadd`), +which operands to use (`left` and `right` here) and optionally +provide a name for the generated instruction. + +One nice thing about LLVM is that the name is just a hint. For instance, if +the code above emits multiple "addtmp" variables, LLVM will automatically +provide each one with an increasing, unique numeric suffix. Local value names +for instructions are purely optional, but they make it much easier to read the +IR dumps. + +[LLVM instructions](http://www.llvm.org/docs/LangRef.html#instref) +are constrained by strict rules: for example, the left and right operands of +an [add instruction](http://www.llvm.org/docs/LangRef.html#i_add) +must have the same type, and the result type of the add must match the operand +types. Because all values in Kaleidoscope are doubles, this makes for very +simple code for add, sub and mul. + +On the other hand, LLVM specifies that the +[fcmp instruction](http://www.llvm.org/docs/LangRef.html#i_fcmp) always +returns an 'i1' value (a one bit integer). The problem with this is that +Kaleidoscope wants the value to be a 0.0 or 1.0 value. In order to get these +semantics, we combine the fcmp instruction with a +[uitofp instruction](http://www.llvm.org/docs/LangRef.html#i_uitofp). +This instruction converts its input integer into a floating point value by +treating the input as an unsigned value. In contrast, if we used the +[sitofp instruction](http://www.llvm.org/docs/LangRef.html#i_sitofp), +the Kaleidoscope `<` operator would return 0.0 and -1.0, depending on the +input value.
+ + +{% highlight python %} + def CodeGen(self): + # Look up the name in the global module table. + callee = g_llvm_module.get_function_named(self.callee) + + # Check for argument mismatch error. + if len(callee.args) != len(self.args): + raise RuntimeError('Incorrect number of arguments passed.') + + arg_values = [i.CodeGen() for i in self.args] + + return g_llvm_builder.call(callee, arg_values, 'calltmp') +{% endhighlight %} + + +Code generation for function calls is quite straightforward with LLVM. The +code above initially does a function name lookup in the LLVM Module's symbol +table. Recall that the LLVM Module is the container that holds all of the +functions we are JIT'ing. By giving each function the same name as what the +user specifies, we can use the LLVM symbol table to resolve function names for +us. + +Once we have the function to call, we codegen each argument that is to be +passed in, and create an LLVM +[call instruction](http://www.llvm.org/docs/LangRef.html#i_call). +Note that LLVM uses the native C calling conventions by default, allowing these +calls to also call into standard library functions like "sin" and "cos", with no +additional effort. + +This wraps up our handling of the four basic expressions that we have so far +in Kaleidoscope. Feel free to go in and add some more. For example, by +browsing the [LLVM language reference](http://www.llvm.org/docs/LangRef.html) +you'll find several other interesting instructions that are really +easy to plug into our basic framework. + + +* * * + + +# Function Code Generation # {#funcs} + + +Code generation for prototypes and functions must handle a number of +details, which make their code less beautiful than expression code +generation, but allows us to illustrate some important points. First, let's +talk about code generation for prototypes: they are used both for function +bodies and external function declarations. 
The code starts with: + + +{% highlight python %} + def CodeGen(self): + # Make the function type, eg. double(double,double). + funct_type = Type.function( + Type.double(), [Type.double()] * len(self.args), False) + + function = Function.new(g_llvm_module, funct_type, self.name) +{% endhighlight %} + + +The call to `Type.function` creates the `FunctionType` that +should be used for a given Prototype. Since all function arguments in +Kaleidoscope are of type double, the first line creates a list of "N" LLVM +double types. It then uses the `Type.function` method to create a +function type that takes "N" doubles as arguments, returns one double as a +result, and that is not vararg (the False parameter indicates this). Note that +Types in LLVM are uniqued just like Constants are, so you don't instantiate them +directly. + +The final line above actually creates the function that the prototype will +correspond to. This indicates the type and name to use, as well as which +module to insert into. Note that by default, the function will have +[external linkage]( 4+5 +Read a top-level expression: +define double @0() { +entry: + ret double 9.000000e+00 +} +{% endhighlight %} + + +Note how the parser turns the top-level expression into anonymous functions +for us. This will be handy when we add JIT +support in the next chapter. Also note that the code is very literally +transcribed, no optimizations are being performed except simple constant +folding done by the Builder. We will +add optimizations explicitly +in the next chapter. 
+ + +{% highlight bash %} +ready> def foo(a b) a*a + 2*a*b + b*b +Read a function definition: +define double @foo(double %a, double %b) { +entry: + %multmp = fmul double %a, %a ; [#uses=1] + %multmp1 = fmul double 2.000000e+00, %a ; [#uses=1] + %multmp2 = fmul double %multmp1, %b ; [#uses=1] + %addtmp = fadd double %multmp, %multmp2 ; [#uses=1] + %multmp3 = fmul double %b, %b ; [#uses=1] + %addtmp4 = fadd double %addtmp, %multmp3 ; [#uses=1] + ret double %addtmp4 +} +{% endhighlight %} + + +This shows some simple arithmetic. Notice the striking similarity to the +LLVM builder calls that we use to create the instructions. + + +{% highlight bash %} +ready> def bar(a) foo(a, 4.0) + bar(31337) +Read a function definition: +define double @bar(double %a) { +entry: + %calltmp = call double @foo(double %a, double 4.000000e+00) ; [#uses=1] + %calltmp1 = call double @bar(double 3.133700e+04) ; [#uses=1] + %addtmp = fadd double %calltmp, %calltmp1 ; [#uses=1] + ret double %addtmp +} +{% endhighlight %} + + +This shows some function calls. Note that this function will take a long +time to execute if you call it. In the future we'll add conditional control +flow to actually make recursion useful :). + + +{% highlight bash %} +ready> extern cos(x) +Read extern: +declare double @cos(double) + +ready> cos(1.234) +Read a top-level expression: +define double @1() { +entry: + %calltmp = call double @cos(double 1.234000e+00) ; [#uses=1] + ret double %calltmp +} +{% endhighlight %} + + +This shows an extern for the libm "cos" function, and a call to it. 
+ + + +{% highlight bash %} +ready> ^C +; ModuleID = 'my cool jit' + +define double @0() { +entry: + ret double 9.000000e+00 +} + +define double @foo(double %a, double %b) { +entry: + %multmp = fmul double %a, %a ; [#uses=1] + %multmp1 = fmul double 2.000000e+00, %a ; [#uses=1] + %multmp2 = fmul double %multmp1, %b ; [#uses=1] + %addtmp = fadd double %multmp, %multmp2 ; [#uses=1] + %multmp3 = fmul double %b, %b ; [#uses=1] + %addtmp4 = fadd double %addtmp, %multmp3 ; [#uses=1] + ret double %addtmp4 +} + +define double @bar(double %a) { +entry: + %calltmp = call double @foo(double %a, double 4.000000e+00) ; [#uses=1] + %calltmp1 = call double @bar(double 3.133700e+04) ; [#uses=1] + %addtmp = fadd double %calltmp, %calltmp1 ; [#uses=1] + ret double %addtmp +} + +declare double @cos(double) + +define double @1() { +entry: + %calltmp = call double @cos(double 1.234000e+00) ; [#uses=1] + ret double %calltmp +} +{% endhighlight %} + + +When you quit the current demo, it dumps out the IR for the entire module +generated. Here you can see the big picture with all the functions referencing +each other. + +This wraps up the third chapter of the Kaleidoscope tutorial. Up next, we'll +describe how to [add JIT codegen and optimizer support](PythonLangImpl4.html) +to this so we can actually start running code! + + + + +* * * + + +# Full Code Listing # {#code} + +Here is the complete code listing for our running example, enhanced with the +LLVM code generator. Because this uses the llvm-py libraries, you need to +[download](../download.html) and +[install](../userguide.html#install) them. + + + +{% highlight python %} +#!/usr/bin/env python + +import re +from llvm.core import Module, Constant, Type, Function, Builder, FCMP_ULT + +################################################################################ +## Globals +################################################################################ + +# The LLVM module, which holds all the IR code. 
g_llvm_module = Module.new('my cool jit')

# The LLVM instruction builder. Created whenever a new function is entered.
g_llvm_builder = None

# A dictionary that keeps track of which values are defined in the current scope
# and what their LLVM representation is.
g_named_values = {}

################################################################################
## Lexer
################################################################################

# The lexer yields one of these types for each token.

# Yielded exactly once, after the whole input string has been consumed.
class EOFToken(object):
    pass

# The 'def' keyword.
class DefToken(object):
    pass

# The 'extern' keyword.
class ExternToken(object):
    pass

# Any identifier that is not a keyword; 'name' holds its text.
class IdentifierToken(object):
    def __init__(self, name): self.name = name

# A numeric literal; Tokenize stores it in 'value' as a float.
class NumberToken(object):
    def __init__(self, value): self.value = value

# Any single character that is not part of a number, identifier or comment
# (operators, parentheses, commas, ...). Two CharacterTokens compare equal when
# they wrap the same character, which lets the parser write comparisons such as
# `self.current == CharacterToken('(')`.
class CharacterToken(object):
    def __init__(self, char): self.char = char
    def __eq__(self, other):
        return isinstance(other, CharacterToken) and self.char == other.char
    def __ne__(self, other): return not self == other

# Regular expressions that match the tokens and comments of our language.
# All of them are applied with .match(), i.e. anchored at the start of the
# remaining input.
REGEX_NUMBER = re.compile('[0-9]+(?:\.[0-9]+)?')
REGEX_IDENTIFIER = re.compile('[a-zA-Z][a-zA-Z0-9]*')
REGEX_COMMENT = re.compile('#.*')

# Generator that splits 'string' into tokens. Whitespace and '#' comments are
# skipped; every other piece of input produces one token object, and a final
# EOFToken is yielded once the input is exhausted.
def Tokenize(string):
    while string:
        # Skip whitespace.
        if string[0].isspace():
            string = string[1:]
            continue

        # Run regexes.
        comment_match = REGEX_COMMENT.match(string)
        number_match = REGEX_NUMBER.match(string)
        identifier_match = REGEX_IDENTIFIER.match(string)

        # Check if any of the regexes matched and yield the appropriate result.
        if comment_match:
            # Comments produce no token; just drop the matched text.
            comment = comment_match.group(0)
            string = string[len(comment):]
        elif number_match:
            number = number_match.group(0)
            yield NumberToken(float(number))
            string = string[len(number):]
        elif identifier_match:
            identifier = identifier_match.group(0)
            # Check if we matched a keyword.
            if identifier == 'def':
                yield DefToken()
            elif identifier == 'extern':
                yield ExternToken()
            else:
                yield IdentifierToken(identifier)
            string = string[len(identifier):]
        else:
            # Yield the unrecognized character itself, wrapped in a
            # CharacterToken.
            yield CharacterToken(string[0])
            string = string[1:]

    yield EOFToken()

################################################################################
## Abstract Syntax Tree (aka Parse Tree)
################################################################################

# Base class for all expression nodes.
class ExpressionNode(object):
    pass

# Expression class for numeric literals like "1.0".
class NumberExpressionNode(ExpressionNode):

    def __init__(self, value):
        self.value = value

    # Returns the literal as an LLVM constant of type double.
    def CodeGen(self):
        return Constant.real(Type.double(), self.value)

# Expression class for referencing a variable, like "a".
class VariableExpressionNode(ExpressionNode):

    def __init__(self, name):
        self.name = name

    # Returns the value registered for this name in g_named_values, or raises
    # RuntimeError if the name is not defined in the current scope.
    def CodeGen(self):
        if self.name in g_named_values:
            return g_named_values[self.name]
        else:
            raise RuntimeError('Unknown variable name: ' + self.name)

# Expression class for a binary operator.
class BinaryOperatorExpressionNode(ExpressionNode):

    def __init__(self, operator, left, right):
        self.operator = operator
        self.left = left
        self.right = right

    # Generates code for both operands (left first), then emits the
    # instruction for the operator. Supported operators are '+', '-', '*' and
    # '<'; anything else raises RuntimeError.
    def CodeGen(self):
        left = self.left.CodeGen()
        right = self.right.CodeGen()

        if self.operator == '+':
            return g_llvm_builder.fadd(left, right, 'addtmp')
        elif self.operator == '-':
            return g_llvm_builder.fsub(left, right, 'subtmp')
        elif self.operator == '*':
            return g_llvm_builder.fmul(left, right, 'multmp')
        elif self.operator == '<':
            result = g_llvm_builder.fcmp(FCMP_ULT, left, right, 'cmptmp')
            # Convert bool 0 or 1 to double 0.0 or 1.0.
            return g_llvm_builder.uitofp(result, Type.double(), 'booltmp')
        else:
            raise RuntimeError('Unknown binary operator.')

# Expression class for function calls.
class CallExpressionNode(ExpressionNode):
    # 'callee' is the name of the function being called and 'args' is the list
    # of argument expression nodes.

    def __init__(self, callee, args):
        self.callee = callee
        self.args = args

    # Emits a call instruction to the named function and returns its value.
    # Raises RuntimeError when the number of arguments passed differs from the
    # number of arguments the callee was declared with.
    def CodeGen(self):
        # Look up the name in the global module table.
        callee = g_llvm_module.get_function_named(self.callee)

        # Check for argument mismatch error.
        if len(callee.args) != len(self.args):
            raise RuntimeError('Incorrect number of arguments passed.')

        # Generate code for every argument, in order, before emitting the call.
        arg_values = [i.CodeGen() for i in self.args]

        return g_llvm_builder.call(callee, arg_values, 'calltmp')

# This class represents the "prototype" for a function, which captures its name,
# and its argument names (thus implicitly the number of arguments the function
# takes).
class PrototypeNode(object):

    def __init__(self, name, args):
        self.name = name
        self.args = args

    # Declares the function in g_llvm_module (or reuses an existing declaration
    # of the same name), names its arguments, registers them in g_named_values
    # and returns the LLVM function.
    #
    # Raises RuntimeError if a function with this name already has a body, or
    # if it was previously declared with a different number of arguments.
    def CodeGen(self):
        # Make the function type, eg. double(double,double).
        funct_type = Type.function(
            Type.double(), [Type.double()] * len(self.args), False)

        function = Function.new(g_llvm_module, funct_type, self.name)

        # If the name conflicted, there was already something with the same name.
        # If it has a body, don't allow redefinition or reextern.
        if function.name != self.name:
            # Discard the freshly created duplicate and work with the function
            # that already owns the requested name.
            function.delete()
            function = g_llvm_module.get_function_named(self.name)

            # If the function already has a body, reject this.
            if not function.is_declaration:
                raise RuntimeError('Redefinition of function.')

            # If F took a different number of args, reject.
            # (Compare against the existing function; there is no 'callee' in
            # this scope.)
            if len(function.args) != len(self.args):
                raise RuntimeError('Redeclaration of a function with different number '
                                   'of args.')

        # Set names for all arguments and add them to the variables symbol table.
        for arg, arg_name in zip(function.args, self.args):
            arg.name = arg_name
            # Add arguments to variable symbol table.
            g_named_values[arg_name] = arg

        return function

# This class represents a function definition itself.
+class FunctionNode(object): + + def __init__(self, prototype, body): + self.prototype = prototype + self.body = body + + def CodeGen(self): + # Clear scope. + g_named_values.clear() + + # Create a function object. + function = self.prototype.CodeGen() + + # Create a new basic block to start insertion into. + block = function.append_basic_block('entry') + global g_llvm_builder + g_llvm_builder = Builder.new(block) + + # Finish off the function. + try: + return_value = self.body.CodeGen() + g_llvm_builder.ret(return_value) + + # Validate the generated code, checking for consistency. + function.verify() + except: + function.delete() + raise + + return function + + +################################################################################ +## Parser +################################################################################ + +class Parser(object): + + def __init__(self, tokens, binop_precedence): + self.tokens = tokens + self.binop_precedence = binop_precedence + self.Next() + + # Provide a simple token buffer. Parser.current is the current token the + # parser is looking at. Parser.Next() reads another token from the lexer and + # updates Parser.current with its results. + def Next(self): + self.current = self.tokens.next() + + # Gets the precedence of the current token, or -1 if the token is not a binary + # operator. + def GetCurrentTokenPrecedence(self): + if isinstance(self.current, CharacterToken): + return self.binop_precedence.get(self.current.char, -1) + else: + return -1 + + # identifierexpr ::= identifier | identifier '(' expression* ')' + def ParseIdentifierExpr(self): + identifier_name = self.current.name + self.Next() # eat identifier. + + if self.current != CharacterToken('('): # Simple variable reference. + return VariableExpressionNode(identifier_name) + + # Call. + self.Next() # eat '('. 
+ args = [] + if self.current != CharacterToken(')'): + while True: + args.append(self.ParseExpression()) + if self.current == CharacterToken(')'): + break + elif self.current != CharacterToken(','): + raise RuntimeError('Expected ")" or "," in argument list.') + self.Next() + + self.Next() # eat ')'. + return CallExpressionNode(identifier_name, args) + + # numberexpr ::= number + def ParseNumberExpr(self): + result = NumberExpressionNode(self.current.value) + self.Next() # consume the number. + return result + + # parenexpr ::= '(' expression ')' + def ParseParenExpr(self): + self.Next() # eat '('. + + contents = self.ParseExpression() + + if self.current != CharacterToken(')'): + raise RuntimeError('Expected ")".') + self.Next() # eat ')'. + + return contents + + # primary ::= identifierexpr | numberexpr | parenexpr + def ParsePrimary(self): + if isinstance(self.current, IdentifierToken): + return self.ParseIdentifierExpr() + elif isinstance(self.current, NumberToken): + return self.ParseNumberExpr() + elif self.current == CharacterToken('('): + return self.ParseParenExpr() + else: + raise RuntimeError('Unknown token when expecting an expression.') + + # binoprhs ::= (operator primary)* + def ParseBinOpRHS(self, left, left_precedence): + # If this is a binary operator, find its precedence. + while True: + precedence = self.GetCurrentTokenPrecedence() + + # If this is a binary operator that binds at least as tightly as the + # current one, consume it; otherwise we are done. + if precedence < left_precedence: + return left + + binary_operator = self.current.char + self.Next() # eat the operator. + + # Parse the primary expression after the binary operator. + right = self.ParsePrimary() + + # If binary_operator binds less tightly with right than the operator after + # right, let the pending operator take right as its left. 
+ next_precedence = self.GetCurrentTokenPrecedence() + if precedence < next_precedence: + right = self.ParseBinOpRHS(right, precedence + 1) + + # Merge left/right. + left = BinaryOperatorExpressionNode(binary_operator, left, right) + + # expression ::= primary binoprhs + def ParseExpression(self): + left = self.ParsePrimary() + return self.ParseBinOpRHS(left, 0) + + # prototype ::= id '(' id* ')' + def ParsePrototype(self): + if not isinstance(self.current, IdentifierToken): + raise RuntimeError('Expected function name in prototype.') + + function_name = self.current.name + self.Next() # eat function name. + + if self.current != CharacterToken('('): + raise RuntimeError('Expected "(" in prototype.') + self.Next() # eat '('. + + arg_names = [] + while isinstance(self.current, IdentifierToken): + arg_names.append(self.current.name) + self.Next() + + if self.current != CharacterToken(')'): + raise RuntimeError('Expected ")" in prototype.') + + # Success. + self.Next() # eat ')'. + + return PrototypeNode(function_name, arg_names) + + # definition ::= 'def' prototype expression + def ParseDefinition(self): + self.Next() # eat def. + proto = self.ParsePrototype() + body = self.ParseExpression() + return FunctionNode(proto, body) + + # toplevelexpr ::= expression + def ParseTopLevelExpr(self): + proto = PrototypeNode('', []) + return FunctionNode(proto, self.ParseExpression()) + + # external ::= 'extern' prototype + def ParseExtern(self): + self.Next() # eat extern. 
+ return self.ParsePrototype() + + # Top-Level parsing + def HandleDefinition(self): + self.Handle(self.ParseDefinition, 'Read a function definition:') + + def HandleExtern(self): + self.Handle(self.ParseExtern, 'Read an extern:') + + def HandleTopLevelExpression(self): + self.Handle(self.ParseTopLevelExpr, 'Read a top-level expression:') + + def Handle(self, function, message): + try: + print message, function().CodeGen() + except Exception, e: + print 'Error:', e + try: + self.Next() # Skip for error recovery. + except: + pass + +################################################################################ +## Main driver code. +################################################################################ + +def main(): + # Install standard binary operators. + # 1 is lowest possible precedence. 40 is the highest. + operator_precedence = { + '<': 10, + '+': 20, + '-': 20, + '*': 40 + } + + # Run the main "interpreter loop". + while True: + print 'ready>', + try: + raw = raw_input() + except KeyboardInterrupt: + break + + parser = Parser(Tokenize(raw), operator_precedence) + while True: + # top ::= definition | external | expression | EOF + if isinstance(parser.current, EOFToken): + break + if isinstance(parser.current, DefToken): + parser.HandleDefinition() + elif isinstance(parser.current, ExternToken): + parser.HandleExtern() + else: + parser.HandleTopLevelExpression() + + # Print out all of the generated code. 
+ print '\n', g_llvm_module + +if __name__ == '__main__': + main() +{% endhighlight %} + +* * * + +**[Next: Adding JIT and Optimizer Support](PythonLangImpl4.html)** + diff --git a/docs/source/doc/kaleidoscope/PythonLangImpl4.md b/docs/source/doc/kaleidoscope/PythonLangImpl4.md new file mode 100644 index 0000000..d113f49 --- /dev/null +++ b/docs/source/doc/kaleidoscope/PythonLangImpl4.md @@ -0,0 +1,941 @@ +--- +layout: page +title: "Kaleidoscope: Chapter 4" +--- + +# Adding JIT and Optimizer Support + +Written by [Chris Lattner](mailto:sabre@nondot.org) +and [Max Shawabkeh](http://max99x.com) + + +**Chapter 4** + + +* This will become a table of contents (this text will be scraped). +{:toc} + + +**[Chapter 5: Extending the Language: Control Flow](PythonLangImpl5.html)** + + +# Introduction # {#intro} + +Welcome to Chapter 4 of the +[Implementing a language with LLVM](http://www.llvm.org/docs/tutorial/index.html) +tutorial. Chapters 1-3 described the implementation of a simple +language and added support for generating LLVM IR. This chapter describes +two new techniques: adding optimizer support to your language, and adding JIT +compiler support. These additions will demonstrate how to get nice, efficient +code for the Kaleidoscope language. + + +* * * + +# Trivial Constant Folding # {#trivialconstfold} + +Our demonstration for Chapter 3 is elegant and easy to extend. Unfortunately, +it does not produce wonderful code. The LLVM Builder, however, does give us +obvious optimizations when compiling simple code: + + +{% highlight bash %} +ready> def test(x) 1+2+x +Read function definition: +define double @test(double %x) { +entry: + %addtmp = fadd double 3.000000e+00, %x + ret double %addtmp +} +{% endhighlight %} + + +This code is not a literal transcription of the AST built by parsing the +input. 
That would be: + + +{% highlight bash %} +ready> def test(x) 1+2+x +Read a function definition: +define double @test(double %x) { +entry: + %addtmp = fadd double 2.000000e+00, 1.000000e+00 + %addtmp1 = fadd double %addtmp, %x + ret double %addtmp1 +} +{% endhighlight %} + + +Constant folding, as seen above, in particular, is a very common and very +important optimization: so much so that many language implementors implement +constant folding support in their AST representation. + +With LLVM, you don't need this support in the AST. Since all calls to build +LLVM IR go through the LLVM IR builder, the builder itself checks to see if +there is a constant folding opportunity when you call it. If so, it just does +the constant fold and returns the constant instead of creating an instruction. + +Well, that was easy :). In practice, we recommend always using +`llvm.core.Builder` when generating code like this. It has no +"syntactic overhead" for its use (you don't have to uglify your compiler with +constant checks everywhere) and it can dramatically reduce the amount of +LLVM IR that is generated in some cases (particularly for languages with a macro +preprocessor or that use a lot of constants). + +On the other hand, the `Builder` is limited by the fact that it does +all of its analysis inline with the code as it is built. If you take a slightly +more complex example: + + +{% highlight bash %} +ready> def test(x) (1+2+x)*(x+(1+2)) +Read a function definition: +define double @test(double %x) { +entry: + %addtmp = fadd double 3.000000e+00, %x ; [#uses=1] + %addtmp1 = fadd double %x, 3.000000e+00 ; [#uses=1] + %multmp = fmul double %addtmp, %addtmp1 ; [#uses=1] + ret double %multmp +} +{% endhighlight %} + + +In this case, the LHS and RHS of the multiplication are the same value. We'd +really like to see this generate `tmp = x+3; result = tmp*tmp;` instead +of computing `x+3` twice. + +Unfortunately, no amount of local analysis will be able to detect and correct +this.
This requires two transformations: reassociation of expressions (to +make the adds lexically identical) and Common Subexpression Elimination (CSE) +to delete the redundant add instruction. Fortunately, LLVM provides a broad +range of optimizations that you can use, in the form of "passes". + + +* * * + +# LLVM Optimization Passes # {#optimizerpasses} + +LLVM provides many optimization passes, which do many different sorts of +things and have different tradeoffs. Unlike other systems, LLVM doesn't hold +to the mistaken notion that one set of optimizations is right for all languages +and for all situations. LLVM allows a compiler implementor to make complete +decisions about what optimizations to use, in which order, and in what +situation. + +As a concrete example, LLVM supports "whole module" passes, which look +across as large a body of code as they can (often a whole file, but if run +at link time, this can be a substantial portion of the whole program). It also +supports and includes "per-function" passes which just operate on a single +function at a time, without looking at other functions. For more information +on passes and how they are run, see the +[How to Write a Pass](http://www.llvm.org/docs/WritingAnLLVMPass.html) +document and the +[List of LLVM Passes](http://www.llvm.org/docs/Passes.html). + +For Kaleidoscope, we are currently generating functions on the fly, one at +a time, as the user types them in. We aren't shooting for the ultimate +optimization experience in this setting, but we also want to catch the easy and +quick stuff where possible. As such, we will choose to run a few per-function +optimizations as the user types the function in. If we wanted to make a "static +Kaleidoscope compiler", we would use exactly the code we have now, except that +we would defer running the optimizer until the entire file has been parsed.
+ +In order to get per-function optimizations going, we need to set up a +[FunctionPassManager](http://www.llvm.org/docs/WritingAnLLVMPass.html#passmanager) +to hold and organize the LLVM optimizations that we want +to run. Once we have that, we can add a set of optimizations to run. The code +looks like this: + + +{% highlight python %} +# The function optimization passes manager. +g_llvm_pass_manager = FunctionPassManager.new(g_llvm_module) + +# The LLVM execution engine. +g_llvm_executor = ExecutionEngine.new(g_llvm_module) + +... + +def main(): + # Set up the optimizer pipeline. Start with registering info about how the + # target lays out data structures. + g_llvm_pass_manager.add(g_llvm_executor.target_data) + # Do simple "peephole" optimizations and bit-twiddling optzns. + g_llvm_pass_manager.add(PASS_INSTRUCTION_COMBINING) + # Reassociate expressions. + g_llvm_pass_manager.add(PASS_REASSOCIATE) + # Eliminate Common SubExpressions. + g_llvm_pass_manager.add(PASS_GVN) + # Simplify the control flow graph (deleting unreachable blocks, etc). + g_llvm_pass_manager.add(PASS_CFG_SIMPLIFICATION) + + g_llvm_pass_manager.initialize() +{% endhighlight %} + + +This code defines a `FunctionPassManager`, +`g_llvm_pass_manager`. Once it is set up, we use a series of "add" calls +to add a bunch of LLVM passes. The first pass is basically boilerplate, it adds +a pass so that later optimizations know how the data structures in the program +are laid out. (The "`g_llvm_executor`" variable is related to the JIT, +which we will get to in the next section.) In this case, we choose to add 4 +optimization passes. The passes we chose here are a pretty standard set of +"cleanup" optimizations that are useful for a wide variety of code. I won't +delve into what they do but, believe me, they are a good starting place :). + +Once the pass manager is set up, we need to make use of it. 
We do this by +running it after our newly created function is constructed (in +`FunctionNode.CodeGen`), but before it is returned to the client: + + +{% highlight python %} + return_value = self.body.CodeGen() + g_llvm_builder.ret(return_value) + + # Validate the generated code, checking for consistency. + function.verify() + + # Optimize the function. + g_llvm_pass_manager.run(function) +{% endhighlight %} + + +As you can see, this is pretty straightforward. The +`FunctionPassManager` optimizes and updates the LLVM Function in place, +improving (hopefully) its body. With this in place, we can try our test above +again: + + +{% highlight bash %} +ready> def test(x) (1+2+x)*(x+(1+2)) +Read a function definition: +define double @test(double %x) { +entry: + %addtmp = fadd double %x, 3.000000e+00 ; [#uses=2] + %multmp = fmul double %addtmp, %addtmp ; [#uses=1] + ret double %multmp +} +{% endhighlight %} + + +As expected, we now get our nicely optimized code, saving a floating point +add instruction from every execution of this function. + +LLVM provides a wide variety of optimizations that can be used in certain +circumstances. Some +[documentation about the various passes](http://www.llvm.org/docs/Passes.html) +is available, but it isn't very complete. Another good source of +ideas can come from looking at the passes that `llvm-gcc` or +`llvm-ld` run to get started. The `opt` tool allows you to +experiment with passes from the command line, so you can see if they do +anything. + +Now that we have reasonable code coming out of our front-end, let's talk about +executing it! + +* * * + +# Adding a JIT Compiler # {#jit} + +Code that is available in LLVM IR can have a wide variety of tools +applied to it. For example, you can run optimizations on it (as we did above), +you can dump it out in textual or binary forms, you can compile the code to an +assembly file (.s) for some target, or you can JIT compile it.
The nice thing +about the LLVM IR representation is that it is the "common currency" between +many different parts of the compiler. + + +In this section, we'll add JIT compiler support to our interpreter. The +basic idea that we want for Kaleidoscope is to have the user enter function +bodies as they do now, but immediately evaluate the top-level expressions they +type in. For example, if they type in "1 + 2", we should evaluate and print +out 3. If they define a function, they should be able to call it from the +command line. + +In order to do this, we first declare and initialize the JIT. This is done +by adding and initializing a global variable: + + +{% highlight python %} +# The LLVM execution engine. +g_llvm_executor = ExecutionEngine.new(g_llvm_module) +{% endhighlight %} + + +This creates an abstract "Execution Engine" which can be either a JIT +compiler or the LLVM interpreter. LLVM will automatically pick a JIT compiler +for you if one is available for your platform, otherwise it will fall back to +the interpreter. + +Once the `ExecutionEngine` is created, the JIT is ready to be used. +We can use the `run_function` method of the execution engine to execute +a compiled function and get its return value. In our case, this means that we +can change the code that parses a top-level expression to look like this: + + +{% highlight python %} + def HandleTopLevelExpression(self): + try: + function = self.ParseTopLevelExpr().CodeGen() + result = g_llvm_executor.run_function(function, []) + print 'Evaluated to:', result.as_real(Type.double()) + except Exception, e: + print 'Error:', e + try: + self.Next() # Skip for error recovery. + except: + pass +{% endhighlight %} + + +Recall that we compile top-level expressions into a self-contained LLVM +function that takes no arguments and returns the computed double. + +With just these two changes, let's see how Kaleidoscope works now!
+ + +{% highlight python %} +ready> 4+5 +Read a top level expression: +define double @0() { +entry: + ret double 9.000000e+00 +} + +Evaluated to: 9.0 +{% endhighlight %} + + +Well this looks like it is basically working. The dump of the function +shows the "no argument function that always returns double" that we synthesize +for each top-level expression that is typed in. This demonstrates very basic +functionality, but can we do more? + + +{% highlight python %} +ready> def testfunc(x y) x + y*2 +Read a function definition: +define double @testfunc(double %x, double %y) { +entry: + %multmp = fmul double %y, 2.000000e+00 ; [#uses=1] + %addtmp = fadd double %multmp, %x ; [#uses=1] + ret double %addtmp +} + +ready> testfunc(4, 10) +Read a top level expression: +define double @0() { +entry: + %calltmp = call double @testfunc(double 4.000000e+00, double 1.000000e+01) ; [#uses=1] + ret double %calltmp +} + +*Evaluated to: 24.0* +{% endhighlight %} + + +This illustrates that we can now call user code, but there is something a bit +subtle going on here. Note that we only invoke the JIT on the anonymous +functions that *call testfunc*, but we never invoked it +on *testfunc* itself. What actually happened here is that the JIT +scanned for all non-JIT'd functions transitively called from the anonymous +function and compiled all of them before returning from `run_function()`. + + +The JIT provides a number of other more advanced interfaces for things like +freeing allocated machine code, rejit'ing functions to update them, etc. 
+However, even with this simple code, we get some surprisingly powerful +capabilities - check this out (I removed the dump of the anonymous functions, +you should get the idea by now :) : + + +{% highlight bash %} +ready> extern sin(x) +Read an extern: +declare double @sin(double) + +ready> extern cos(x) +Read an extern: +declare double @cos(double) + +ready> sin(1.0) +*Evaluated to: 0.841470984808* + +ready> def foo(x) sin(x)*sin(x) + cos(x)*cos(x) +Read a function definition: +define double @foo(double %x) { +entry: + %calltmp = call double @sin(double %x) ; [#uses=1] + %calltmp1 = call double @sin(double %x) ; [#uses=1] + %multmp = fmul double %calltmp, %calltmp1 ; [#uses=1] + %calltmp2 = call double @cos(double %x) ; [#uses=1] + %calltmp3 = call double @cos(double %x) ; [#uses=1] + %multmp4 = fmul double %calltmp2, %calltmp3 ; [#uses=1] + %addtmp = fadd double %multmp, %multmp4 ; [#uses=1] + ret double %addtmp +} + +ready> foo(4.0) +*Evaluated to: 1.000000* +{% endhighlight %} + + +Whoa, how does the JIT know about sin and cos? The answer is surprisingly +simple: in this example, the JIT started execution of a function and got to a +function call. It realized that the function was not yet JIT compiled and +invoked the standard set of routines to resolve the function. In this case, +there is no body defined for the function, so the JIT ended up calling +`dlsym("sin")` on the Python process that is hosting our Kaleidoscope +prompt. Since `sin` is defined within the JIT's address space, it +simply patches up calls in the module to call the libm version of `sin` +directly. + +One interesting application of this is that we can now extend the language +by writing arbitrary C++ code to implement operations. 
For example, we can +create a C file with the following simple function: + + + +{% highlight c %} +#include <stdio.h> + +double putchard(double x) { + putchar((char)x); + return 0; +} +{% endhighlight %} + + +We can then compile this into a shared library with GCC: + + +{% highlight bash %} +gcc -shared -fPIC -o putchard.so putchard.c +{% endhighlight %} + + +Now we can load this library into the Python process using +`llvm.core.load_library_permanently` and access it from Kaleidoscope to +produce simple output to the console: + + +{% highlight python %} +>>> import llvm.core +>>> llvm.core.load_library_permanently('/home/max/llvm-py-tutorial/putchard.so') +>>> import kaleidoscope +>>> kaleidoscope.main() +ready> extern putchard(x) +Read an extern: +declare double @putchard(double) + +ready> putchard(65) + putchard(66) + putchard(67) + putchard(10) +*ABC* +Evaluated to: 0.0 +{% endhighlight %} + + +Similar code could be used to implement file I/O, console input, and many +other capabilities in Kaleidoscope. + +This completes the JIT and optimizer chapter of the Kaleidoscope tutorial. At +this point, we can compile a non-Turing-complete programming language, optimize +and JIT compile it in a user-driven way. Next up we'll look into +[extending the language with control flow constructs](PythonLangImpl5.html), +tackling some interesting LLVM IR issues along the way.
+ + +* * * + +# Full Code Listing # {#code} + +Here is the complete code listing for our running example, enhanced with the +LLVM JIT and optimizer: + + + +{% highlight python %} +#!/usr/bin/env python + +import re +from llvm.core import Module, Constant, Type, Function, Builder, FCMP_ULT +from llvm.ee import ExecutionEngine, TargetData +from llvm.passes import FunctionPassManager +from llvm.passes import (PASS_INSTRUCTION_COMBINING, + PASS_REASSOCIATE, + PASS_GVN, + PASS_CFG_SIMPLIFICATION) + +################################################################################ +## Globals +################################################################################ + +# The LLVM module, which holds all the IR code. +g_llvm_module = Module.new('my cool jit') + +# The LLVM instruction builder. Created whenever a new function is entered. +g_llvm_builder = None + +# A dictionary that keeps track of which values are defined in the current scope +# and what their LLVM representation is. +g_named_values = {} + +# The function optimization passes manager. +g_llvm_pass_manager = FunctionPassManager.new(g_llvm_module) + +# The LLVM execution engine. +g_llvm_executor = ExecutionEngine.new(g_llvm_module) + +################################################################################ +## Lexer +################################################################################ + +# The lexer yields one of these types for each token. 
+class EOFToken(object): + pass + +class DefToken(object): + pass + +class ExternToken(object): + pass + +class IdentifierToken(object): + def __init__(self, name): self.name = name + +class NumberToken(object): + def __init__(self, value): self.value = value + +class CharacterToken(object): + def __init__(self, char): self.char = char + def __eq__(self, other): + return isinstance(other, CharacterToken) and self.char == other.char + def __ne__(self, other): return not self == other + +# Regular expressions that tokens and comments of our language. +REGEX_NUMBER = re.compile('[0-9]+(?:\.[0-9]+)?') +REGEX_IDENTIFIER = re.compile('[a-zA-Z][a-zA-Z0-9]*') +REGEX_COMMENT = re.compile('#.*') + +def Tokenize(string): + while string: + # Skip whitespace. + if string[0].isspace(): + string = string[1:] + continue + + # Run regexes. + comment_match = REGEX_COMMENT.match(string) + number_match = REGEX_NUMBER.match(string) + identifier_match = REGEX_IDENTIFIER.match(string) + + # Check if any of the regexes matched and yield the appropriate result. + if comment_match: + comment = comment_match.group(0) + string = string[len(comment):] + elif number_match: + number = number_match.group(0) + yield NumberToken(float(number)) + string = string[len(number):] + elif identifier_match: + identifier = identifier_match.group(0) + # Check if we matched a keyword. + if identifier == 'def': + yield DefToken() + elif identifier == 'extern': + yield ExternToken() + else: + yield IdentifierToken(identifier) + string = string[len(identifier):] + else: + # Yield the ASCII value of the unknown character. + yield CharacterToken(string[0]) + string = string[1:] + + yield EOFToken() + +################################################################################ +## Abstract Syntax Tree (aka Parse Tree) +################################################################################ + +# Base class for all expression nodes. 
+class ExpressionNode(object): + pass + +# Expression class for numeric literals like "1.0". +class NumberExpressionNode(ExpressionNode): + + def __init__(self, value): + self.value = value + + def CodeGen(self): + return Constant.real(Type.double(), self.value) + +# Expression class for referencing a variable, like "a". +class VariableExpressionNode(ExpressionNode): + + def __init__(self, name): + self.name = name + + def CodeGen(self): + if self.name in g_named_values: + return g_named_values[self.name] + else: + raise RuntimeError('Unknown variable name: ' + self.name) + +# Expression class for a binary operator. +class BinaryOperatorExpressionNode(ExpressionNode): + + def __init__(self, operator, left, right): + self.operator = operator + self.left = left + self.right = right + + def CodeGen(self): + left = self.left.CodeGen() + right = self.right.CodeGen() + + if self.operator == '+': + return g_llvm_builder.fadd(left, right, 'addtmp') + elif self.operator == '-': + return g_llvm_builder.fsub(left, right, 'subtmp') + elif self.operator == '*': + return g_llvm_builder.fmul(left, right, 'multmp') + elif self.operator == '<': + result = g_llvm_builder.fcmp(FCMP_ULT, left, right, 'cmptmp') + # Convert bool 0 or 1 to double 0.0 or 1.0. + return g_llvm_builder.uitofp(result, Type.double(), 'booltmp') + else: + raise RuntimeError('Unknown binary operator.') + +# Expression class for function calls. +class CallExpressionNode(ExpressionNode): + + def __init__(self, callee, args): + self.callee = callee + self.args = args + + def CodeGen(self): + # Look up the name in the global module table. + callee = g_llvm_module.get_function_named(self.callee) + + # Check for argument mismatch error. 
+ if len(callee.args) != len(self.args): + raise RuntimeError('Incorrect number of arguments passed.') + + arg_values = [i.CodeGen() for i in self.args] + + return g_llvm_builder.call(callee, arg_values, 'calltmp') + +# This class represents the "prototype" for a function, which captures its name, +# and its argument names (thus implicitly the number of arguments the function +# takes). +class PrototypeNode(object): + + def __init__(self, name, args): + self.name = name + self.args = args + + def CodeGen(self): + # Make the function type, eg. double(double,double). + funct_type = Type.function( + Type.double(), [Type.double()] * len(self.args), False) + + function = Function.new(g_llvm_module, funct_type, self.name) + + # If the name conflicted, there was already something with the same name. + # If it has a body, don't allow redefinition or reextern. + if function.name != self.name: + function.delete() + function = g_llvm_module.get_function_named(self.name) + + # If the function already has a body, reject this. + if not function.is_declaration: + raise RuntimeError('Redefinition of function.') + + # If F took a different number of args, reject. + if len(function.args) != len(self.args): + raise RuntimeError('Redeclaration of a function with different number ' + 'of args.') + + # Set names for all arguments and add them to the variables symbol table. + for arg, arg_name in zip(function.args, self.args): + arg.name = arg_name + # Add arguments to variable symbol table. + g_named_values[arg_name] = arg + + return function + +# This class represents a function definition itself. +class FunctionNode(object): + + def __init__(self, prototype, body): + self.prototype = prototype + self.body = body + + def CodeGen(self): + # Clear scope. + g_named_values.clear() + + # Create a function object. + function = self.prototype.CodeGen() + + # Create a new basic block to start insertion into. 
+ block = function.append_basic_block('entry') + global g_llvm_builder + g_llvm_builder = Builder.new(block) + + # Finish off the function. + try: + return_value = self.body.CodeGen() + g_llvm_builder.ret(return_value) + + # Validate the generated code, checking for consistency. + function.verify() + + # Optimize the function. + g_llvm_pass_manager.run(function) + except: + function.delete() + raise + + return function + + +################################################################################ +## Parser +################################################################################ + +class Parser(object): + + def __init__(self, tokens, binop_precedence): + self.tokens = tokens + self.binop_precedence = binop_precedence + self.Next() + + # Provide a simple token buffer. Parser.current is the current token the + # parser is looking at. Parser.Next() reads another token from the lexer and + # updates Parser.current with its results. + def Next(self): + self.current = self.tokens.next() + + # Gets the precedence of the current token, or -1 if the token is not a binary + # operator. + def GetCurrentTokenPrecedence(self): + if isinstance(self.current, CharacterToken): + return self.binop_precedence.get(self.current.char, -1) + else: + return -1 + + # identifierexpr ::= identifier | identifier '(' expression* ')' + def ParseIdentifierExpr(self): + identifier_name = self.current.name + self.Next() # eat identifier. + + if self.current != CharacterToken('('): # Simple variable reference. + return VariableExpressionNode(identifier_name) + + # Call. + self.Next() # eat '('. + args = [] + if self.current != CharacterToken(')'): + while True: + args.append(self.ParseExpression()) + if self.current == CharacterToken(')'): + break + elif self.current != CharacterToken(','): + raise RuntimeError('Expected ")" or "," in argument list.') + self.Next() + + self.Next() # eat ')'. 
+ return CallExpressionNode(identifier_name, args) + + # numberexpr ::= number + def ParseNumberExpr(self): + result = NumberExpressionNode(self.current.value) + self.Next() # consume the number. + return result + + # parenexpr ::= '(' expression ')' + def ParseParenExpr(self): + self.Next() # eat '('. + + contents = self.ParseExpression() + + if self.current != CharacterToken(')'): + raise RuntimeError('Expected ")".') + self.Next() # eat ')'. + + return contents + + # primary ::= identifierexpr | numberexpr | parenexpr + def ParsePrimary(self): + if isinstance(self.current, IdentifierToken): + return self.ParseIdentifierExpr() + elif isinstance(self.current, NumberToken): + return self.ParseNumberExpr() + elif self.current == CharacterToken('('): + return self.ParseParenExpr() + else: + raise RuntimeError('Unknown token when expecting an expression.') + + # binoprhs ::= (operator primary)* + def ParseBinOpRHS(self, left, left_precedence): + # If this is a binary operator, find its precedence. + while True: + precedence = self.GetCurrentTokenPrecedence() + + # If this is a binary operator that binds at least as tightly as the + # current one, consume it; otherwise we are done. + if precedence < left_precedence: + return left + + binary_operator = self.current.char + self.Next() # eat the operator. + + # Parse the primary expression after the binary operator. + right = self.ParsePrimary() + + # If binary_operator binds less tightly with right than the operator after + # right, let the pending operator take right as its left. + next_precedence = self.GetCurrentTokenPrecedence() + if precedence < next_precedence: + right = self.ParseBinOpRHS(right, precedence + 1) + + # Merge left/right. 
+ left = BinaryOperatorExpressionNode(binary_operator, left, right) + + # expression ::= primary binoprhs + def ParseExpression(self): + left = self.ParsePrimary() + return self.ParseBinOpRHS(left, 0) + + # prototype ::= id '(' id* ')' + def ParsePrototype(self): + if not isinstance(self.current, IdentifierToken): + raise RuntimeError('Expected function name in prototype.') + + function_name = self.current.name + self.Next() # eat function name. + + if self.current != CharacterToken('('): + raise RuntimeError('Expected "(" in prototype.') + self.Next() # eat '('. + + arg_names = [] + while isinstance(self.current, IdentifierToken): + arg_names.append(self.current.name) + self.Next() + + if self.current != CharacterToken(')'): + raise RuntimeError('Expected ")" in prototype.') + + # Success. + self.Next() # eat ')'. + + return PrototypeNode(function_name, arg_names) + + # definition ::= 'def' prototype expression + def ParseDefinition(self): + self.Next() # eat def. + proto = self.ParsePrototype() + body = self.ParseExpression() + return FunctionNode(proto, body) + + # toplevelexpr ::= expression + def ParseTopLevelExpr(self): + proto = PrototypeNode('', []) + return FunctionNode(proto, self.ParseExpression()) + + # external ::= 'extern' prototype + def ParseExtern(self): + self.Next() # eat extern. + return self.ParsePrototype() + + # Top-Level parsing + def HandleDefinition(self): + self.Handle(self.ParseDefinition, 'Read a function definition:') + + def HandleExtern(self): + self.Handle(self.ParseExtern, 'Read an extern:') + + def HandleTopLevelExpression(self): + try: + function = self.ParseTopLevelExpr().CodeGen() + result = g_llvm_executor.run_function(function, []) + print 'Evaluated to:', result.as_real(Type.double()) + except Exception, e: + print 'Error:', e + try: + self.Next() # Skip for error recovery. 
+ except: + pass + + def Handle(self, function, message): + try: + print message, function().CodeGen() + except Exception, e: + print 'Error:', e + try: + self.Next() # Skip for error recovery. + except: + pass + +################################################################################ +## Main driver code. +################################################################################ + +def main(): + # Set up the optimizer pipeline. Start with registering info about how the + # target lays out data structures. + g_llvm_pass_manager.add(g_llvm_executor.target_data) + # Do simple "peephole" optimizations and bit-twiddling optzns. + g_llvm_pass_manager.add(PASS_INSTRUCTION_COMBINING) + # Reassociate expressions. + g_llvm_pass_manager.add(PASS_REASSOCIATE) + # Eliminate Common SubExpressions. + g_llvm_pass_manager.add(PASS_GVN) + # Simplify the control flow graph (deleting unreachable blocks, etc). + g_llvm_pass_manager.add(PASS_CFG_SIMPLIFICATION) + + g_llvm_pass_manager.initialize() + + # Install standard binary operators. + # 1 is lowest possible precedence. 40 is the highest. + operator_precedence = { + '<': 10, + '+': 20, + '-': 20, + '*': 40 + } + + # Run the main "interpreter loop". + while True: + print 'ready>', + try: + raw = raw_input() + except KeyboardInterrupt: + break + + parser = Parser(Tokenize(raw), operator_precedence) + while True: + # top ::= definition | external | expression | EOF + if isinstance(parser.current, EOFToken): + break + if isinstance(parser.current, DefToken): + parser.HandleDefinition() + elif isinstance(parser.current, ExternToken): + parser.HandleExtern() + else: + parser.HandleTopLevelExpression() + + # Print out all of the generated code. 
+ print '\n', g_llvm_module + +if __name__ == '__main__': + main() +{% endhighlight %} + +* * * + +**[Next: Extending the language: control flow](PythonLangImpl5.html)** + diff --git a/docs/source/doc/kaleidoscope/PythonLangImpl5.md b/docs/source/doc/kaleidoscope/PythonLangImpl5.md new file mode 100644 index 0000000..93f4a36 --- /dev/null +++ b/docs/source/doc/kaleidoscope/PythonLangImpl5.md @@ -0,0 +1,1464 @@ +--- +layout: page +title: "Kaleidoscope: Chapter 5" +--- + +# Extending the Language: Control Flow + +Written by [Chris Lattner](mailto:sabre@nondot.org) +and [Max Shawabkeh](http://max99x.com) + + +**Chapter 5** + + +* This will become a table of contents (this text will be scraped). +{:toc} + + +**[Chapter 6: Extending the Language: User-defined Operators](PythonLangImpl6.html)** + + +# Introduction # {#intro} + + +Welcome to Chapter 5 of the +[Implementing a language with LLVM](http://www.llvm.org/docs/tutorial/index.html) +tutorial. Parts 1-4 described the implementation of the simple +Kaleidoscope language and included support for generating LLVM IR, followed by +optimizations and a JIT compiler. Unfortunately, as presented, Kaleidoscope is +mostly useless: it has no control flow other than call and return. This means +that you can't have conditional branches in the code, significantly limiting its +power. In this episode of "build that compiler", we'll extend Kaleidoscope to +have an if/then/else expression plus a simple 'for' loop. + + +* * * + +# If/Then/Else # {#ifthen} + +Extending Kaleidoscope to support if/then/else is quite straightforward. It +basically requires adding lexer support for this "new" concept to the lexer, +parser, AST, and LLVM code emitter. This example is nice, because it shows how +easy it is to "grow" a language over time, incrementally extending it as new +ideas are discovered. + +Before we get going on "how" we add this extension, lets talk about "what" we +want. 
The basic idea is that we want to be able to write this sort of thing: + + + +{% highlight python %} +def fib(x) + if x < 3 then + 1 + else + fib(x-1) + fib(x-2) +{% endhighlight %} + + +In Kaleidoscope, every construct is an expression: there are no statements. +As such, the if/then/else expression needs to return a value like any other. +Since we're using a mostly functional form, we'll have it evaluate its +conditional, then return the 'then' or 'else' value based on how the condition +was resolved. This is very similar to the C "?:" expression. + +The semantics of the if/then/else expression is that it evaluates the +condition to a boolean equality value: 0.0 is considered to be false and +everything else is considered to be true. +If the condition is true, the first subexpression is evaluated and returned, if +the condition is false, the second subexpression is evaluated and returned. +Since Kaleidoscope allows side-effects, this behavior is important to nail down. + + +Now that we know what we "want", let's break this down into its constituent +pieces. + + + +## Lexer Extensions for If/Then/Else ## {#iflexer} + +The lexer extensions are straightforward. First we add new token classes for +the relevant tokens: + +{% highlight python %} +class IfToken(object): pass +class ThenToken(object): pass +class ElseToken(object): pass +{% endhighlight %} + + +Once we have that, we recognize the new keywords in the lexer. This is pretty +simple stuff: + + +{% highlight python %} + ... + if identifier == 'def': + yield DefToken() + elif identifier == 'extern': + yield ExternToken() + elif identifier == 'if': + yield IfToken() + elif identifier == 'then': + yield ThenToken() + elif identifier == 'else': + yield ElseToken() + else: + yield IdentifierToken(identifier) +{% endhighlight %} + + +## AST Extensions for If/Then/Else ## {#ifast} + +To represent the new expression we add a new AST node for it: + +{% highlight python %} +# Expression class for if/then/else. 
+class IfExpressionNode(ExpressionNode): + + def __init__(self, condition, then_branch, else_branch): + self.condition = condition + self.then_branch = then_branch + self.else_branch = else_branch + + def CodeGen(self): + ... +{% endhighlight %} + + +The AST node just has pointers to the various subexpressions. + +## Parser Extensions for If/Then/Else ## {#ifparser} + +Now that we have the relevant tokens coming from the lexer and we have the +AST node to build, our parsing logic is relatively straightforward. First we +define a new parsing function: + + +{% highlight python %} + # ifexpr ::= 'if' expression 'then' expression 'else' expression + def ParseIfExpr(self): + self.Next() # eat the if. + + # condition. + condition = self.ParseExpression() + + if not isinstance(self.current, ThenToken): + raise RuntimeError('Expected "then".') + self.Next() # eat the then. + + then_branch = self.ParseExpression() + + if not isinstance(self.current, ElseToken): + raise RuntimeError('Expected "else".') + self.Next() # eat the else. + + else_branch = self.ParseExpression() + + return IfExpressionNode(condition, then_branch, else_branch) +{% endhighlight %} + + +Next we hook it up as a primary expression: + + +{% highlight python %} + def ParsePrimary(self): + if isinstance(self.current, IdentifierToken): + return self.ParseIdentifierExpr() + elif isinstance(self.current, NumberToken): + return self.ParseNumberExpr(); + elif isinstance(self.current, IfToken): + return self.ParseIfExpr() + elif self.current == CharacterToken('('): + return self.ParseParenExpr() + else: + raise RuntimeError('Unknown token when expecting an expression.') +{% endhighlight %} + + +## LLVM IR for If/Then/Else ## {#ifir} + +Now that we have it parsing and building the AST, the final piece is adding +LLVM code generation support. This is the most interesting part of the +if/then/else example, because this is where it starts to introduce new concepts. 
+All of the code above has been thoroughly described in previous chapters. + + +To motivate the code we want to produce, let's take a look at a simple +example. Consider: + + +{% highlight python %} +extern foo(); +extern bar(); +def baz(x) if x then foo() else bar(); +{% endhighlight %} + + +If you disable optimizations, the code you'll (soon) get from Kaleidoscope +looks something like this: + + + +{% highlight llvm %} +declare double @foo() +declare double @bar() +define double @baz(double %x) { +entry: + %ifcond = fcmp one double %x, 0.000000e+00 + br i1 %ifcond, label %then, label %else +then: ; preds = %entry + %calltmp = call double @foo() + br label %ifcont +else: ; preds = %entry + %calltmp1 = call double @bar() + br label %ifcont +ifcont: ; preds = %else, %then + %iftmp = phi double [ %calltmp, %then ], [ %calltmp1, %else ] + ret double %iftmp +} +{% endhighlight %} + + +To visualize the control flow graph, you can use a nifty feature of the LLVM +[opt](http://llvm.org/cmds/opt.html) tool. If you put this LLVM IR +into "t.ll" and run `llvm-as < t.ll | opt -analyze -view-cfg`, a +[window will pop up](http://www.llvm.org/docs/ProgrammersManual.html#ViewGraph) + and you'll see this graph: + +Example CFG + +Another way to get this is to call "`function.viewCFG()`" or +"`function.viewCFGOnly()`" (where `function` is a "`llvm.core.Function`") +either by inserting actual calls into the code and recompiling or by calling +these in the debugger. LLVM has many nice features for visualizing various +graphs, but note that these are available only if your LLVM was built with +Graphviz support (accomplished by having Graphviz and Ghostview installed when +building LLVM). + +Getting back to the generated code, it is fairly simple: the entry block +evaluates the conditional expression ("x" in our case here) and compares the +result to 0.0 with the +[fcmp](http://www.llvm.org/docs/LangRef.html#i_fcmp) one +instruction ('one' is "Ordered and Not Equal"). 
Based on the result of this +expression, the code jumps to either the "then" or "else" blocks, which contain +the expressions for the true/false cases. + +Once the then/else blocks are finished executing, they both branch back to +the 'ifcont' block to execute the code that happens after the if/then/else. In +this case the only thing left to do is to return to the caller of the function. +The question then becomes: how does the code know which expression to return? + + +The answer to this question involves an important SSA operation: the +[Phi operation](http://en.wikipedia.org/wiki/Static_single_assignment_form). +If you're not familiar with SSA, +[the wikipedia article](http://en.wikipedia.org/wiki/Static_single_assignment_form) +is a good introduction and there are various other introductions to +it available on your favorite search engine. The short version is that +"execution" of the Phi operation requires "remembering" which block control came +from. The Phi operation takes on the value corresponding to the input control +block. In this case, if control comes in from the "then" block, it gets the +value of "calltmp". If control comes from the "else" block, it gets the value +of "calltmp1". + +At this point, you are probably starting to think "Oh no! This means my +simple and elegant front-end will have to start generating SSA form in order to +use LLVM!". Fortunately, this is not the case, and we strongly advise +*not* implementing an SSA construction algorithm in your front-end +unless there is an amazingly good reason to do so. In practice, there are two +sorts of values that float around in code written for your average imperative +programming language that might need Phi nodes: + +1. Code that involves user variables: `x = 1; x = x + 1;` +2. Values that are implicit in the structure of your AST, + such as the Phi node in this case. + +In [Chapter 7](PythonLangImpl7.html) of this tutorial ("mutable +variables"), we'll talk about #1 in depth. 
For now, just believe me that you +don't need SSA construction to handle this case. For #2, you have the choice of +using the techniques that we will describe for #1, or you can insert Phi nodes +directly, if convenient. In this case, it is really really easy to generate +the Phi node, so we choose to do it directly. + +Okay, enough of the motivation and overview, let's generate code! + + +## Code Generation for If/Then/Else ## {#ifcodegen} + +In order to generate code for this, we implement the `CodeGen` method +for `IfExpressionNode`: + + +{% highlight python %} + def CodeGen(self): + condition = self.condition.CodeGen() + + # Convert condition to a bool by comparing non-equal to 0.0. + condition_bool = g_llvm_builder.fcmp( + FCMP_ONE, condition, Constant.real(Type.double(), 0), 'ifcond') +{% endhighlight %} + + +This code is straightforward and similar to what we saw before. We emit the +expression for the condition, then compare that value to zero to get a truth +value as a 1-bit (bool) value. + + +{% highlight python %} + function = g_llvm_builder.basic_block.function + + # Create blocks for the then and else cases. Insert the 'then' block at the + # end of the function. + then_block = function.append_basic_block('then') + else_block = function.append_basic_block('else') + merge_block = function.append_basic_block('ifcont') + + g_llvm_builder.cbranch(condition_bool, then_block, else_block) +{% endhighlight %} + + +This code creates the basic blocks that are related to the if/then/else +statement, and correspond directly to the blocks in the example above. The +first line gets the current Function object that is being built. It +gets this by asking the builder for the current BasicBlock, and asking that +block for its "parent" (the function it is currently embedded into). + +Once it has that, it creates three blocks which are automatically inserted +into the end of the function. 
Once the blocks are created, we can emit the +conditional branch that chooses between them. Note that creating new blocks +does not implicitly affect the Builder, so it is still inserting into the block +that the condition went into. + + +{% highlight python %} + # Emit then value. + g_llvm_builder.position_at_end(then_block) + then_value = self.then_branch.CodeGen() + g_llvm_builder.branch(merge_block) + + # Codegen of 'Then' can change the current block; update then_block for the + # PHI node. + then_block = g_llvm_builder.basic_block +{% endhighlight %} + + +After the conditional branch is inserted, we move the builder to start +inserting into the "then" block. Strictly speaking, this call moves the +insertion point to be at the end of the specified block. However, since the +"then" block is empty, it also starts out by inserting at the beginning of the +block. :) + +Once the insertion point is set, we recursively codegen the "then" expression +from the AST. To finish off the "then" block, we create an unconditional branch +to the merge block. One interesting (and very important) aspect of the LLVM IR +is that it +[requires all basic blocks to be "terminated"](http://www.llvm.org/docs/LangRef.html#functionstructure) with a +[control flow instruction](http://www.llvm.org/docs/LangRef.html#terminators) + such as return or branch. This means that all control flow, +*including fallthroughs* must be made explicit in the LLVM IR. If you +violate this rule, the verifier will emit an error. + +The final line here is quite subtle, but is very important. The basic issue +is that when we create the Phi node in the merge block, we need to set up the +block/value pairs that indicate how the Phi will work. Importantly, the Phi +node expects to have an entry for each predecessor of the block in the CFG. Why +then, are we getting the current block when we just set it to then_block 5 lines +above? 
The problem is that the "Then" expression may actually itself change the +block that the Builder is emitting into if, for example, it contains a nested +"if/then/else" expression. Because calling Codegen recursively could +arbitrarily change the notion of the current block, we are required to get an +up-to-date value for code that will set up the Phi node. + + +{% highlight python %} + # Emit else block. + g_llvm_builder.position_at_end(else_block) + else_value = self.else_branch.CodeGen() + g_llvm_builder.branch(merge_block) + + # Codegen of 'Else' can change the current block, update else_block for the + # PHI node. + else_block = g_llvm_builder.basic_block +{% endhighlight %} + + +Code generation for the 'else' block is basically identical to codegen for +the 'then' block. The first line moves the insertion point to the end of the +'else' block; recall that the 'else' block was already appended to the function +when it was created, so no extra step is needed to add it. Now that the 'then' and 'else' blocks +are emitted, we can finish up with the merge code: + + +{% highlight python %} + # Emit merge block. + g_llvm_builder.position_at_end(merge_block) + phi = g_llvm_builder.phi(Type.double(), 'iftmp') + phi.add_incoming(then_value, then_block) + phi.add_incoming(else_value, else_block) + + return phi +{% endhighlight %} + + +The first line changes the insertion point so that newly created code will go +into the "merge" block. Once that is done, we need to create the PHI node and +set up the block/value pairs for the PHI. + +Finally, the CodeGen function returns the phi node as the value computed by +the if/then/else expression. In our example above, this returned value will +feed into the code for the top-level function, which will create the return +instruction. + +Overall, we now have the ability to execute conditional code in +Kaleidoscope. With this extension, Kaleidoscope is a fairly complete language +that can calculate a wide variety of numeric functions. 
Next up we'll add +another useful expression that is familiar from non-functional languages... + + +* * * + +# 'for' Loop Expression # {#for} + + +Now that we know how to add basic control flow constructs to the language, +we have the tools to add more powerful things. Let's add something more +aggressive, a 'for' expression: + + +{% highlight python %} + extern putchard(char) + def printstar(n) + for i = 1, i < n, 1.0 in + putchard(42) # ascii 42 = '*' + + # print 100 '*' characters + printstar(100) +{% endhighlight %} + + +This expression defines a new variable (`i` in this case) which iterates from +a starting value, while the condition (`i < n` in this case) is true, +incrementing by an optional step value ("1.0" in this case). If the step value +is omitted, it defaults to 1.0. While the condition is true, the loop executes its +body expression. Because we don't have anything better to return, we'll just +define the loop as always returning 0.0. In the future when we have mutable +variables, it will get more useful. + +As before, let's talk about the changes that we need to make to Kaleidoscope to +support this. + + +## Lexer Extensions for the 'for' Loop ## {#forlexer} + +The lexer extensions are the same sort of thing as for if/then/else: + + +{% highlight python %} +... + +class ThenToken(object): pass +class ElseToken(object): pass +class ForToken(object): pass +class InToken(object): pass + +... + +def Tokenize(string): + + ... + + elif identifier == 'else': + yield ElseToken() + elif identifier == 'for': + yield ForToken() + elif identifier == 'in': + yield InToken() + else: + yield IdentifierToken(identifier) +{% endhighlight %} + + +## AST Extensions for the 'for' Loop ## {#forast} + +The AST node is just as simple. It basically boils down to capturing +the variable name and the constituent expressions in the node. + + +{% highlight python %} +# Expression class for for/in. 
+class ForExpressionNode(ExpressionNode): + + def __init__(self, loop_variable, start, end, step, body): + self.loop_variable = loop_variable + self.start = start + self.end = end + self.step = step + self.body = body + + def CodeGen(self): + ... +{% endhighlight %} + + +## Parser Extensions for the 'for' Loop ## {#forparser} + +The parser code is also fairly standard. The only interesting thing here is +handling of the optional step value. The parser code handles it by checking to +see if the second comma is present. If not, it sets the step value to `None` in +the AST node: + + +{% highlight python %} + # forexpr ::= 'for' identifier '=' expr ',' expr (',' expr)? 'in' expression + def ParseForExpr(self): + self.Next() # eat the for. + + if not isinstance(self.current, IdentifierToken): + raise RuntimeError('Expected identifier after for.') + + loop_variable = self.current.name + self.Next() # eat the identifier. + + if self.current != CharacterToken('='): + raise RuntimeError('Expected "=" after for variable.') + self.Next() # eat the '='. + + start = self.ParseExpression() + + if self.current != CharacterToken(','): + raise RuntimeError('Expected "," after for start value.') + self.Next() # eat the ','. + + end = self.ParseExpression() + + # The step value is optional. + if self.current == CharacterToken(','): + self.Next() # eat the ','. + step = self.ParseExpression() + else: + step = None + + if not isinstance(self.current, InToken): + raise RuntimeError('Expected "in" after for variable specification.') + self.Next() # eat 'in'. + + body = self.ParseExpression() + + return ForExpressionNode(loop_variable, start, end, step, body) +{% endhighlight %} + + +## LLVM IR for the 'for' Loop ## {#forir} + +Now we get to the good part: the LLVM IR we want to generate for this thing. 
+With the simple example above, we get this LLVM IR (note that this dump is +generated with optimizations disabled for clarity): + + + +{% highlight llvm %} +declare double @putchard(double) +define double @printstar(double %n) { +entry: + ; initial value = 1.0 (inlined into phi) + br label %loop +loop: ; preds = %loop, %entry + %i = phi double [ 1.000000e+00, %entry ], [ %nextvar, %loop ] + ; body + %calltmp = call double @putchard(double 4.200000e+01) + ; increment + %nextvar = fadd double %i, 1.000000e+00 + ; termination test + %cmptmp = fcmp ult double %i, %n + %booltmp = uitofp i1 %cmptmp to double + %loopcond = fcmp one double %booltmp, 0.000000e+00 + br i1 %loopcond, label %loop, label %afterloop +afterloop: ; preds = %loop + ; loop always returns 0.0 + ret double 0.000000e+00 +} +{% endhighlight %} + + +This loop contains all the same constructs we saw before: a phi node, several +expressions, and some basic blocks. Let's see how this fits together. + + + + +## Code Generation for the 'for' Loop ## {#forcodegen} + +The first part of `CodeGen` is very simple: we just output the start expression +for the loop value: + + +{% highlight python %} + def CodeGen(self): + # Emit the start code first, without 'variable' in scope. + start_value = self.start.CodeGen() +{% endhighlight %} + + +With this out of the way, the next step is to set up the LLVM basic block +for the start of the loop body. In the case above, the whole loop body is one +block, but remember that the body code itself could consist of multiple blocks +(e.g. if it contains an if/then/else or a for/in expression). + + +{% highlight python %} + # Make the new basic block for the loop header, inserting after current + # block. + function = g_llvm_builder.basic_block.function + pre_header_block = g_llvm_builder.basic_block + loop_block = function.append_basic_block('loop') + + # Insert an explicit fallthrough from the current block to the loop_block.
+ g_llvm_builder.branch(loop_block) +{% endhighlight %} + + +This code is similar to what we saw for if/then/else. Because we will need +it to create the Phi node, we remember the block that falls through into the +loop. Once we have that, we create the actual block that starts the loop and +create an unconditional branch for the fall-through between the two blocks. + + +{% highlight python %} + # Start insertion in loop_block. + g_llvm_builder.position_at_end(loop_block); + + # Start the PHI node with an entry for start. + variable_phi = g_llvm_builder.phi(Type.double(), self.loop_variable) + variable_phi.add_incoming(start_value, pre_header_block) +{% endhighlight %} + + +Now that the "pre_header_block" for the loop is set up, we switch to emitting +code for the loop body. To begin with, we move the insertion point and create +the PHI node for the loop induction variable. Since we already know the +incoming value for the starting value, we add it to the Phi node. Note that the +Phi will eventually get a second value for the backedge, but we can't set it up +yet (because it doesn't exist!). + + +{% highlight python %} + # Within the loop, the variable is defined equal to the PHI node. If it + # shadows an existing variable, we have to restore it, so save it now. + old_value = g_named_values.get(self.loop_variable, None) + g_named_values[self.loop_variable] = variable_phi + + # Emit the body of the loop. This, like any other expr, can change the + # current BB. Note that we ignore the value computed by the body. + self.body.CodeGen() +{% endhighlight %} + + +Now the code starts to get more interesting. Our 'for' loop introduces a new +variable to the symbol table. This means that our symbol table can now contain +either function arguments or loop variables. To handle this, before we codegen +the body of the loop, we add the loop variable as the current value for its +name. Note that it is possible that there is a variable of the same name in the +outer scope. 
It would be easy to make this an error (raise an exception +if there is already an entry for `self.loop_variable`) but we choose to allow shadowing +of variables. In order to handle this correctly, we remember the Value that +we are potentially shadowing in `old_value` (which will be `None` if there +is no shadowed variable). + +Once the loop variable is set into the symbol table, the code recursively +codegen's the body. This allows the body to use the loop variable: any +references to it will naturally find it in the symbol table. + + +{% highlight python %} + # Emit the step value. + if self.step: + step_value = self.step.CodeGen() + else: + # If not specified, use 1.0. + step_value = Constant.real(Type.double(), 1) + + next_value = g_llvm_builder.fadd(variable_phi, step_value, 'next') +{% endhighlight %} + + +Now that the body is emitted, we compute the next value of the iteration +variable by adding the step value, or 1.0 if it isn't present. +`next_value` will be the value of the loop variable on the next iteration +of the loop. + + +{% highlight python %} + # Compute the end condition and convert it to a bool by comparing to 0.0. + end_condition = self.end.CodeGen() + end_condition_bool = g_llvm_builder.fcmp( + FCMP_ONE, end_condition, Constant.real(Type.double(), 0), 'loopcond') +{% endhighlight %} + + +Finally, we evaluate the exit value of the loop, to determine whether the +loop should exit. This mirrors the condition evaluation for the if/then/else +statement. + + +{% highlight python %} + # Create the "after loop" block and insert it. + loop_end_block = g_llvm_builder.basic_block + after_block = function.append_basic_block('afterloop') + + # Insert the conditional branch into the end of loop_end_block. + g_llvm_builder.cbranch(end_condition_bool, loop_block, after_block) + + # Any new code will be inserted in after_block.
+ g_llvm_builder.position_at_end(after_block) +{% endhighlight %} + + +With the code for the body of the loop complete, we just need to finish up +the control flow for it. This code remembers the end block (for the phi node), +then creates the block for the loop exit ("afterloop"). Based on the value of +the exit condition, it creates a conditional branch that chooses between +executing the loop again and exiting the loop. Any future code is emitted in +the "afterloop" block, so it sets the insertion position to it. + + +{% highlight python %} + # Add a new entry to the PHI node for the backedge. + variable_phi.add_incoming(next_value, loop_end_block) + + # Restore the unshadowed variable. + if old_value: + g_named_values[self.loop_variable] = old_value + else: + del g_named_values[self.loop_variable] + + # for expr always returns 0.0. + return Constant.real(Type.double(), 0) +{% endhighlight %} + + +The final code handles various cleanups: now that we have the "next_value", +we can add the incoming value to the loop PHI node. After that, we restore the +shadowed variable, or remove the loop variable from the symbol table if nothing +was shadowed, so that the loop variable isn't in scope after the for +loop. Finally, code generation of the for loop always returns 0.0, so that is +what we return from `ForExpressionNode.CodeGen`. + +With this, we conclude the "adding control flow to Kaleidoscope" chapter of +the tutorial. In this chapter we added two control flow constructs, and used +them to motivate a couple of aspects of the LLVM IR that are important for +front-end implementors to know. In the next chapter of our saga, we will get a +bit crazier and add [user-defined operators](PythonLangImpl6.html) to +our poor innocent language.
+ +* * * + +# Full Code Listing # {#code} + +Here is the complete code listing for our running example, enhanced with the +if/then/else and for expressions: + + +{% highlight python %} +#!/usr/bin/env python + +import re +from llvm.core import Module, Constant, Type, Function, Builder +from llvm.ee import ExecutionEngine, TargetData +from llvm.passes import FunctionPassManager + +from llvm.core import FCMP_ULT, FCMP_ONE +from llvm.passes import (PASS_INSTRUCTION_COMBINING, + PASS_REASSOCIATE, + PASS_GVN, + PASS_CFG_SIMPLIFICATION) + +################################################################################ +## Globals +################################################################################ + +# The LLVM module, which holds all the IR code. +g_llvm_module = Module.new('my cool jit') + +# The LLVM instruction builder. Created whenever a new function is entered. +g_llvm_builder = None + +# A dictionary that keeps track of which values are defined in the current scope +# and what their LLVM representation is. +g_named_values = {} + +# The function optimization passes manager. +g_llvm_pass_manager = FunctionPassManager.new(g_llvm_module) + +# The LLVM execution engine. +g_llvm_executor = ExecutionEngine.new(g_llvm_module) + +################################################################################ +## Lexer +################################################################################ + +# The lexer yields one of these types for each token. 
+class EOFToken(object): pass +class DefToken(object): pass +class ExternToken(object): pass +class IfToken(object): pass +class ThenToken(object): pass +class ElseToken(object): pass +class ForToken(object): pass +class InToken(object): pass + +class IdentifierToken(object): + def __init__(self, name): self.name = name + +class NumberToken(object): + def __init__(self, value): self.value = value + +class CharacterToken(object): + def __init__(self, char): self.char = char + def __eq__(self, other): + return isinstance(other, CharacterToken) and self.char == other.char + def __ne__(self, other): return not self == other + +# Regular expressions that tokens and comments of our language. +REGEX_NUMBER = re.compile('[0-9]+(?:\.[0-9]+)?') +REGEX_IDENTIFIER = re.compile('[a-zA-Z][a-zA-Z0-9]*') +REGEX_COMMENT = re.compile('#.*') + +def Tokenize(string): + while string: + # Skip whitespace. + if string[0].isspace(): + string = string[1:] + continue + + # Run regexes. + comment_match = REGEX_COMMENT.match(string) + number_match = REGEX_NUMBER.match(string) + identifier_match = REGEX_IDENTIFIER.match(string) + + # Check if any of the regexes matched and yield the appropriate result. + if comment_match: + comment = comment_match.group(0) + string = string[len(comment):] + elif number_match: + number = number_match.group(0) + yield NumberToken(float(number)) + string = string[len(number):] + elif identifier_match: + identifier = identifier_match.group(0) + # Check if we matched a keyword. + if identifier == 'def': + yield DefToken() + elif identifier == 'extern': + yield ExternToken() + elif identifier == 'if': + yield IfToken() + elif identifier == 'then': + yield ThenToken() + elif identifier == 'else': + yield ElseToken() + elif identifier == 'for': + yield ForToken() + elif identifier == 'in': + yield InToken() + else: + yield IdentifierToken(identifier) + string = string[len(identifier):] + else: + # Yield the ASCII value of the unknown character. 
+ yield CharacterToken(string[0]) + string = string[1:] + + yield EOFToken() + +################################################################################ +## Abstract Syntax Tree (aka Parse Tree) +################################################################################ + +# Base class for all expression nodes. +class ExpressionNode(object): + pass + +# Expression class for numeric literals like "1.0". +class NumberExpressionNode(ExpressionNode): + + def __init__(self, value): + self.value = value + + def CodeGen(self): + return Constant.real(Type.double(), self.value) + +# Expression class for referencing a variable, like "a". +class VariableExpressionNode(ExpressionNode): + + def __init__(self, name): + self.name = name + + def CodeGen(self): + if self.name in g_named_values: + return g_named_values[self.name] + else: + raise RuntimeError('Unknown variable name: ' + self.name) + +# Expression class for a binary operator. +class BinaryOperatorExpressionNode(ExpressionNode): + + def __init__(self, operator, left, right): + self.operator = operator + self.left = left + self.right = right + + def CodeGen(self): + left = self.left.CodeGen() + right = self.right.CodeGen() + + if self.operator == '+': + return g_llvm_builder.fadd(left, right, 'addtmp') + elif self.operator == '-': + return g_llvm_builder.fsub(left, right, 'subtmp') + elif self.operator == '*': + return g_llvm_builder.fmul(left, right, 'multmp') + elif self.operator == '<': + result = g_llvm_builder.fcmp(FCMP_ULT, left, right, 'cmptmp') + # Convert bool 0 or 1 to double 0.0 or 1.0. + return g_llvm_builder.uitofp(result, Type.double(), 'booltmp') + else: + raise RuntimeError('Unknown binary operator.') + +# Expression class for function calls. +class CallExpressionNode(ExpressionNode): + + def __init__(self, callee, args): + self.callee = callee + self.args = args + + def CodeGen(self): + # Look up the name in the global module table. 
+ callee = g_llvm_module.get_function_named(self.callee) + + # Check for argument mismatch error. + if len(callee.args) != len(self.args): + raise RuntimeError('Incorrect number of arguments passed.') + + arg_values = [i.CodeGen() for i in self.args] + + return g_llvm_builder.call(callee, arg_values, 'calltmp') + +# Expression class for if/then/else. +class IfExpressionNode(ExpressionNode): + + def __init__(self, condition, then_branch, else_branch): + self.condition = condition + self.then_branch = then_branch + self.else_branch = else_branch + + def CodeGen(self): + condition = self.condition.CodeGen() + + # Convert condition to a bool by comparing equal to 0.0. + condition_bool = g_llvm_builder.fcmp( + FCMP_ONE, condition, Constant.real(Type.double(), 0), 'ifcond') + + function = g_llvm_builder.basic_block.function + + # Create blocks for the then and else cases. Insert the 'then' block at the + # end of the function. + then_block = function.append_basic_block('then') + else_block = function.append_basic_block('else') + merge_block = function.append_basic_block('ifcond') + + g_llvm_builder.cbranch(condition_bool, then_block, else_block) + + # Emit then value. + g_llvm_builder.position_at_end(then_block) + then_value = self.then_branch.CodeGen() + g_llvm_builder.branch(merge_block) + + # Codegen of 'Then' can change the current block; update then_block for the + # PHI node. + then_block = g_llvm_builder.basic_block + + # Emit else block. + g_llvm_builder.position_at_end(else_block) + else_value = self.else_branch.CodeGen() + g_llvm_builder.branch(merge_block) + + # Codegen of 'Else' can change the current block, update else_block for the + # PHI node. + else_block = g_llvm_builder.basic_block + + # Emit merge block. + g_llvm_builder.position_at_end(merge_block) + phi = g_llvm_builder.phi(Type.double(), 'iftmp') + phi.add_incoming(then_value, then_block) + phi.add_incoming(else_value, else_block) + + return phi + +# Expression class for for/in. 
+class ForExpressionNode(ExpressionNode): + + def __init__(self, loop_variable, start, end, step, body): + self.loop_variable = loop_variable + self.start = start + self.end = end + self.step = step + self.body = body + + def CodeGen(self): + # Output this as: + # ... + # start = startexpr + # goto loop + # loop: + # variable = phi [start, loopheader], [nextvariable, loopend] + # ... + # bodyexpr + # ... + # loopend: + # step = stepexpr + # nextvariable = variable + step + # endcond = endexpr + # br endcond, loop, endloop + # outloop: + + # Emit the start code first, without 'variable' in scope. + start_value = self.start.CodeGen() + + # Make the new basic block for the loop header, inserting after current + # block. + function = g_llvm_builder.basic_block.function + pre_header_block = g_llvm_builder.basic_block + loop_block = function.append_basic_block('loop') + + # Insert an explicit fallthrough from the current block to the loop_block. + g_llvm_builder.branch(loop_block) + + # Start insertion in loop_block. + g_llvm_builder.position_at_end(loop_block) + + # Start the PHI node with an entry for start. + variable_phi = g_llvm_builder.phi(Type.double(), self.loop_variable) + variable_phi.add_incoming(start_value, pre_header_block) + + # Within the loop, the variable is defined equal to the PHI node. If it + # shadows an existing variable, we have to restore it, so save it now. + old_value = g_named_values.get(self.loop_variable, None) + g_named_values[self.loop_variable] = variable_phi + + # Emit the body of the loop. This, like any other expr, can change the + # current BB. Note that we ignore the value computed by the body. + self.body.CodeGen() + + # Emit the step value. + if self.step: + step_value = self.step.CodeGen() + else: + # If not specified, use 1.0. + step_value = Constant.real(Type.double(), 1) + + next_value = g_llvm_builder.fadd(variable_phi, step_value, 'next') + + # Compute the end condition and convert it to a bool by comparing to 0.0. 
+ end_condition = self.end.CodeGen() + end_condition_bool = g_llvm_builder.fcmp( + FCMP_ONE, end_condition, Constant.real(Type.double(), 0), 'loopcond') + + # Create the "after loop" block and insert it. + loop_end_block = g_llvm_builder.basic_block + after_block = function.append_basic_block('afterloop') + + # Insert the conditional branch into the end of loop_end_block. + g_llvm_builder.cbranch(end_condition_bool, loop_block, after_block) + + # Any new code will be inserted in after_block. + g_llvm_builder.position_at_end(after_block) + + # Add a new entry to the PHI node for the backedge. + variable_phi.add_incoming(next_value, loop_end_block) + + # Restore the unshadowed variable. + if old_value: + g_named_values[self.loop_variable] = old_value + else: + del g_named_values[self.loop_variable] + + # for expr always returns 0.0. + return Constant.real(Type.double(), 0) + +# This class represents the "prototype" for a function, which captures its name, +# and its argument names (thus implicitly the number of arguments the function +# takes). +class PrototypeNode(object): + + def __init__(self, name, args): + self.name = name + self.args = args + + def CodeGen(self): + # Make the function type, eg. double(double,double). + funct_type = Type.function( + Type.double(), [Type.double()] * len(self.args), False) + + function = Function.new(g_llvm_module, funct_type, self.name) + + # If the name conflicted, there was already something with the same name. + # If it has a body, don't allow redefinition or reextern. + if function.name != self.name: + function.delete() + function = g_llvm_module.get_function_named(self.name) + + # If the function already has a body, reject this. + if not function.is_declaration: + raise RuntimeError('Redefinition of function.') + + # If the function took a different number of args, reject. 
+ if len(function.args) != len(self.args): + raise RuntimeError('Redeclaration of a function with different number ' + 'of args.') + + # Set names for all arguments and add them to the variables symbol table. + for arg, arg_name in zip(function.args, self.args): + arg.name = arg_name + # Add arguments to variable symbol table. + g_named_values[arg_name] = arg + + return function + +# This class represents a function definition itself. +class FunctionNode(object): + + def __init__(self, prototype, body): + self.prototype = prototype + self.body = body + + def CodeGen(self): + # Clear scope. + g_named_values.clear() + + # Create a function object. + function = self.prototype.CodeGen() + + # Create a new basic block to start insertion into. + block = function.append_basic_block('entry') + global g_llvm_builder + g_llvm_builder = Builder.new(block) + + # Finish off the function. + try: + return_value = self.body.CodeGen() + g_llvm_builder.ret(return_value) + + # Validate the generated code, checking for consistency. + function.verify() + + # Optimize the function. + g_llvm_pass_manager.run(function) + except: + function.delete() + raise + + return function + + +################################################################################ +## Parser +################################################################################ + +class Parser(object): + + def __init__(self, tokens, binop_precedence): + self.tokens = tokens + self.binop_precedence = binop_precedence + self.Next() + + # Provide a simple token buffer. Parser.current is the current token the + # parser is looking at. Parser.Next() reads another token from the lexer and + # updates Parser.current with its results. + def Next(self): + self.current = self.tokens.next() + + # Gets the precedence of the current token, or -1 if the token is not a binary + # operator. 
+ def GetCurrentTokenPrecedence(self): + if isinstance(self.current, CharacterToken): + return self.binop_precedence.get(self.current.char, -1) + else: + return -1 + + # identifierexpr ::= identifier | identifier '(' expression* ')' + def ParseIdentifierExpr(self): + identifier_name = self.current.name + self.Next() # eat identifier. + + if self.current != CharacterToken('('): # Simple variable reference. + return VariableExpressionNode(identifier_name) + + # Call. + self.Next() # eat '('. + args = [] + if self.current != CharacterToken(')'): + while True: + args.append(self.ParseExpression()) + if self.current == CharacterToken(')'): + break + elif self.current != CharacterToken(','): + raise RuntimeError('Expected ")" or "," in argument list.') + self.Next() + + self.Next() # eat ')'. + return CallExpressionNode(identifier_name, args) + + # numberexpr ::= number + def ParseNumberExpr(self): + result = NumberExpressionNode(self.current.value) + self.Next() # consume the number. + return result + + # parenexpr ::= '(' expression ')' + def ParseParenExpr(self): + self.Next() # eat '('. + + contents = self.ParseExpression() + + if self.current != CharacterToken(')'): + raise RuntimeError('Expected ")".') + self.Next() # eat ')'. + + return contents + + # ifexpr ::= 'if' expression 'then' expression 'else' expression + def ParseIfExpr(self): + self.Next() # eat the if. + + # condition. + condition = self.ParseExpression() + + if not isinstance(self.current, ThenToken): + raise RuntimeError('Expected "then".') + self.Next() # eat the then. + + then_branch = self.ParseExpression() + + if not isinstance(self.current, ElseToken): + raise RuntimeError('Expected "else".') + self.Next() # eat the else. + + else_branch = self.ParseExpression() + + return IfExpressionNode(condition, then_branch, else_branch) + + # forexpr ::= 'for' identifier '=' expr ',' expr (',' expr)? 'in' expression + def ParseForExpr(self): + self.Next() # eat the for. 
+ + if not isinstance(self.current, IdentifierToken): + raise RuntimeError('Expected identifier after for.') + + loop_variable = self.current.name + self.Next() # eat the identifier. + + if self.current != CharacterToken('='): + raise RuntimeError('Expected "=" after for variable.') + self.Next() # eat the '='. + + start = self.ParseExpression() + + if self.current != CharacterToken(','): + raise RuntimeError('Expected "," after for start value.') + self.Next() # eat the ','. + + end = self.ParseExpression() + + # The step value is optional. + if self.current == CharacterToken(','): + self.Next() # eat the ','. + step = self.ParseExpression() + else: + step = None + + if not isinstance(self.current, InToken): + raise RuntimeError('Expected "in" after for variable specification.') + self.Next() # eat 'in'. + + body = self.ParseExpression() + + return ForExpressionNode(loop_variable, start, end, step, body) + + # primary ::= identifierexpr | numberexpr | parenexpr | ifexpr | forexpr + def ParsePrimary(self): + if isinstance(self.current, IdentifierToken): + return self.ParseIdentifierExpr() + elif isinstance(self.current, NumberToken): + return self.ParseNumberExpr() + elif isinstance(self.current, IfToken): + return self.ParseIfExpr() + elif isinstance(self.current, ForToken): + return self.ParseForExpr() + elif self.current == CharacterToken('('): + return self.ParseParenExpr() + else: + raise RuntimeError('Unknown token when expecting an expression.') + + # binoprhs ::= (operator primary)* + def ParseBinOpRHS(self, left, left_precedence): + # If this is a binary operator, find its precedence. + while True: + precedence = self.GetCurrentTokenPrecedence() + + # If this is a binary operator that binds at least as tightly as the + # current one, consume it; otherwise we are done. + if precedence < left_precedence: + return left + + binary_operator = self.current.char + self.Next() # eat the operator. + + # Parse the primary expression after the binary operator. 
+ right = self.ParsePrimary() + + # If binary_operator binds less tightly with right than the operator after + # right, let the pending operator take right as its left. + next_precedence = self.GetCurrentTokenPrecedence() + if precedence < next_precedence: + right = self.ParseBinOpRHS(right, precedence + 1) + + # Merge left/right. + left = BinaryOperatorExpressionNode(binary_operator, left, right) + + # expression ::= primary binoprhs + def ParseExpression(self): + left = self.ParsePrimary() + return self.ParseBinOpRHS(left, 0) + + # prototype ::= id '(' id* ')' + def ParsePrototype(self): + if not isinstance(self.current, IdentifierToken): + raise RuntimeError('Expected function name in prototype.') + + function_name = self.current.name + self.Next() # eat function name. + + if self.current != CharacterToken('('): + raise RuntimeError('Expected "(" in prototype.') + self.Next() # eat '('. + + arg_names = [] + while isinstance(self.current, IdentifierToken): + arg_names.append(self.current.name) + self.Next() + + if self.current != CharacterToken(')'): + raise RuntimeError('Expected ")" in prototype.') + + # Success. + self.Next() # eat ')'. + + return PrototypeNode(function_name, arg_names) + + # definition ::= 'def' prototype expression + def ParseDefinition(self): + self.Next() # eat def. + proto = self.ParsePrototype() + body = self.ParseExpression() + return FunctionNode(proto, body) + + # toplevelexpr ::= expression + def ParseTopLevelExpr(self): + proto = PrototypeNode('', []) + return FunctionNode(proto, self.ParseExpression()) + + # external ::= 'extern' prototype + def ParseExtern(self): + self.Next() # eat extern. 
+ return self.ParsePrototype() + + # Top-Level parsing + def HandleDefinition(self): + self.Handle(self.ParseDefinition, 'Read a function definition:') + + def HandleExtern(self): + self.Handle(self.ParseExtern, 'Read an extern:') + + def HandleTopLevelExpression(self): + try: + function = self.ParseTopLevelExpr().CodeGen() + result = g_llvm_executor.run_function(function, []) + print 'Evaluated to:', result.as_real(Type.double()) + except Exception, e: + print 'Error:', e + try: + self.Next() # Skip for error recovery. + except: + pass + + def Handle(self, function, message): + try: + print message, function().CodeGen() + except Exception, e: + print 'Error:', e + try: + self.Next() # Skip for error recovery. + except: + pass + +################################################################################ +## Main driver code. +################################################################################ + +def main(): + # Set up the optimizer pipeline. Start with registering info about how the + # target lays out data structures. + g_llvm_pass_manager.add(g_llvm_executor.target_data) + # Do simple "peephole" optimizations and bit-twiddling optzns. + g_llvm_pass_manager.add(PASS_INSTRUCTION_COMBINING) + # Reassociate expressions. + g_llvm_pass_manager.add(PASS_REASSOCIATE) + # Eliminate Common SubExpressions. + g_llvm_pass_manager.add(PASS_GVN) + # Simplify the control flow graph (deleting unreachable blocks, etc). + g_llvm_pass_manager.add(PASS_CFG_SIMPLIFICATION) + + g_llvm_pass_manager.initialize() + + # Install standard binary operators. + # 1 is lowest possible precedence. 40 is the highest. + operator_precedence = { + '<': 10, + '+': 20, + '-': 20, + '*': 40 + } + + # Run the main "interpreter loop". 
+ while True: + print 'ready>', + try: + raw = raw_input() + except KeyboardInterrupt: + break + + parser = Parser(Tokenize(raw), operator_precedence) + while True: + # top ::= definition | external | expression | EOF + if isinstance(parser.current, EOFToken): + break + if isinstance(parser.current, DefToken): + parser.HandleDefinition() + elif isinstance(parser.current, ExternToken): + parser.HandleExtern() + else: + parser.HandleTopLevelExpression() + + # Print out all of the generated code. + print '\n', g_llvm_module + +if __name__ == '__main__': + main() +{% endhighlight %} + +* * * + +**[Next: Extending the language: user-defined operators](PythonLangImpl6.html)** + diff --git a/docs/source/doc/kaleidoscope/PythonLangImpl6.md b/docs/source/doc/kaleidoscope/PythonLangImpl6.md new file mode 100644 index 0000000..74385cb --- /dev/null +++ b/docs/source/doc/kaleidoscope/PythonLangImpl6.md @@ -0,0 +1,1535 @@ +--- +layout: page +title: "Kaleidoscope: Chapter 6" +--- + +# Extending the Language: User-defined Operators + +Written by [Chris Lattner](mailto:sabre@nondot.org) +and [Max Shawabkeh](http://max99x.com) + + +**Chapter 6** + + +* This will become a table of contents (this text will be scraped). +{:toc} + + +**[Chapter 7: Extending the Language: Mutable +Variables / SSA Construction](PythonLangImpl7.html)** + + +# Introduction # {#intro} + + +Welcome to Chapter 6 of the +[Implementing a language with LLVM](http://www.llvm.org/docs/tutorial/index.html) tutorial. At this point in our tutorial, we now have a fully +functional language that is fairly minimal, but also useful. There +is still one big problem with it, however. Our language doesn't have many +useful operators (like division, logical negation, or even any comparisons +besides less-than). + +This chapter of the tutorial takes a wild digression into adding user-defined +operators to the simple and beautiful Kaleidoscope language. 
This digression now +gives us a simple and ugly language in some ways, but also a powerful one at the +same time. One of the great things about creating your own language is that you +get to decide what is good or bad. In this tutorial we'll assume that it is +okay to use this as a way to show some interesting parsing techniques. + +At the end of this tutorial, we'll run through an example Kaleidoscope +application that [renders the Mandelbrot set](#example). This gives +an example of what you can build with Kaleidoscope and its feature set. + +# User-defined Operators: the Idea # {#idea} + +The "operator overloading" that we will add to Kaleidoscope is more general than +that of languages like C++. In C++, you are only allowed to redefine existing +operators: you can't programmatically change the grammar, introduce new +operators, change precedence levels, etc. In this chapter, we will add this +capability to Kaleidoscope, which will let the user round out the set of +operators that are supported. + +The point of going into user-defined operators in a tutorial like this is to +show the power and flexibility of using a hand-written parser. Thus far, the +parser we have been implementing uses recursive descent for most parts of the +grammar and operator precedence parsing for the expressions. +See [Chapter 2](PythonLangImpl2.html) for details. Without using operator +precedence parsing, it would be very difficult to allow the programmer to +introduce new operators into the grammar: the grammar is dynamically extensible +as the JIT runs. + +The two specific features we'll add are programmable unary operators (right +now, Kaleidoscope has no unary operators at all) as well as binary operators. +An example of this is: + + +{% highlight python %} +# Logical unary not. +def unary!(v) + if v then + 0 + else + 1 + +# Define > with the same precedence as <. +def binary> 10 (LHS RHS) + RHS < LHS + +# Binary "logical or", (note that it does not "short circuit").
+def binary| 5 (LHS RHS) + if LHS then + 1 + else if RHS then + 1 + else + 0 + +# Define = with slightly lower precedence than relationals. +def binary= 9 (LHS RHS) + !(LHS < RHS | LHS > RHS) +{% endhighlight %} + + +Many languages aspire to being able to implement their standard runtime +library in the language itself. In Kaleidoscope, we can implement significant +parts of the language in the library! + +We will break down implementation of these features into two parts: +implementing support for user-defined binary operators and adding unary +operators. + +* * * + +# User-defined Binary Operators # {#binary} + +Adding support for user-defined binary operators is pretty simple with our +current framework. We'll first add support for the unary/binary keywords: + + +{% highlight python %} +class InToken(object): pass +class BinaryToken(object): pass +class UnaryToken(object): pass +... +def Tokenize(string): + ... + elif identifier == 'in': + yield InToken() + elif identifier == 'binary': + yield BinaryToken() + elif identifier == 'unary': + yield UnaryToken() + else: + yield IdentifierToken(identifier) +{% endhighlight %} + + +This just adds lexer support for the unary and binary keywords, like we +did in [previous chapters](PythonLangImpl5.html#iflexer). One nice +thing about our current AST, is that we represent binary operators with full +generalisation by using their ASCII code as the opcode. For our extended +operators, we'll use this same representation, so we don't need any new AST or +parser support. + +On the other hand, we have to be able to represent the definitions of these +new operators, in the "def binary| 5" part of the function definition. In our +grammar so far, the "name" for the function definition is parsed as the +"prototype" production and into the `PrototypeNode`. 
To represent our +new user-defined operators as prototypes, we have to extend the +`PrototypeNode` like this: + + +{% highlight python %} +# This class represents the "prototype" for a function, which captures its name, +# and its argument names (thus implicitly the number of arguments the function +# takes), as well as if it is an operator. +class PrototypeNode(object): + + def __init__(self, name, args, is_operator=False, precedence=0): + self.name = name + self.args = args + self.is_operator = is_operator + self.precedence = precedence + + def IsBinaryOp(self): + return self.is_operator and len(self.args) == 2 + + def GetOperatorName(self): + assert self.is_operator + return self.name[-1] + + def CodeGen(self): + ... +{% endhighlight %} + + +Basically, in addition to knowing a name for the prototype, we now keep track +of whether it was an operator, and if it was, what precedence level the operator +is at. The precedence is only used for binary operators (as you'll see below, +it just doesn't apply for unary operators). Now that we have a way to represent +the prototype for a user-defined operator, we need to parse it: + + +{% highlight python %} + # prototype + # ::= id '(' id* ')' + # ::= binary LETTER number? (id, id) + # ::= unary LETTER (id) + def ParsePrototype(self): + precedence = None + if isinstance(self.current, IdentifierToken): + kind = 'normal' + function_name = self.current.name + self.Next() # eat function name. + elif isinstance(self.current, BinaryToken): + kind = 'binary' + self.Next() # eat 'binary'. + if not isinstance(self.current, CharacterToken): + raise RuntimeError('Expected an operator after "binary".') + function_name = 'binary' + self.current.char + self.Next() # eat the operator. + if isinstance(self.current, NumberToken): + if not 1 <= self.current.value <= 100: + raise RuntimeError('Invalid precedence: must be in range [1, 100].') + precedence = self.current.value + self.Next() # eat the precedence. 
+ else: + raise RuntimeError('Expected function name, "unary" or "binary" in ' + 'prototype.') + + if self.current != CharacterToken('('): + raise RuntimeError('Expected "(" in prototype.') + self.Next() # eat '('. + + arg_names = [] + while isinstance(self.current, IdentifierToken): + arg_names.append(self.current.name) + self.Next() + + if self.current != CharacterToken(')'): + raise RuntimeError('Expected ")" in prototype.') + + # Success. + self.Next() # eat ')'. + + if kind == 'binary' and len(arg_names) != 2: + raise RuntimeError('Invalid number of arguments for a binary operator.') + + return PrototypeNode(function_name, arg_names, kind != 'normal', precedence) + +{% endhighlight %} + + +This is all fairly straightforward parsing code, and we have already seen +a lot of similar code in the past. One interesting part about the code above is +the couple lines that set up `function_name` for operators. This builds +names like "binary@" for a newly defined "@" operator. This then takes +advantage of the fact that symbol names in the LLVM symbol table are allowed to +have any character in them. + +The next interesting thing to add, is codegen support for these binary +operators. Given our current structure, this is a simple addition of a default +case for our existing binary operator node: + + +{% highlight python %} + def CodeGen(self): + left = self.left.CodeGen() + right = self.right.CodeGen() + + if self.operator == '+': + return g_llvm_builder.fadd(left, right, 'addtmp') + elif self.operator == '-': + return g_llvm_builder.fsub(left, right, 'subtmp') + elif self.operator == '*': + return g_llvm_builder.fmul(left, right, 'multmp') + elif self.operator == '<': + result = g_llvm_builder.fcmp(FCMP_ULT, left, right, 'cmptmp') + # Convert bool 0 or 1 to double 0.0 or 1.0. 
+ return g_llvm_builder.uitofp(result, Type.double(), 'booltmp') + else: + function = g_llvm_module.get_function_named('binary' + self.operator) + return g_llvm_builder.call(function, [left, right], 'binop') +{% endhighlight %} + + +As you can see above, the new code is actually really simple. It just does +a lookup for the appropriate operator in the symbol table and generates a +function call to it. Since user-defined operators are just built as normal +functions (because the "prototype" boils down to a function with the right +name) everything falls into place. + +The final piece of code we are missing is a bit of top-level magic. We will +need to make the binary precedence map global and modify it whenever we define a +new binary operator: + + +{% highlight python %} +# The binary operator precedence chart. +g_binop_precedence = {} +... +class FunctionNode(object): + ... + def CodeGen(self): + ... + # Create a function object. + function = self.prototype.CodeGen() + + # If this is a binary operator, install its precedence. + if self.prototype.IsBinaryOp(): + operator = self.prototype.GetOperatorName() + g_binop_precedence[operator] = self.prototype.precedence + ... + # Finish off the function. + try: + ... + except: + function.delete() + if self.prototype.IsBinaryOp(): + del g_binop_precedence[self.prototype.GetOperatorName()] + raise + + return function +... +def main(): + ... + g_binop_precedence['<'] = 10 + g_binop_precedence['+'] = 20 + g_binop_precedence['-'] = 20 + g_binop_precedence['*'] = 40 + ... +{% endhighlight %} + + +Basically, before CodeGening a function, if it is a user-defined operator, we +register it in the precedence table. This allows the binary operator parsing +logic we already have in place to handle it. Since we are working on a +fully-general operator precedence parser, this is all we need to do to "extend +the grammar". + +Now we have useful user-defined binary operators.
This builds a lot +on the previous framework we built for other operators. Adding unary operators +is a bit more challenging, because we don't have any framework for it yet - +let's see what it takes. + +# User-defined Unary Operators # {#unary} + +Since we don't currently support unary operators in the Kaleidoscope +language, we'll need to add everything to support them. Above, we added simple +support for the 'unary' keyword to the lexer. In addition to that, we need an +AST node: + + +{% highlight python %} +# Expression class for a unary operator. +class UnaryExpressionNode(ExpressionNode): + + def __init__(self, operator, operand): + self.operator = operator + self.operand = operand + + def CodeGen(self): + ... +{% endhighlight %} + + +This AST node is very simple and obvious by now. It directly mirrors the +binary operator AST node, except that it only has one child. With this, we +need to add the parsing logic. Parsing a unary operator is pretty simple: we'll +add a new function to do it: + + +{% highlight python %} + # unary ::= primary | unary_operator unary + def ParseUnary(self): + # If the current token is not an operator, it must be a primary expression. + if (not isinstance(self.current, CharacterToken) or + self.current in [CharacterToken('('), CharacterToken(',')]): + return self.ParsePrimary() + + # If this is a unary operator, read it. + operator = self.current.char + self.Next() # eat the operator. + return UnaryExpressionNode(operator, self.ParseUnary()) +{% endhighlight %} + + +The grammar we add is pretty straightforward here. If we see a unary +operator when parsing a primary operator, we eat the operator as a prefix and +parse the remaining piece as another unary operator. This allows us to handle +multiple unary operators (e.g. `!!x`). Note that unary operators can't have +ambiguous parses like binary operators can, so there is no need for precedence +information. 
+ +The problem with this function, is that we need to call ParseUnary from +somewhere. To do this, we change previous callers of ParsePrimary to call +ParseUnary instead: + + +{% highlight python %} + # binoprhs ::= (binary_operator unary)* + def ParseBinOpRHS(self, left, left_precedence): + ... + # Parse the unary expression after the binary operator. + right = self.ParseUnary() + ... + + # expression ::= unary binoprhs + def ParseExpression(self): + left = self.ParseUnary() + return self.ParseBinOpRHS(left, 0) +{% endhighlight %} + + +With these two simple changes, we are now able to parse unary operators and +build the AST for them. Next up, we need to add parser support for prototypes, +to parse the unary operator prototype. We extend the binary operator code above +with: + + +{% highlight python %} + # prototype + # ::= id '(' id* ')' + # ::= binary LETTER number? (id, id) + # ::= unary LETTER (id) + def ParsePrototype(self): + precedence = None + if isinstance(self.current, IdentifierToken): + ... + elif isinstance(self.current, UnaryToken): + kind = 'unary' + self.Next() # eat 'unary'. + if not isinstance(self.current, CharacterToken): + raise RuntimeError('Expected an operator after "unary".') + function_name = 'unary' + self.current.char + self.Next() # eat the operator. + elif isinstance(self.current, BinaryToken): + ... + else: + raise RuntimeError('Expected function name, "unary" or "binary" in ' + 'prototype.') + ... + if kind == 'unary' and len(arg_names) != 1: + raise RuntimeError('Invalid number of arguments for a unary operator.') + elif kind == 'binary' and len(arg_names) != 2: + raise RuntimeError('Invalid number of arguments for a binary operator.') + + return PrototypeNode(function_name, arg_names, kind != 'normal', precedence) +{% endhighlight %} + + +As with binary operators, we name unary operators with a name that includes +the operator character. This assists us at code generation time. 
Speaking of, +the final piece we need to add is codegen support for unary operators. It looks +like this: + + +{% highlight python %} +class UnaryExpressionNode(ExpressionNode): + ... + def CodeGen(self): + operand = self.operand.CodeGen() + function = g_llvm_module.get_function_named('unary' + self.operator) + return g_llvm_builder.call(function, [operand], 'unop') +{% endhighlight %} + + +This code is similar to, but simpler than, the code for binary operators. It +is simpler primarily because it doesn't need to handle any predefined operators. + +* * * + +# Kicking the Tires # {#example} + +It is somewhat hard to believe, but with a few simple extensions we've +covered in the last chapters, we have grown a real-ish language. With this, we +can do a lot of interesting things, including I/O, math, and a bunch of other +things. For example, we can now add a nice sequencing operator (assuming we +import `putchard` as described in +Chapter +4): + + +{% highlight python %} +ready> def binary : 1 (x y) 0 # Low-precedence operator that ignores operands. +... +ready> extern putchard(x) +... +ready> def printd(x) putchard(x) : putchard(10) +.. +ready> printd(65) : printd(66) : printd(67) +A +B +C +Evaluated to: 0.0 +{% endhighlight %} + + +We can also define a bunch of other "primitive" operations, such as: + + +{% highlight python %} +# Logical unary not. +def unary!(v) + if v then + 0 + else + 1 + +# Unary negate. +def unary-(v) + 0-v + +# Define > with the same precedence as <. +def binary> 10 (LHS RHS) + RHS < LHS + +# Binary logical or, which does not short circuit. +def binary| 5 (LHS RHS) + if LHS then + 1 + else if RHS then + 1 + else + 0 + +# Binary logical and, which does not short circuit. +def binary& 6 (LHS RHS) + if !LHS then + 0 + else + !!RHS + +# Define = with slightly lower precedence than relationals. 
+def binary = 9 (LHS RHS) + !(LHS < RHS | LHS > RHS) + +{% endhighlight %} + + + +Given the previous if/then/else support, we can also define interesting +functions for I/O. For example, the following prints out a character whose +"density" reflects the value passed in: the lower the value, the denser the +character: + + +{% highlight python %} +ready> + +extern putchard(char) +def printdensity(d) + if d > 8 then + putchard(32) # ' ' + else if d > 4 then + putchard(46) # '.' + else if d > 2 then + putchard(43) # '+' + else + putchard(42); # '*' +... +ready> printdensity(1): printdensity(2): printdensity(3) : + printdensity(4): printdensity(5): printdensity(9): putchard(10) +*++.. +Evaluated to 0.000000 +{% endhighlight %} + + +Based on these simple primitive operations, we can start to define more +interesting things. For example, here's a little function that solves for the +number of iterations it takes a function in the complex plane to +converge: + + +{% highlight python %} +# determine whether the specific location diverges. +# Solve for z = z^2 + c in the complex plane. +def mandelconverger(real imag iters creal cimag) + if iters > 255 | (real*real + imag*imag > 4) then + iters + else + mandelconverger(real*real - imag*imag + creal, + 2*real*imag + cimag, + iters+1, creal, cimag) + +# return the number of iterations required for the iteration to escape +def mandelconverge(real imag) + mandelconverger(real, imag, 0, real, imag) +{% endhighlight %} + + +This "z = z2 + c" function is a beautiful little creature that is +the basis for computation of the +[Mandelbrot Set](http://en.wikipedia.org/wiki/Mandelbrot_set). Our +`mandelconverge` function returns the number of iterations that it takes +for a complex orbit to escape, saturating to 255. This is not a very useful +function by itself, but if you plot its value over a two-dimensional plane, +you can see the Mandelbrot set. 
Given that we are limited to using putchard +here, our amazing graphical output is limited, but we can whip together +something using the density plotter above: + + +{% highlight python %} +# compute and plot the mandelbrot set with the specified 2-dimensional range +# info. +def mandelhelp(xmin xmax xstep ymin ymax ystep) + for y = ymin, y < ymax, ystep in ( + (for x = xmin, x < xmax, xstep in + printdensity(mandelconverge(x,y))) + : putchard(10) + ) + +# mandel - This is a convenient helper function for plotting the mandelbrot set +# from the specified position with the specified magnification. +def mandel(realstart imagstart realmag imagmag) + mandelhelp(realstart, realstart+realmag*78, realmag, + imagstart, imagstart+imagmag*40, imagmag); +{% endhighlight %} + + +Given this, we can try plotting out the Mandelbrot set! Let's try it out: + + +{% highlight bash %} +ready> mandel(-2.3, -1.3, 0.05, 0.07) +******************************************************************************* +******************************************************************************* +****************************************++++++********************************* +************************************+++++...++++++***************************** +*********************************++++++++.. ...+++++*************************** +*******************************++++++++++.. ..+++++************************** +******************************++++++++++. ..++++++************************* +****************************+++++++++.... ..++++++************************ +**************************++++++++....... .....++++*********************** +*************************++++++++. . ... .++********************** +***********************++++++++... ++********************** +*********************+++++++++.... .+++********************* +******************+++..+++++.... ..+++******************** +**************++++++. .......... +++******************** +***********++++++++.. ..
.++******************** +*********++++++++++... .++++******************* +********++++++++++.. .++++******************* +*******++++++..... ..++++******************* +*******+........ ...++++******************* +*******+... .... ...++++******************* +*******+++++...... ..++++******************* +*******++++++++++... .++++******************* +*********++++++++++... ++++******************* +**********+++++++++.. .. ..++******************** +*************++++++.. .......... +++******************** +******************+++...+++..... ..+++******************** +*********************+++++++++.... ..++********************* +***********************++++++++... +++********************* +*************************+++++++.. . ... .++********************** +**************************++++++++....... ......+++*********************** +****************************+++++++++.... ..++++++************************ +*****************************++++++++++.. ..++++++************************* +*******************************++++++++++.. ...+++++************************** +*********************************++++++++.. 
...+++++*************************** +***********************************++++++....+++++***************************** +***************************************++++++++******************************** +******************************************************************************* +******************************************************************************* +******************************************************************************* +******************************************************************************* +******************************************************************************* +Evaluated to 0.0 +ready> mandel(-2, -1, 0.02, 0.04) +******************************************************************+++++++++++++ +****************************************************************+++++++++++++++ +*************************************************************++++++++++++++++++ +***********************************************************++++++++++++++++++++ +********************************************************+++++++++++++++++++++++ +******************************************************++++++++++++++++++++++... +***************************************************+++++++++++++++++++++....... +*************************************************++++++++++++++++++++.......... +***********************************************+++++++++++++++++++... ... +********************************************++++++++++++++++++++...... +******************************************++++++++++++++++++++....... +***************************************+++++++++++++++++++++.......... +************************************++++++++++++++++++++++........... +********************************++++++++++++++++++++++++......... +***************************++++++++...........+++++.............. +*********************++++++++++++.... ......................... +***************+++++++++++++++++.... ......... ............ +***********+++++++++++++++++++++..... ...... 
+********+++++++++++++++++++++++....... +******+++++++++++++++++++++++++........ +****+++++++++++++++++++++++++....... +***+++++++++++++++++++++++......... +**++++++++++++++++........... +*++++++++++++................ +*++++.................... + +*++++.................... +*++++++++++++................ +**++++++++++++++++........... +***+++++++++++++++++++++++......... +****+++++++++++++++++++++++++....... +******+++++++++++++++++++++++++........ +********+++++++++++++++++++++++....... +***********+++++++++++++++++++++..... ...... +***************+++++++++++++++++.... ......... ............ +*********************++++++++++++.... ......................... +***************************++++++++...........+++++.............. +********************************++++++++++++++++++++++++......... +************************************++++++++++++++++++++++........... +***************************************+++++++++++++++++++++.......... +******************************************++++++++++++++++++++....... +Evaluated to: 0.0 +ready> mandel(-0.9, -1.4, 0.02, 0.03) +******************************************************************************* +******************************************************************************* +******************************************************************************* +******************************************************************************* +******************************************************************************* +******************************************************************************* +******************************************************************************* +******************************************************************************* +****************************+++++++++++++++++********************************** +***********************+++++++++++...++++++++++++****************************** +********************+++++++++++++.. . .++++++++++++++************************** +*****************++++++++++++++++... 
......++++++++++++************************ +**************+++++++++++++++++++... .......+++++++++++********************** +************++++++++++++++++++++.... .... ..++++++++++++******************** +**********++++++++++++++++++++++...... ...++++++++++++******************* +********+++++++++++++++++++++++....... .....++++++++++++++***************** +******++++++++++++++++++++++++....... .....+++++++++++++++**************** +****+++++++++++++++++++++++++.... . .....+++++++++++++++*************** +**+++++++++++++++++++++++++.... ...++++++++++++++++************* +*+++++++++++++++++++++++....... ....++++++++++++++++************ ++++++++++++++++++++++.......... .....++++++++++++++++*********** +++++++++++++++++++............. .......+++++++++++++++********** ++++++++++++++++................ ............++++++++++********** ++++++++++++++................. .................+++++********* ++++++++++++... .... .......... .+++++******** +++++++++++..... ........ ...+++++******* +++++++++...... ..++++++****** ++++++++........ ..+++++****** ++++++.......... ..++++++***** +++++.......... ....++++++***** +++.......... ....+++++++**** +.......... ......+++++++*** +.......... .....+++++++*** +.......... .....++++++*** +......... .+++++++** +........ .+++++++** + ...... ...+++++++* + . ....++++++++* + ...++++++++* + ..+++++++++ + ..+++++++++ +Evaluated to: 0.0 +ready> ^C +{% endhighlight %} + + +At this point, you may be starting to realize that Kaleidoscope is a real +and powerful language. It may not be self-similar :), but it can be used to +plot things that are! + +With this, we conclude the "adding user-defined operators" chapter of the +tutorial. We have successfully augmented our language, adding the ability to extend the +language in the library, and we have shown how this can be used to build a simple but +interesting end-user application in Kaleidoscope. 
At this point, Kaleidoscope +can build a variety of applications that are functional and can call functions +with side-effects, but it can't actually define and mutate a variable itself. + + +Strikingly, variable mutation is an important feature of some +languages, and it is not at all obvious how to +[add support for mutable variables](PythonLangImpl7.html) without +having to add an "SSA construction" phase to your front-end. In the next +chapter, we will describe how you can add variable mutation without building SSA +in your front-end. + +* * * + +# Full Code Listing # {#code} + +Here is the complete code listing for our running example, enhanced with +support for user-defined operators: + + +{% highlight python %} +#!/usr/bin/env python + +import re +from llvm.core import Module, Constant, Type, Function, Builder +from llvm.ee import ExecutionEngine, TargetData +from llvm.passes import FunctionPassManager + +from llvm.core import FCMP_ULT, FCMP_ONE +from llvm.passes import (PASS_INSTRUCTION_COMBINING, + PASS_REASSOCIATE, + PASS_GVN, + PASS_CFG_SIMPLIFICATION) + +################################################################################ +## Globals +################################################################################ + +# The LLVM module, which holds all the IR code. +g_llvm_module = Module.new('my cool jit') + +# The LLVM instruction builder. Created whenever a new function is entered. +g_llvm_builder = None + +# A dictionary that keeps track of which values are defined in the current scope +# and what their LLVM representation is. +g_named_values = {} + +# The function optimization passes manager. +g_llvm_pass_manager = FunctionPassManager.new(g_llvm_module) + +# The LLVM execution engine. +g_llvm_executor = ExecutionEngine.new(g_llvm_module) + +# The binary operator precedence chart.
+g_binop_precedence = {} + +################################################################################ +## Lexer +################################################################################ + +# The lexer yields one of these types for each token. +class EOFToken(object): pass +class DefToken(object): pass +class ExternToken(object): pass +class IfToken(object): pass +class ThenToken(object): pass +class ElseToken(object): pass +class ForToken(object): pass +class InToken(object): pass +class BinaryToken(object): pass +class UnaryToken(object): pass + +class IdentifierToken(object): + def __init__(self, name): self.name = name + +class NumberToken(object): + def __init__(self, value): self.value = value + +class CharacterToken(object): + def __init__(self, char): self.char = char + def __eq__(self, other): + return isinstance(other, CharacterToken) and self.char == other.char + def __ne__(self, other): return not self == other + +# Regular expressions that match tokens and comments of our language. +REGEX_NUMBER = re.compile('[0-9]+(?:\.[0-9]+)?') +REGEX_IDENTIFIER = re.compile('[a-zA-Z][a-zA-Z0-9]*') +REGEX_COMMENT = re.compile('#.*') + +def Tokenize(string): + while string: + # Skip whitespace. + if string[0].isspace(): + string = string[1:] + continue + + # Run regexes. + comment_match = REGEX_COMMENT.match(string) + number_match = REGEX_NUMBER.match(string) + identifier_match = REGEX_IDENTIFIER.match(string) + + # Check if any of the regexes matched and yield the appropriate result. + if comment_match: + comment = comment_match.group(0) + string = string[len(comment):] + elif number_match: + number = number_match.group(0) + yield NumberToken(float(number)) + string = string[len(number):] + elif identifier_match: + identifier = identifier_match.group(0) + # Check if we matched a keyword.
+ if identifier == 'def': + yield DefToken() + elif identifier == 'extern': + yield ExternToken() + elif identifier == 'if': + yield IfToken() + elif identifier == 'then': + yield ThenToken() + elif identifier == 'else': + yield ElseToken() + elif identifier == 'for': + yield ForToken() + elif identifier == 'in': + yield InToken() + elif identifier == 'binary': + yield BinaryToken() + elif identifier == 'unary': + yield UnaryToken() + else: + yield IdentifierToken(identifier) + string = string[len(identifier):] + else: + # Yield the ASCII value of the unknown character. + yield CharacterToken(string[0]) + string = string[1:] + + yield EOFToken() + +################################################################################ +## Abstract Syntax Tree (aka Parse Tree) +################################################################################ + +# Base class for all expression nodes. +class ExpressionNode(object): + pass + +# Expression class for numeric literals like "1.0". +class NumberExpressionNode(ExpressionNode): + + def __init__(self, value): + self.value = value + + def CodeGen(self): + return Constant.real(Type.double(), self.value) + +# Expression class for referencing a variable, like "a". +class VariableExpressionNode(ExpressionNode): + + def __init__(self, name): + self.name = name + + def CodeGen(self): + if self.name in g_named_values: + return g_named_values[self.name] + else: + raise RuntimeError('Unknown variable name: ' + self.name) + +# Expression class for a binary operator. 
+class BinaryOperatorExpressionNode(ExpressionNode): + + def __init__(self, operator, left, right): + self.operator = operator + self.left = left + self.right = right + + def CodeGen(self): + left = self.left.CodeGen() + right = self.right.CodeGen() + + if self.operator == '+': + return g_llvm_builder.fadd(left, right, 'addtmp') + elif self.operator == '-': + return g_llvm_builder.fsub(left, right, 'subtmp') + elif self.operator == '*': + return g_llvm_builder.fmul(left, right, 'multmp') + elif self.operator == '<': + result = g_llvm_builder.fcmp(FCMP_ULT, left, right, 'cmptmp') + # Convert bool 0 or 1 to double 0.0 or 1.0. + return g_llvm_builder.uitofp(result, Type.double(), 'booltmp') + else: + function = g_llvm_module.get_function_named('binary' + self.operator) + return g_llvm_builder.call(function, [left, right], 'binop') + +# Expression class for function calls. +class CallExpressionNode(ExpressionNode): + + def __init__(self, callee, args): + self.callee = callee + self.args = args + + def CodeGen(self): + # Look up the name in the global module table. + callee = g_llvm_module.get_function_named(self.callee) + + # Check for argument mismatch error. + if len(callee.args) != len(self.args): + raise RuntimeError('Incorrect number of arguments passed.') + + arg_values = [i.CodeGen() for i in self.args] + + return g_llvm_builder.call(callee, arg_values, 'calltmp') + +# Expression class for if/then/else. +class IfExpressionNode(ExpressionNode): + + def __init__(self, condition, then_branch, else_branch): + self.condition = condition + self.then_branch = then_branch + self.else_branch = else_branch + + def CodeGen(self): + condition = self.condition.CodeGen() + + # Convert condition to a bool by comparing equal to 0.0. + condition_bool = g_llvm_builder.fcmp( + FCMP_ONE, condition, Constant.real(Type.double(), 0), 'ifcond') + + function = g_llvm_builder.basic_block.function + + # Create blocks for the then and else cases. 
Insert the 'then' block at the + # end of the function. + then_block = function.append_basic_block('then') + else_block = function.append_basic_block('else') + merge_block = function.append_basic_block('ifcond') + + g_llvm_builder.cbranch(condition_bool, then_block, else_block) + + # Emit then value. + g_llvm_builder.position_at_end(then_block) + then_value = self.then_branch.CodeGen() + g_llvm_builder.branch(merge_block) + + # Codegen of 'Then' can change the current block; update then_block for the + # PHI node. + then_block = g_llvm_builder.basic_block + + # Emit else block. + g_llvm_builder.position_at_end(else_block) + else_value = self.else_branch.CodeGen() + g_llvm_builder.branch(merge_block) + + # Codegen of 'Else' can change the current block, update else_block for the + # PHI node. + else_block = g_llvm_builder.basic_block + + # Emit merge block. + g_llvm_builder.position_at_end(merge_block) + phi = g_llvm_builder.phi(Type.double(), 'iftmp') + phi.add_incoming(then_value, then_block) + phi.add_incoming(else_value, else_block) + + return phi + +# Expression class for for/in. +class ForExpressionNode(ExpressionNode): + + def __init__(self, loop_variable, start, end, step, body): + self.loop_variable = loop_variable + self.start = start + self.end = end + self.step = step + self.body = body + + def CodeGen(self): + # Output this as: + # ... + # start = startexpr + # goto loop + # loop: + # variable = phi [start, loopheader], [nextvariable, loopend] + # ... + # bodyexpr + # ... + # loopend: + # step = stepexpr + # nextvariable = variable + step + # endcond = endexpr + # br endcond, loop, endloop + # outloop: + + # Emit the start code first, without 'variable' in scope. + start_value = self.start.CodeGen() + + # Make the new basic block for the loop header, inserting after current + # block. 
+ function = g_llvm_builder.basic_block.function + pre_header_block = g_llvm_builder.basic_block + loop_block = function.append_basic_block('loop') + + # Insert an explicit fallthrough from the current block to the loop_block. + g_llvm_builder.branch(loop_block) + + # Start insertion in loop_block. + g_llvm_builder.position_at_end(loop_block) + + # Start the PHI node with an entry for start. + variable_phi = g_llvm_builder.phi(Type.double(), self.loop_variable) + variable_phi.add_incoming(start_value, pre_header_block) + + # Within the loop, the variable is defined equal to the PHI node. If it + # shadows an existing variable, we have to restore it, so save it now. + old_value = g_named_values.get(self.loop_variable, None) + g_named_values[self.loop_variable] = variable_phi + + # Emit the body of the loop. This, like any other expr, can change the + # current BB. Note that we ignore the value computed by the body. + self.body.CodeGen() + + # Emit the step value. + if self.step: + step_value = self.step.CodeGen() + else: + # If not specified, use 1.0. + step_value = Constant.real(Type.double(), 1) + + next_value = g_llvm_builder.fadd(variable_phi, step_value, 'next') + + # Compute the end condition and convert it to a bool by comparing to 0.0. + end_condition = self.end.CodeGen() + end_condition_bool = g_llvm_builder.fcmp( + FCMP_ONE, end_condition, Constant.real(Type.double(), 0), 'loopcond') + + # Create the "after loop" block and insert it. + loop_end_block = g_llvm_builder.basic_block + after_block = function.append_basic_block('afterloop') + + # Insert the conditional branch into the end of loop_end_block. + g_llvm_builder.cbranch(end_condition_bool, loop_block, after_block) + + # Any new code will be inserted in after_block. + g_llvm_builder.position_at_end(after_block) + + # Add a new entry to the PHI node for the backedge. + variable_phi.add_incoming(next_value, loop_end_block) + + # Restore the unshadowed variable. 
+ if old_value: + g_named_values[self.loop_variable] = old_value + else: + del g_named_values[self.loop_variable] + + # for expr always returns 0.0. + return Constant.real(Type.double(), 0) + +# Expression class for a unary operator. +class UnaryExpressionNode(ExpressionNode): + + def __init__(self, operator, operand): + self.operator = operator + self.operand = operand + + def CodeGen(self): + operand = self.operand.CodeGen() + function = g_llvm_module.get_function_named('unary' + self.operator) + return g_llvm_builder.call(function, [operand], 'unop') + +# This class represents the "prototype" for a function, which captures its name, +# and its argument names (thus implicitly the number of arguments the function +# takes), as well as if it is an operator. +class PrototypeNode(object): + + def __init__(self, name, args, is_operator=False, precedence=0): + self.name = name + self.args = args + self.is_operator = is_operator + self.precedence = precedence + + def IsBinaryOp(self): + return self.is_operator and len(self.args) == 2 + + def GetOperatorName(self): + assert self.is_operator + return self.name[-1] + + def CodeGen(self): + # Make the function type, eg. double(double,double). + funct_type = Type.function( + Type.double(), [Type.double()] * len(self.args), False) + + function = Function.new(g_llvm_module, funct_type, self.name) + + # If the name conflicted, there was already something with the same name. + # If it has a body, don't allow redefinition or reextern. + if function.name != self.name: + function.delete() + function = g_llvm_module.get_function_named(self.name) + + # If the function already has a body, reject this. + if not function.is_declaration: + raise RuntimeError('Redefinition of function.') + + # If the function took a different number of args, reject. 
+ if len(function.args) != len(self.args): + raise RuntimeError('Redeclaration of a function with different number ' + 'of args.') + + # Set names for all arguments and add them to the variables symbol table. + for arg, arg_name in zip(function.args, self.args): + arg.name = arg_name + # Add arguments to variable symbol table. + g_named_values[arg_name] = arg + + return function + +# This class represents a function definition itself. +class FunctionNode(object): + + def __init__(self, prototype, body): + self.prototype = prototype + self.body = body + + def CodeGen(self): + # Clear scope. + g_named_values.clear() + + # Create a function object. + function = self.prototype.CodeGen() + + # If this is a binary operator, install its precedence. + if self.prototype.IsBinaryOp(): + operator = self.prototype.GetOperatorName() + g_binop_precedence[operator] = self.prototype.precedence + + # Create a new basic block to start insertion into. + block = function.append_basic_block('entry') + global g_llvm_builder + g_llvm_builder = Builder.new(block) + + # Finish off the function. + try: + return_value = self.body.CodeGen() + g_llvm_builder.ret(return_value) + + # Validate the generated code, checking for consistency. + function.verify() + + # Optimize the function. + g_llvm_pass_manager.run(function) + except: + function.delete() + if self.prototype.IsBinaryOp(): + del g_binop_precedence[self.prototype.GetOperatorName()] + raise + + return function + + +################################################################################ +## Parser +################################################################################ + +class Parser(object): + + def __init__(self, tokens): + self.tokens = tokens + self.Next() + + # Provide a simple token buffer. Parser.current is the current token the + # parser is looking at. Parser.Next() reads another token from the lexer and + # updates Parser.current with its results. 
+ def Next(self): + self.current = self.tokens.next() + + # Gets the precedence of the current token, or -1 if the token is not a binary + # operator. + def GetCurrentTokenPrecedence(self): + if isinstance(self.current, CharacterToken): + return g_binop_precedence.get(self.current.char, -1) + else: + return -1 + + # identifierexpr ::= identifier | identifier '(' expression* ')' + def ParseIdentifierExpr(self): + identifier_name = self.current.name + self.Next() # eat identifier. + + if self.current != CharacterToken('('): # Simple variable reference. + return VariableExpressionNode(identifier_name) + + # Call. + self.Next() # eat '('. + args = [] + if self.current != CharacterToken(')'): + while True: + args.append(self.ParseExpression()) + if self.current == CharacterToken(')'): + break + elif self.current != CharacterToken(','): + raise RuntimeError('Expected ")" or "," in argument list.') + self.Next() + + self.Next() # eat ')'. + return CallExpressionNode(identifier_name, args) + + # numberexpr ::= number + def ParseNumberExpr(self): + result = NumberExpressionNode(self.current.value) + self.Next() # consume the number. + return result + + # parenexpr ::= '(' expression ')' + def ParseParenExpr(self): + self.Next() # eat '('. + + contents = self.ParseExpression() + + if self.current != CharacterToken(')'): + raise RuntimeError('Expected ")".') + self.Next() # eat ')'. + + return contents + + # ifexpr ::= 'if' expression 'then' expression 'else' expression + def ParseIfExpr(self): + self.Next() # eat the if. + + # condition. + condition = self.ParseExpression() + + if not isinstance(self.current, ThenToken): + raise RuntimeError('Expected "then".') + self.Next() # eat the then. + + then_branch = self.ParseExpression() + + if not isinstance(self.current, ElseToken): + raise RuntimeError('Expected "else".') + self.Next() # eat the else. 
+ + else_branch = self.ParseExpression() + + return IfExpressionNode(condition, then_branch, else_branch) + + # forexpr ::= 'for' identifier '=' expr ',' expr (',' expr)? 'in' expression + def ParseForExpr(self): + self.Next() # eat the for. + + if not isinstance(self.current, IdentifierToken): + raise RuntimeError('Expected identifier after for.') + + loop_variable = self.current.name + self.Next() # eat the identifier. + + if self.current != CharacterToken('='): + raise RuntimeError('Expected "=" after for variable.') + self.Next() # eat the '='. + + start = self.ParseExpression() + + if self.current != CharacterToken(','): + raise RuntimeError('Expected "," after for start value.') + self.Next() # eat the ','. + + end = self.ParseExpression() + + # The step value is optional. + if self.current == CharacterToken(','): + self.Next() # eat the ','. + step = self.ParseExpression() + else: + step = None + + if not isinstance(self.current, InToken): + raise RuntimeError('Expected "in" after for variable specification.') + self.Next() # eat 'in'. + + body = self.ParseExpression() + + return ForExpressionNode(loop_variable, start, end, step, body) + + # primary ::= identifierexpr | numberexpr | parenexpr | ifexpr | forexpr + def ParsePrimary(self): + if isinstance(self.current, IdentifierToken): + return self.ParseIdentifierExpr() + elif isinstance(self.current, NumberToken): + return self.ParseNumberExpr() + elif isinstance(self.current, IfToken): + return self.ParseIfExpr() + elif isinstance(self.current, ForToken): + return self.ParseForExpr() + elif self.current == CharacterToken('('): + return self.ParseParenExpr() + else: + raise RuntimeError('Unknown token when expecting an expression.') + + # unary ::= primary | unary_operator unary + def ParseUnary(self): + # If the current token is not an operator, it must be a primary expression. 
+ if (not isinstance(self.current, CharacterToken) or + self.current in [CharacterToken('('), CharacterToken(',')]): + return self.ParsePrimary() + + # If this is a unary operator, read it. + operator = self.current.char + self.Next() # eat the operator. + return UnaryExpressionNode(operator, self.ParseUnary()) + + # binoprhs ::= (binary_operator unary)* + def ParseBinOpRHS(self, left, left_precedence): + # If this is a binary operator, find its precedence. + while True: + precedence = self.GetCurrentTokenPrecedence() + + # If this is a binary operator that binds at least as tightly as the + # current one, consume it; otherwise we are done. + if precedence < left_precedence: + return left + + binary_operator = self.current.char + self.Next() # eat the operator. + + # Parse the unary expression after the binary operator. + right = self.ParseUnary() + + # If binary_operator binds less tightly with right than the operator after + # right, let the pending operator take right as its left. + next_precedence = self.GetCurrentTokenPrecedence() + if precedence < next_precedence: + right = self.ParseBinOpRHS(right, precedence + 1) + + # Merge left/right. + left = BinaryOperatorExpressionNode(binary_operator, left, right) + + # expression ::= unary binoprhs + def ParseExpression(self): + left = self.ParseUnary() + return self.ParseBinOpRHS(left, 0) + + # prototype + # ::= id '(' id* ')' + # ::= binary LETTER number? (id, id) + # ::= unary LETTER (id) + def ParsePrototype(self): + precedence = None + if isinstance(self.current, IdentifierToken): + kind = 'normal' + function_name = self.current.name + self.Next() # eat function name. + elif isinstance(self.current, UnaryToken): + kind = 'unary' + self.Next() # eat 'unary'. + if not isinstance(self.current, CharacterToken): + raise RuntimeError('Expected an operator after "unary".') + function_name = 'unary' + self.current.char + self.Next() # eat the operator. 
+ elif isinstance(self.current, BinaryToken): + kind = 'binary' + self.Next() # eat 'binary'. + if not isinstance(self.current, CharacterToken): + raise RuntimeError('Expected an operator after "binary".') + function_name = 'binary' + self.current.char + self.Next() # eat the operator. + if isinstance(self.current, NumberToken): + if not 1 <= self.current.value <= 100: + raise RuntimeError('Invalid precedence: must be in range [1, 100].') + precedence = self.current.value + self.Next() # eat the precedence. + else: + raise RuntimeError('Expected function name, "unary" or "binary" in ' + 'prototype.') + + if self.current != CharacterToken('('): + raise RuntimeError('Expected "(" in prototype.') + self.Next() # eat '('. + + arg_names = [] + while isinstance(self.current, IdentifierToken): + arg_names.append(self.current.name) + self.Next() + + if self.current != CharacterToken(')'): + raise RuntimeError('Expected ")" in prototype.') + + # Success. + self.Next() # eat ')'. + + if kind == 'unary' and len(arg_names) != 1: + raise RuntimeError('Invalid number of arguments for a unary operator.') + elif kind == 'binary' and len(arg_names) != 2: + raise RuntimeError('Invalid number of arguments for a binary operator.') + + return PrototypeNode(function_name, arg_names, kind != 'normal', precedence) + + # definition ::= 'def' prototype expression + def ParseDefinition(self): + self.Next() # eat def. + proto = self.ParsePrototype() + body = self.ParseExpression() + return FunctionNode(proto, body) + + # toplevelexpr ::= expression + def ParseTopLevelExpr(self): + proto = PrototypeNode('', []) + return FunctionNode(proto, self.ParseExpression()) + + # external ::= 'extern' prototype + def ParseExtern(self): + self.Next() # eat extern. 
+ return self.ParsePrototype() + + # Top-Level parsing + def HandleDefinition(self): + self.Handle(self.ParseDefinition, 'Read a function definition:') + + def HandleExtern(self): + self.Handle(self.ParseExtern, 'Read an extern:') + + def HandleTopLevelExpression(self): + try: + function = self.ParseTopLevelExpr().CodeGen() + result = g_llvm_executor.run_function(function, []) + print 'Evaluated to:', result.as_real(Type.double()) + except Exception, e: + print 'Error:', e + try: + self.Next() # Skip for error recovery. + except: + pass + + def Handle(self, function, message): + try: + print message, function().CodeGen() + except Exception, e: + print 'Error:', e + try: + self.Next() # Skip for error recovery. + except: + pass + +################################################################################ +## Main driver code. +################################################################################ + +def main(): + # Set up the optimizer pipeline. Start with registering info about how the + # target lays out data structures. + g_llvm_pass_manager.add(g_llvm_executor.target_data) + # Do simple "peephole" optimizations and bit-twiddling optzns. + g_llvm_pass_manager.add(PASS_INSTRUCTION_COMBINING) + # Reassociate expressions. + g_llvm_pass_manager.add(PASS_REASSOCIATE) + # Eliminate Common SubExpressions. + g_llvm_pass_manager.add(PASS_GVN) + # Simplify the control flow graph (deleting unreachable blocks, etc). + g_llvm_pass_manager.add(PASS_CFG_SIMPLIFICATION) + + g_llvm_pass_manager.initialize() + + # Install standard binary operators. + # 1 is lowest possible precedence. 40 is the highest. + g_binop_precedence['<'] = 10 + g_binop_precedence['+'] = 20 + g_binop_precedence['-'] = 20 + g_binop_precedence['*'] = 40 + + # Run the main "interpreter loop". 
+ while True: + print 'ready>', + try: + raw = raw_input() + except KeyboardInterrupt: + break + + parser = Parser(Tokenize(raw)) + while True: + # top ::= definition | external | expression | EOF + if isinstance(parser.current, EOFToken): + break + if isinstance(parser.current, DefToken): + parser.HandleDefinition() + elif isinstance(parser.current, ExternToken): + parser.HandleExtern() + else: + parser.HandleTopLevelExpression() + + # Print out all of the generated code. + print '\n', g_llvm_module + +if __name__ == '__main__': + main() +{% endhighlight %} + + +* * * + +**[Next: Extending the language: mutable variables / SSA construction](PythonLangImpl7.html)** + + diff --git a/docs/source/doc/kaleidoscope/PythonLangImpl7.md b/docs/source/doc/kaleidoscope/PythonLangImpl7.md new file mode 100644 index 0000000..5603ee4 --- /dev/null +++ b/docs/source/doc/kaleidoscope/PythonLangImpl7.md @@ -0,0 +1,1794 @@ +--- +layout: page +title: "Kaleidoscope: Chapter 7" +--- + +# Extending the Language: Mutable Variables / SSA construction + +Written by [Chris Lattner](mailto:sabre@nondot.org) +and [Max Shawabkeh](http://max99x.com) + + +**Chapter 7** + + +* This will become a table of contents (this text will be scraped). +{:toc} + + +**[Chapter 8: Conclusion and other useful LLVM tidbits](PythonLangImpl8.html)** + + +# Introduction # {#intro} + +Welcome to Chapter 7 of the +[Implementing a language with LLVM](http://www.llvm.org/docs/tutorial/index.html) +tutorial. In chapters 1 through 6, we've built a very +respectable, albeit simple, +[functional programming language](http://en.wikipedia.org/wiki/Functional_programming). +In our journey, we learned some parsing techniques, +how to build and represent an AST, how to build LLVM IR, and how to optimize +the resultant code as well as JIT compile it. + +While Kaleidoscope is interesting as a functional language, the fact that it +is functional makes it "too easy" to generate LLVM IR for it. 
In particular, a +functional language makes it very easy to build LLVM IR directly in +[SSA form](http://en.wikipedia.org/wiki/Static_single_assignment_form). +Since LLVM requires that the input code be in SSA form, this is a very nice +property and it is often unclear to newcomers how to generate code for an +imperative language with mutable variables. + +The short (and happy) summary of this chapter is that there is no need for +your front-end to build SSA form: LLVM provides highly tuned and well tested +support for this, though the way it works is a bit unexpected for some. + + +# Why is this a hard problem? # {#why} + +To understand why mutable variables cause complexities in SSA construction, +consider this extremely simple C example: + + + +{% highlight c %} +int G, H; +int test(_Bool Condition) { + int X; + if (Condition) + X = G; + else + X = H; + return X; +} +{% endhighlight %} + + +In this case, we have the variable "X", whose value depends on the path +executed in the program. Because there are two different possible values for X +before the return instruction, a PHI node is inserted to merge the two values. +The LLVM IR that we want for this example looks like this: + + +{% highlight llvm %} +@G = weak global i32 0 ; type of @G is i32* +@H = weak global i32 0 ; type of @H is i32* +define i32 @test(i1 %Condition) { +entry: + br i1 %Condition, label %cond_true, label %cond_false +cond_true: + %X.0 = load i32* @G + br label %cond_next +cond_false: + %X.1 = load i32* @H + br label %cond_next +cond_next: + %X.2 = phi i32 [ %X.1, %cond_false ], [ %X.0, %cond_true ] + ret i32 %X.2 +} +{% endhighlight %} + + +In this example, the loads from the G and H global variables are explicit in +the LLVM IR, and they live in the then/else branches of the if statement +(cond_true/cond_false).
In order to merge the incoming values, the X.2 phi node +in the cond_next block selects the right value to use based on where control +flow is coming from: if control flow comes from the cond_false block, X.2 gets +the value of X.1. Alternatively, if control flow comes from cond_true, it gets +the value of X.0. The intent of this chapter is not to explain the details of +SSA form. For more information, see one of the many +[online references](http://en.wikipedia.org/wiki/Static_single_assignment_form). + +The question for this article is "who places the phi nodes when lowering +assignments to mutable variables?". The issue here is that LLVM +*requires* that its IR be in SSA form: there is no "non-ssa" mode for it. +However, SSA construction requires non-trivial algorithms and data structures, +so it is inconvenient and wasteful for every front-end to have to reproduce this +logic. + +# Memory in LLVM # {#memory} + +The 'trick' here is that while LLVM does require all register values to be +in SSA form, it does not require (or permit) memory objects to be in SSA form. +In the example above, note that the loads from G and H are direct accesses to +G and H: they are not renamed or versioned. This differs from some other +compiler systems, which do try to version memory objects. In LLVM, instead of +encoding dataflow analysis of memory into the LLVM IR, it is handled with +[Analysis Passes](http://www.llvm.org/docs/WritingAnLLVMPass.html) +which are computed on demand. + + +With this in mind, the high-level idea is that we want to make a stack variable +(which lives in memory, because it is on the stack) for each mutable object in +a function. To take advantage of this trick, we need to talk about how LLVM +represents stack variables. + + +In LLVM, all memory accesses are explicit with load/store instructions, and +it is carefully designed not to have (or need) an "address-of" operator. 
+Notice how the type of the @G/@H global variables is actually "i32*" even though +the variable is defined as "i32". What this means is that @G defines +*space* for an i32 in the global data area, but its *name* +actually refers to the address for that space. Stack variables work the same +way, except that instead of being declared with global variable definitions, +they are declared with the +[LLVM alloca instruction](http://www.llvm.org/docs/LangRef.html#i_alloca): + + +{% highlight llvm %} +define i32 @example() { +entry: + %X = alloca i32 ; type of %X is i32*. + ... + %tmp = load i32* %X ; load the stack value %X from the stack. + %tmp2 = add i32 %tmp, 1 ; increment it + store i32 %tmp2, i32* %X ; store it back + ... +{% endhighlight %} + + +This code shows an example of how you can declare and manipulate a stack +variable in the LLVM IR. Stack memory allocated with the alloca instruction is +fully general: you can pass the address of the stack slot to functions, you can +store it in other variables, etc. In our example above, we could rewrite the +example to use the alloca technique to avoid using a PHI node: + + +{% highlight llvm %} +@G = weak global i32 0 ; type of @G is i32* +@H = weak global i32 0 ; type of @H is i32* +define i32 @test(i1 %Condition) { +entry: + %X = alloca i32 ; type of %X is i32*. + br i1 %Condition, label %cond_true, label %cond_false +cond_true: + %X.0 = load i32* @G + store i32 %X.0, i32* %X ; Update X + br label %cond_next +cond_false: + %X.1 = load i32* @H + store i32 %X.1, i32* %X ; Update X + br label %cond_next +cond_next: + %X.2 = load i32* %X ; Read X + ret i32 %X.2 +} +{% endhighlight %} + + +With this, we have discovered a way to handle arbitrary mutable variables +without the need to create Phi nodes at all: + +
+1. Each mutable variable becomes a stack allocation.
+2. Each read of the variable becomes a load from the stack.
+3. Each update of the variable becomes a store to the stack.
+4. Taking the address of a variable just uses the stack address directly.
+ +While this solution has solved our immediate problem, it introduced another +one: we have now apparently introduced a lot of stack traffic for very simple +and common operations, a major performance problem. Fortunately for us, the +LLVM optimizer has a highly-tuned optimization pass named "mem2reg" that handles +this case, promoting allocas like this into SSA registers, inserting Phi nodes +as appropriate. If you run this example through the pass, for example, you'll +get: + +{% highlight bash %} +$ llvm-as < example.ll | opt -mem2reg | llvm-dis +{% endhighlight %} + +{% highlight llvm %} +@G = weak global i32 0 +@H = weak global i32 0 +define i32 @test(i1 %Condition) { +entry: + br i1 %Condition, label %cond_true, label %cond_false +cond_true: + %X.0 = load i32* @G + br label %cond_next +cond_false: + %X.1 = load i32* @H + br label %cond_next +cond_next: + %X.01 = phi i32 [ %X.1, %cond_false ], [ %X.0, %cond_true ] + ret i32 %X.01 +} +{% endhighlight %} + +The mem2reg pass implements the standard "iterated dominance frontier" +algorithm for constructing SSA form and has a number of optimizations that speed +up (very common) degenerate cases. The mem2reg optimization pass is the answer +to dealing with mutable variables, and we highly recommend that you depend on +it. Note that mem2reg only works on variables in certain circumstances: + +* mem2reg is alloca-driven: it looks for allocas and if it can handle them, it + promotes them. It does not apply to global variables or heap allocations. + +* mem2reg only looks for alloca instructions in the entry block of the + function. Being in the entry block guarantees that the alloca is only + executed once, which makes analysis simpler. + +* mem2reg only promotes allocas whose uses are direct loads and stores. If + the address of the stack object is passed to a function, or if any funny + pointer arithmetic is involved, the alloca will not be promoted. 
+ +* mem2reg only works on allocas of + [first class](http://www.llvm.org/docs/LangRef.html#t_classifications) + values (such as pointers, scalars and vectors), and only if the array size + of the allocation is 1 (or missing in the .ll file). mem2reg is not capable + of promoting structs or arrays to registers. Note that the "scalarrepl" pass + is more powerful and can promote structs, "unions", and arrays in many cases. + + +All of these properties are easy to satisfy for most imperative languages, and +we'll illustrate them below with Kaleidoscope. The final question you may be +asking is: should I bother with this nonsense for my front-end? Wouldn't it be +better if I just did SSA construction directly, avoiding use of the mem2reg +optimization pass? In short, we strongly recommend that you use this technique +for building SSA form, unless there is an extremely good reason not to. Using +this technique is: + +* Proven and well tested: llvm-gcc and clang both use this technique for local + mutable variables. As such, the most common clients of LLVM are using this to + handle the bulk of their variables. You can be sure that bugs are found fast + and fixed early. + +* Extremely Fast: mem2reg has a number of special cases that make it fast in + common cases as well as fully general. For example, it has fast-paths for + variables that are only used in a single block, variables that only have one + assignment point, good heuristics to avoid insertion of unneeded phi nodes, etc. + + +* Needed for debug info generation: + [Debug information in LLVM](http://www.llvm.org/docs/SourceLevelDebugging.html) + relies on having the address of the variable exposed so that debug + info can be attached to it. This technique dovetails very naturally with this + style of debug info. + + +If nothing else, this makes it much easier to get your front-end up and +running, and is very simple to implement. Let's extend Kaleidoscope with mutable +variables now!
+ + +* * * + +# Mutable Variables in Kaleidoscope # {#kalvars} + +Now that we know the sort of problem we want to tackle, let's see what this +looks like in the context of our little Kaleidoscope language. We're going to +add two features: + +* The ability to mutate variables with the '=' operator. +* The ability to define new variables. + +While the first item is really what this is about, we only have variables +for incoming arguments as well as for induction variables, and redefining those +only goes so far :). Also, the ability to define new variables is a +useful thing regardless of whether you will be mutating them. Here's a +motivating example that shows how we could use these: + + +{% highlight python %} +# Define ':' for sequencing: as a low-precedence operator that ignores operands +# and just returns the RHS. +def binary : 1 (x y) y; + +# Recursive fib, we could do this before. +def fib(x) + if (x < 3) then + 1 + else + fib(x-1) + fib(x-2) + +# Iterative fib. +def fibi(x) + var a = 1, b = 1, c in + (for i = 3, i < x in + c = a + b : + a = b : + b = c) : + b + +# Call it. +fibi(10) +{% endhighlight %} + + + +In order to mutate variables, we have to change our existing variables to use +the "alloca trick". Once we have that, we'll add our new operator, then extend +Kaleidoscope to support new variable definitions. + + +* * * + +# Adjusting Existing Variables for Mutation # {#adjustments} + +The symbol table in Kaleidoscope is managed at code generation time by the +`g_named_values` map. This map currently keeps track of the LLVM +"Value" that holds the double value for the named variable. In order to support +mutation, we need to change this slightly, so that it holds the *memory +location* of the variable in question. Note that this change is a +refactoring: it changes the structure of the code, but does not (by itself) +change the behavior of the compiler. All of these changes are isolated in the +Kaleidoscope code generator.
+ + +At this point in Kaleidoscope's development, it only supports variables for two +things: incoming arguments to functions and the induction variable of 'for' +loops. For consistency, we'll allow mutation of these variables in addition to +other user-defined variables. This means that these will both need memory +locations. + + +To start our transformation of Kaleidoscope, we will need to create the +allocas that we will store in `g_named_values`. We'll use a helper +function that ensures that the allocas are created in the entry block of the +function: + + +{% highlight python %} +# Creates an alloca instruction in the entry block of the function. This is used +# for mutable variables. +def CreateEntryBlockAlloca(function, var_name): + entry = function.get_entry_basic_block() + builder = Builder.new(entry) + builder.position_at_beginning(entry) + return builder.alloca(Type.double(), var_name) +{% endhighlight %} + + +This code creates a temporary `llvm.core.Builder` that is pointing at +the first instruction of the entry block. It then creates an alloca with the +expected name and returns it. Because all values in Kaleidoscope are doubles, +there is no need to pass in a type to use. + +With this in place, the first functionality change we want to make is to +variable references. In our new scheme, variables live on the stack, so code +generating a reference to them actually needs to produce a load from the stack +slot: + + +{% highlight python %} + def CodeGen(self): + if self.name in g_named_values: + return g_llvm_builder.load(g_named_values[self.name], self.name) + else: + raise RuntimeError('Unknown variable name: ' + self.name) +{% endhighlight %} + + +As you can see, this is pretty straightforward. Now we need to update the +things that define the variables to set up the alloca. 
We'll start with +`ForExpressionNode.CodeGen` (see the [full code listing](#code) for the +unabridged code): + + +{% highlight python %} + def CodeGen(self): + function = g_llvm_builder.basic_block.function + + # Create an alloca for the variable in the entry block. + alloca = CreateEntryBlockAlloca(function, self.loop_variable) + + # Emit the start code first, without 'variable' in scope. + start_value = self.start.CodeGen() + + # Store the value into the alloca. + g_llvm_builder.store(start_value, alloca) + ... + # Compute the end condition. + end_condition = self.end.CodeGen() + + # Reload, increment, and restore the alloca. This handles the case where + # the body of the loop mutates the variable. + cur_value = g_llvm_builder.load(alloca, self.loop_variable) + next_value = g_llvm_builder.fadd(cur_value, step_value, 'nextvar') + g_llvm_builder.store(next_value, alloca) + + # Convert condition to a bool by comparing equal to 0.0. + end_condition_bool = g_llvm_builder.fcmp( + FCMP_ONE, end_condition, Constant.real(Type.double(), 0), 'loopcond') + ... +{% endhighlight %} + + +This code is virtually identical to the code +[before we allowed mutable variables](PythonLangImpl5.html#forcodegen). +The big difference is that we no longer have to construct a PHI node, and we use +load/store to access the variable as needed. + +To support mutable argument variables, we need to also make allocas for them. +The code for this is also pretty simple: + + +{% highlight python %} +class PrototypeNode(object): + ... + # Create an alloca for each argument and register the argument in the symbol + # table so that references to it will succeed. 
+ def CreateArgumentAllocas(self, function): + for arg_name, arg in zip(self.args, function.args): + alloca = CreateEntryBlockAlloca(function, arg_name) + g_llvm_builder.store(arg, alloca) + g_named_values[arg_name] = alloca +{% endhighlight %} + + +For each argument, we make an alloca, store the input value to the function +into the alloca, and register the alloca as the memory location for the +argument. This method gets invoked by `FunctionNode.CodeGen` right after +it sets up the entry block for the function. + +The final missing piece is adding the mem2reg pass, which allows us to get +good codegen once again: + + +{% highlight python %} +from llvm.passes import (PASS_PROMOTE_MEMORY_TO_REGISTER, + PASS_INSTRUCTION_COMBINING, + PASS_REASSOCIATE, + PASS_GVN, + PASS_CFG_SIMPLIFICATION) +... +def main(): + # Set up the optimizer pipeline. Start with registering info about how the + # target lays out data structures. + g_llvm_pass_manager.add(g_llvm_executor.target_data) + # Promote allocas to registers. + g_llvm_pass_manager.add(PASS_PROMOTE_MEMORY_TO_REGISTER) + # Do simple "peephole" optimizations and bit-twiddling optzns. + g_llvm_pass_manager.add(PASS_INSTRUCTION_COMBINING) + # Reassociate expressions. + g_llvm_pass_manager.add(PASS_REASSOCIATE) +{% endhighlight %} + + +It is interesting to see what the code looks like before and after the +mem2reg optimization runs. For example, this is the before/after code for our +recursive fib function. 
Before the optimization: + + +{% highlight llvm %} +define double @fib(double %x) { +entry: + %x1 = alloca double + store double %x, double* %x1 + %x2 = load double* %x1 + %cmptmp = fcmp ult double %x2, 3.000000e+00 + %booltmp = uitofp i1 %cmptmp to double + %ifcond = fcmp one double %booltmp, 0.000000e+00 + br i1 %ifcond, label %then, label %else +then: ; preds = %entry + br label %ifcont +else: ; preds = %entry + %x3 = load double* %x1 + %subtmp = fsub double %x3, 1.000000e+00 + %calltmp = call double @fib(double %subtmp) + %x4 = load double* %x1 + %subtmp5 = fsub double %x4, 2.000000e+00 + %calltmp6 = call double @fib(double %subtmp5) + %addtmp = fadd double %calltmp, %calltmp6 + br label %ifcont +ifcont: ; preds = %else, %then + %iftmp = phi double [ 1.000000e+00, %then ], [ %addtmp, %else ] + ret double %iftmp +} +{% endhighlight %} + + +Here there is only one variable (x, the input argument) but you can still +see the extremely simple-minded code generation strategy we are using. In the +entry block, an alloca is created, and the initial input value is stored into +it. Each reference to the variable does a reload from the stack. Also, note +that we didn't modify the if/then/else expression, so it still inserts a PHI +node. While we could make an alloca for it, it is actually easier to create a +PHI node for it, so we still just make the PHI. 
+ +Here is the code after the mem2reg pass runs: + + +{% highlight llvm %} +define double @fib(double %x) { +entry: + %cmptmp = fcmp ult double %x, 3.000000e+00 + %booltmp = uitofp i1 %cmptmp to double + %ifcond = fcmp one double %booltmp, 0.000000e+00 + br i1 %ifcond, label %then, label %else +then: + br label %ifcont +else: + %subtmp = fsub double %x, 1.000000e+00 + %calltmp = call double @fib(double %subtmp) + %subtmp5 = fsub double %x, 2.000000e+00 + %calltmp6 = call double @fib(double %subtmp5) + %addtmp = fadd double %calltmp, %calltmp6 + br label %ifcont +ifcont: ; preds = %else, %then + %iftmp = phi double [ 1.000000e+00, %then ], [ %addtmp, %else ] + ret double %iftmp +} +{% endhighlight %} + + +This is a trivial case for mem2reg, since there are no redefinitions of the +variable. The point of showing this is to calm your tension about inserting +such blatant inefficiencies :). + +After the rest of the optimizers run, we get: + + +{% highlight llvm %} +define double @fib(double %x) { +entry: + %cmptmp = fcmp ult double %x, 3.000000e+00 + %booltmp = uitofp i1 %cmptmp to double + %ifcond = fcmp ueq double %booltmp, 0.000000e+00 + br i1 %ifcond, label %else, label %ifcont +else: + %subtmp = fsub double %x, 1.000000e+00 + %calltmp = call double @fib(double %subtmp) + %subtmp5 = fsub double %x, 2.000000e+00 + %calltmp6 = call double @fib(double %subtmp5) + %addtmp = fadd double %calltmp, %calltmp6 + ret double %addtmp +ifcont: + ret double 1.000000e+00 +} +{% endhighlight %} + + +Here we see that the simplifycfg pass decided to clone the return instruction +into the end of the 'else' block. This allowed it to eliminate some branches +and the PHI node. + +Now that all symbol table references are updated to use stack variables, +we'll add the assignment operator. + + +* * * + +# New Assignment Operator # {#assignment} + +With our current framework, adding a new assignment operator is really +simple. 
We will parse it just like any other binary operator, but handle it +internally (instead of allowing the user to define it). The first step is to +set a precedence: + + +{% highlight python %} +def main(): + ... + # Install standard binary operators. + # 1 is lowest possible precedence. 40 is the highest. + g_binop_precedence['='] = 2 + g_binop_precedence['<'] = 10 + g_binop_precedence['+'] = 20 + g_binop_precedence['-'] = 20 +{% endhighlight %} + + +Now that the parser knows the precedence of the binary operator, it takes +care of all the parsing and AST generation. We just need to implement codegen +for the assignment operator. This looks like: + + +{% highlight python %} +class BinaryOperatorExpressionNode(ExpressionNode): + ... + def CodeGen(self): + # A special case for '=' because we don't want to emit the LHS as an + # expression. + if self.operator == '=': + # Assignment requires the LHS to be an identifier. + if not isinstance(self.left, VariableExpressionNode): + raise RuntimeError('Destination of "=" must be a variable.') +{% endhighlight %} + + +Unlike the rest of the binary operators, our assignment operator doesn't +follow the "emit LHS, emit RHS, do computation" model. As such, it is handled +as a special case before the other binary operators are handled. The other +strange thing is that it requires the LHS to be a variable. It is invalid to +have `(x+1) = expr` -- only things like `x = expr` are allowed. + + +{% highlight python %} + # Codegen the RHS. + value = self.right.CodeGen() + + # Look up the name. + variable = g_named_values[self.left.name] + + # Store the value and return it. + g_llvm_builder.store(value, variable) + + return value + ... +{% endhighlight %} + + +Once we have the variable, CodeGening the assignment is straightforward: +we emit the RHS of the assignment, create a store, and return the computed +value. Returning a value allows for chained assignments like `X = (Y = Z)`. 
+ +Now that we have an assignment operator, we can mutate loop variables and +arguments. For example, we can now run code like this: + + +{% highlight python %} +# Function to print a double. +extern printd(x) + +# Define ':' for sequencing: as a low-precedence operator that ignores operands +# and just returns the RHS. +def binary : 1 (x y) y + +def test(x) + printd(x) : + x = 4 : + printd(x) + +test(123) +{% endhighlight %} + + +When run, this example prints "123" and then "4", showing that we did +actually mutate the value! Okay, we have now officially implemented our goal: +getting this to work requires SSA construction in the general case. However, +to be really useful, we want the ability to define our own local variables. +Let's add this next! + + +* * * + +# User-defined Local Variables # {#localvars} + +Adding var/in is just like any other extension we made to +Kaleidoscope: we extend the lexer, the parser, the AST and the code generator. +The first step for adding our new 'var/in' construct is to extend the lexer. +As before, this is pretty trivial; the code looks like this: + + +{% highlight python %} +... +class UnaryToken(object): pass +class VarToken(object): pass +... +def Tokenize(string): + ... + elif identifier == 'unary': + yield UnaryToken() + elif identifier == 'var': + yield VarToken() + else: + yield IdentifierToken(identifier) +{% endhighlight %} + + +The next step is to define the AST node that we will construct. For var/in, +it looks like this: + + +{% highlight python %} +# Expression class for var/in. +class VarExpressionNode(ExpressionNode): + + def __init__(self, variables, body): + self.variables = variables + self.body = body + + def CodeGen(self): + ... +{% endhighlight %} + + +var/in allows a list of names to be defined all at once, and each name can +optionally have an initializer value. As such, we capture this information in +the `variables` dictionary. 
Also, var/in has a body; this body is allowed to access +the variables defined by the var/in. + +With this in place, we can define the parser pieces. The first thing we do +is add it as a primary expression: + + +{% highlight python %} + # primary ::= + # identifierexpr | numberexpr | parenexpr | ifexpr | forexpr | varexpr + def ParsePrimary(self): + if isinstance(self.current, IdentifierToken): + return self.ParseIdentifierExpr() + elif isinstance(self.current, NumberToken): + return self.ParseNumberExpr() + elif isinstance(self.current, IfToken): + return self.ParseIfExpr() + elif isinstance(self.current, ForToken): + return self.ParseForExpr() + elif isinstance(self.current, VarToken): + return self.ParseVarExpr() + elif self.current == CharacterToken('('): + return self.ParseParenExpr() + else: + raise RuntimeError('Unknown token when expecting an expression.') +{% endhighlight %} + + +Next we define ParseVarExpr: + + +{% highlight python %} + # varexpr ::= 'var' (identifier ('=' expression)?)+ 'in' expression + def ParseVarExpr(self): + self.Next() # eat 'var'. + + variables = {} + + # At least one variable name is required. + if not isinstance(self.current, IdentifierToken): + raise RuntimeError('Expected identifier after "var".') +{% endhighlight %} + + +The first part of this code parses the list of identifier/expr pairs into the +local `variables` dictionary. + + +{% highlight python %} + while True: + var_name = self.current.name + self.Next() # eat the identifier. + + # Read the optional initializer. + if self.current == CharacterToken('='): + self.Next() # eat '='. + variables[var_name] = self.ParseExpression() + else: + variables[var_name] = None + + # End of var list, exit loop. + if self.current != CharacterToken(','): + break + self.Next() # eat ','. 
+ + if not isinstance(self.current, IdentifierToken): + raise RuntimeError('Expected identifier after "," in a var expression.') +{% endhighlight %} + + +Once all the variables are parsed, we then parse the body and create the +AST node: + + +{% highlight python %} + # At this point, we have to have 'in'. + if not isinstance(self.current, InToken): + raise RuntimeError('Expected "in" keyword after "var".') + self.Next() # eat 'in'. + + body = self.ParseExpression() + + return VarExpressionNode(variables, body) +{% endhighlight %} + + +Now that we can parse and represent the code, we need to support emission of +LLVM IR for it. This code starts out with: + + +{% highlight python %} +class VarExpressionNode(ExpressionNode): + ... + def CodeGen(self): + old_bindings = {} + function = g_llvm_builder.basic_block.function + + # Register all variables and emit their initializer. + for var_name, var_expression in self.variables.iteritems(): + # Emit the initializer before adding the variable to scope, this prevents + # the initializer from referencing the variable itself, and permits stuff + # like this: + # var a = 1 in + # var a = a in ... # refers to outer 'a'. + if var_expression is not None: + var_value = var_expression.CodeGen() + else: + var_value = Constant.real(Type.double(), 0) + + alloca = CreateEntryBlockAlloca(function, var_name) + g_llvm_builder.store(var_value, alloca) + + # Remember the old variable binding so that we can restore the binding + # when we unrecurse. + old_bindings[var_name] = g_named_values.get(var_name, None) + + # Remember this binding. + g_named_values[var_name] = alloca +{% endhighlight %} + + +Basically it loops over all the variables, installing them one at a time. +For each variable we put into the symbol table, we remember the previous value +that we replace in `old_bindings`. + +There are more comments here than code. The basic idea is that we emit the +initializer, create the alloca, then update the symbol table to point to it. 
+Once all the variables are installed in the symbol table, we evaluate the body +of the var/in expression: + + +{% highlight python %} + # Codegen the body, now that all vars are in scope. + body = self.body.CodeGen() +{% endhighlight %} + + +Finally, before returning, we restore the previous variable bindings: + + +{% highlight python %} + # Pop all our variables from scope. + for var_name in self.variables: + if old_bindings[var_name] is not None: + g_named_values[var_name] = old_bindings[var_name] + else: + del g_named_values[var_name] + + # Return the body computation. + return body +{% endhighlight %} + + +The end result of all of this is that we get properly scoped variable +definitions, and we even (trivially) allow mutation of them :). + +With this, we completed what we set out to do. Our nice iterative fib +example from the intro compiles and runs just fine. The mem2reg pass optimizes +all of our stack variables into SSA registers, inserting PHI nodes where needed, +and our front-end remains simple: no "iterated dominance frontier" computation +anywhere in sight. + +* * * + +# Full Code Listing # {#code} + +Here is the complete code listing for our running example, enhanced with mutable +variables and var/in support: + + +{% highlight python %} +#!/usr/bin/env python + +import re +from llvm.core import Module, Constant, Type, Function, Builder +from llvm.ee import ExecutionEngine, TargetData +from llvm.passes import FunctionPassManager + +from llvm.core import FCMP_ULT, FCMP_ONE +from llvm.passes import (PASS_PROMOTE_MEMORY_TO_REGISTER, + PASS_INSTRUCTION_COMBINING, + PASS_REASSOCIATE, + PASS_GVN, + PASS_CFG_SIMPLIFICATION) + +################################################################################ +## Globals +################################################################################ + +# The LLVM module, which holds all the IR code. +g_llvm_module = Module.new('my cool jit') + +# The LLVM instruction builder. 
Created whenever a new function is entered. +g_llvm_builder = None + +# A dictionary that keeps track of which values are defined in the current scope +# and what their LLVM representation is. +g_named_values = {} + +# The function optimization passes manager. +g_llvm_pass_manager = FunctionPassManager.new(g_llvm_module) + +# The LLVM execution engine. +g_llvm_executor = ExecutionEngine.new(g_llvm_module) + +# The binary operator precedence chart. +g_binop_precedence = {} + +# Creates an alloca instruction in the entry block of the function. This is used +# for mutable variables. +def CreateEntryBlockAlloca(function, var_name): + entry = function.get_entry_basic_block() + builder = Builder.new(entry) + builder.position_at_beginning(entry) + return builder.alloca(Type.double(), var_name) + +################################################################################ +## Lexer +################################################################################ + +# The lexer yields one of these types for each token. +class EOFToken(object): pass +class DefToken(object): pass +class ExternToken(object): pass +class IfToken(object): pass +class ThenToken(object): pass +class ElseToken(object): pass +class ForToken(object): pass +class InToken(object): pass +class BinaryToken(object): pass +class UnaryToken(object): pass +class VarToken(object): pass + +class IdentifierToken(object): + def __init__(self, name): self.name = name + +class NumberToken(object): + def __init__(self, value): self.value = value + +class CharacterToken(object): + def __init__(self, char): self.char = char + def __eq__(self, other): + return isinstance(other, CharacterToken) and self.char == other.char + def __ne__(self, other): return not self == other + +# Regular expressions that tokens and comments of our language. 
REGEX_NUMBER = re.compile(r'[0-9]+(?:\.[0-9]+)?')
REGEX_IDENTIFIER = re.compile(r'[a-zA-Z][a-zA-Z0-9]*')
REGEX_COMMENT = re.compile(r'#.*')

# Generator that splits source text into tokens.
#
# Whitespace and '#' comments (which run to the end of the line) are skipped.
# Numbers are yielded as NumberToken holding a float, keywords as an instance
# of their dedicated token class, any other identifier as IdentifierToken, and
# every other single character as CharacterToken. An EOFToken is always
# yielded last, even for empty input.
def Tokenize(string):
    # Keyword spelling -> token class. Built here so that the token classes
    # are only looked up when the tokenizer actually runs.
    keyword_tokens = {
        'def': DefToken,
        'extern': ExternToken,
        'if': IfToken,
        'then': ThenToken,
        'else': ElseToken,
        'for': ForToken,
        'in': InToken,
        'binary': BinaryToken,
        'unary': UnaryToken,
        'var': VarToken,
    }

    # Scan with an index instead of re-slicing the remaining input after every
    # token; slicing copies the tail each time and makes lexing quadratic.
    string = string or ''
    position = 0
    length = len(string)

    while position < length:
        # Skip whitespace.
        if string[position].isspace():
            position += 1
            continue

        # Comments are tried first and produce no token.
        match = REGEX_COMMENT.match(string, position)
        if match:
            position = match.end()
            continue

        match = REGEX_NUMBER.match(string, position)
        if match:
            yield NumberToken(float(match.group(0)))
            position = match.end()
            continue

        match = REGEX_IDENTIFIER.match(string, position)
        if match:
            identifier = match.group(0)
            # Check if we matched a keyword.
            keyword_class = keyword_tokens.get(identifier)
            if keyword_class is not None:
                yield keyword_class()
            else:
                yield IdentifierToken(identifier)
            position = match.end()
            continue

        # Anything else is passed through as a single-character token.
        yield CharacterToken(string[position])
        position += 1

    yield EOFToken()

################################################################################
## Abstract Syntax Tree (aka Parse Tree)
################################################################################

# Base class for all expression nodes.
class ExpressionNode(object):
    pass

# Expression class for numeric literals like "1.0".
class NumberExpressionNode(ExpressionNode):

    def __init__(self, value):
        self.value = value

    # Emits the literal as a constant of LLVM type double.
    def CodeGen(self):
        return Constant.real(Type.double(), self.value)

# Expression class for referencing a variable, like "a".
class VariableExpressionNode(ExpressionNode):

    def __init__(self, name):
        self.name = name

    # Loads the current value of the variable from its entry in
    # g_named_values. Raises RuntimeError if the name is not in scope.
    def CodeGen(self):
        if self.name in g_named_values:
            return g_llvm_builder.load(g_named_values[self.name], self.name)
        else:
            raise RuntimeError('Unknown variable name: ' + self.name)

# Expression class for a binary operator.
class BinaryOperatorExpressionNode(ExpressionNode):

    def __init__(self, operator, left, right):
        self.operator = operator
        self.left = left
        self.right = right

    # Emits code for the operator and returns the resulting value.
    #
    # '=' stores the right-hand side into the variable named on the left and
    # returns the stored value. '+', '-', '*' and '<' are built in; any other
    # operator is emitted as a call to the function named 'binary<op>'.
    #
    # Raises RuntimeError if '=' is applied to something that is not a
    # variable, or to a variable that is not in scope.
    def CodeGen(self):
        # A special case for '=' because we don't want to emit the LHS as an
        # expression.
        if self.operator == '=':
            # Assignment requires the LHS to be an identifier.
            if not isinstance(self.left, VariableExpressionNode):
                raise RuntimeError('Destination of "=" must be a variable.')

            # Codegen the RHS.
            value = self.right.CodeGen()

            # Look up the name. Report an unknown name the same way a read of
            # that variable would, instead of leaking a bare KeyError.
            if self.left.name not in g_named_values:
                raise RuntimeError('Unknown variable name: ' + self.left.name)
            variable = g_named_values[self.left.name]

            # Store the value and return it, so assignments can be chained.
            g_llvm_builder.store(value, variable)

            return value

        left = self.left.CodeGen()
        right = self.right.CodeGen()

        if self.operator == '+':
            return g_llvm_builder.fadd(left, right, 'addtmp')
        elif self.operator == '-':
            return g_llvm_builder.fsub(left, right, 'subtmp')
        elif self.operator == '*':
            return g_llvm_builder.fmul(left, right, 'multmp')
        elif self.operator == '<':
            result = g_llvm_builder.fcmp(FCMP_ULT, left, right, 'cmptmp')
            # Convert bool 0 or 1 to double 0.0 or 1.0.
            return g_llvm_builder.uitofp(result, Type.double(), 'booltmp')
        else:
            # Not a built-in operator: call the user-defined 'binary<op>'.
            function = g_llvm_module.get_function_named('binary' + self.operator)
            return g_llvm_builder.call(function, [left, right], 'binop')

# Expression class for function calls.
+class CallExpressionNode(ExpressionNode): + + def __init__(self, callee, args): + self.callee = callee + self.args = args + + def CodeGen(self): + # Look up the name in the global module table. + callee = g_llvm_module.get_function_named(self.callee) + + # Check for argument mismatch error. + if len(callee.args) != len(self.args): + raise RuntimeError('Incorrect number of arguments passed.') + + arg_values = [i.CodeGen() for i in self.args] + + return g_llvm_builder.call(callee, arg_values, 'calltmp') + +# Expression class for if/then/else. +class IfExpressionNode(ExpressionNode): + + def __init__(self, condition, then_branch, else_branch): + self.condition = condition + self.then_branch = then_branch + self.else_branch = else_branch + + def CodeGen(self): + condition = self.condition.CodeGen() + + # Convert condition to a bool by comparing equal to 0.0. + condition_bool = g_llvm_builder.fcmp( + FCMP_ONE, condition, Constant.real(Type.double(), 0), 'ifcond') + + function = g_llvm_builder.basic_block.function + + # Create blocks for the then and else cases. Insert the 'then' block at the + # end of the function. + then_block = function.append_basic_block('then') + else_block = function.append_basic_block('else') + merge_block = function.append_basic_block('ifcond') + + g_llvm_builder.cbranch(condition_bool, then_block, else_block) + + # Emit then value. + g_llvm_builder.position_at_end(then_block) + then_value = self.then_branch.CodeGen() + g_llvm_builder.branch(merge_block) + + # Codegen of 'Then' can change the current block; update then_block for the + # PHI node. + then_block = g_llvm_builder.basic_block + + # Emit else block. + g_llvm_builder.position_at_end(else_block) + else_value = self.else_branch.CodeGen() + g_llvm_builder.branch(merge_block) + + # Codegen of 'Else' can change the current block, update else_block for the + # PHI node. + else_block = g_llvm_builder.basic_block + + # Emit merge block. 
+ g_llvm_builder.position_at_end(merge_block) + phi = g_llvm_builder.phi(Type.double(), 'iftmp') + phi.add_incoming(then_value, then_block) + phi.add_incoming(else_value, else_block) + + return phi + +# Expression class for for/in. +class ForExpressionNode(ExpressionNode): + + def __init__(self, loop_variable, start, end, step, body): + self.loop_variable = loop_variable + self.start = start + self.end = end + self.step = step + self.body = body + + def CodeGen(self): + # Output this as: + # var = alloca double + # ... + # start = startexpr + # store start -> var + # goto loop + # loop: + # ... + # bodyexpr + # ... + # loopend: + # step = stepexpr + # endcond = endexpr + # + # curvar = load var + # nextvar = curvar + step + # store nextvar -> var + # br endcond, loop, endloop + # outloop: + + function = g_llvm_builder.basic_block.function + + # Create an alloca for the variable in the entry block. + alloca = CreateEntryBlockAlloca(function, self.loop_variable) + + # Emit the start code first, without 'variable' in scope. + start_value = self.start.CodeGen() + + # Store the value into the alloca. + g_llvm_builder.store(start_value, alloca) + + # Make the new basic block for the loop, inserting after current block. + loop_block = function.append_basic_block('loop') + + # Insert an explicit fall through from the current block to the loop_block. + g_llvm_builder.branch(loop_block) + + # Start insertion in loop_block. + g_llvm_builder.position_at_end(loop_block) + + # Within the loop, the variable is defined equal to the alloca. If it + # shadows an existing variable, we have to restore it, so save it now. + old_value = g_named_values.get(self.loop_variable, None) + g_named_values[self.loop_variable] = alloca + + # Emit the body of the loop. This, like any other expr, can change the + # current BB. Note that we ignore the value computed by the body. + self.body.CodeGen() + + # Emit the step value. 
+ if self.step: + step_value = self.step.CodeGen() + else: + # If not specified, use 1.0. + step_value = Constant.real(Type.double(), 1) + + # Compute the end condition. + end_condition = self.end.CodeGen() + + # Reload, increment, and restore the alloca. This handles the case where + # the body of the loop mutates the variable. + cur_value = g_llvm_builder.load(alloca, self.loop_variable) + next_value = g_llvm_builder.fadd(cur_value, step_value, 'nextvar') + g_llvm_builder.store(next_value, alloca) + + # Convert condition to a bool by comparing equal to 0.0. + end_condition_bool = g_llvm_builder.fcmp( + FCMP_ONE, end_condition, Constant.real(Type.double(), 0), 'loopcond') + + # Create the "after loop" block and insert it. + after_block = function.append_basic_block('afterloop') + + # Insert the conditional branch into the end of loop_block. + g_llvm_builder.cbranch(end_condition_bool, loop_block, after_block) + + # Any new code will be inserted in after_block. + g_llvm_builder.position_at_end(after_block) + + # Restore the unshadowed variable. + if old_value is not None: + g_named_values[self.loop_variable] = old_value + else: + del g_named_values[self.loop_variable] + + # for expr always returns 0.0. + return Constant.real(Type.double(), 0) + +# Expression class for a unary operator. +class UnaryExpressionNode(ExpressionNode): + + def __init__(self, operator, operand): + self.operator = operator + self.operand = operand + + def CodeGen(self): + operand = self.operand.CodeGen() + function = g_llvm_module.get_function_named('unary' + self.operator) + return g_llvm_builder.call(function, [operand], 'unop') + +# Expression class for var/in. +class VarExpressionNode(ExpressionNode): + + def __init__(self, variables, body): + self.variables = variables + self.body = body + + def CodeGen(self): + old_bindings = {} + function = g_llvm_builder.basic_block.function + + # Register all variables and emit their initializer. 
+ for var_name, var_expression in self.variables.iteritems(): + # Emit the initializer before adding the variable to scope, this prevents + # the initializer from referencing the variable itself, and permits stuff + # like this: + # var a = 1 in + # var a = a in ... # refers to outer 'a'. + if var_expression is not None: + var_value = var_expression.CodeGen() + else: + var_value = Constant.real(Type.double(), 0) + + alloca = CreateEntryBlockAlloca(function, var_name) + g_llvm_builder.store(var_value, alloca) + + # Remember the old variable binding so that we can restore the binding + # when we unrecurse. + old_bindings[var_name] = g_named_values.get(var_name, None) + + # Remember this binding. + g_named_values[var_name] = alloca + + # Codegen the body, now that all vars are in scope. + body = self.body.CodeGen() + + # Pop all our variables from scope. + for var_name in self.variables: + if old_bindings[var_name] is not None: + g_named_values[var_name] = old_bindings[var_name] + else: + del g_named_values[var_name] + + # Return the body computation. + return body + +# This class represents the "prototype" for a function, which captures its name, +# and its argument names (thus implicitly the number of arguments the function +# takes), as well as if it is an operator. +class PrototypeNode(object): + + def __init__(self, name, args, is_operator=False, precedence=0): + self.name = name + self.args = args + self.is_operator = is_operator + self.precedence = precedence + + def IsBinaryOp(self): + return self.is_operator and len(self.args) == 2 + + def GetOperatorName(self): + assert self.is_operator + return self.name[-1] + + def CodeGen(self): + # Make the function type, eg. double(double,double). + funct_type = Type.function( + Type.double(), [Type.double()] * len(self.args), False) + + function = Function.new(g_llvm_module, funct_type, self.name) + + # If the name conflicted, there was already something with the same name. 
+ # If it has a body, don't allow redefinition or reextern. + if function.name != self.name: + function.delete() + function = g_llvm_module.get_function_named(self.name) + + # If the function already has a body, reject this. + if not function.is_declaration: + raise RuntimeError('Redefinition of function.') + + # If the function took a different number of args, reject. + if len(function.args) != len(self.args): + raise RuntimeError('Redeclaration of a function with different number ' + 'of args.') + + # Set names for all arguments and add them to the variables symbol table. + for arg, arg_name in zip(function.args, self.args): + arg.name = arg_name + + return function + + # Create an alloca for each argument and register the argument in the symbol + # table so that references to it will succeed. + def CreateArgumentAllocas(self, function): + for arg_name, arg in zip(self.args, function.args): + alloca = CreateEntryBlockAlloca(function, arg_name) + g_llvm_builder.store(arg, alloca) + g_named_values[arg_name] = alloca + +# This class represents a function definition itself. +class FunctionNode(object): + + def __init__(self, prototype, body): + self.prototype = prototype + self.body = body + + def CodeGen(self): + # Clear scope. + g_named_values.clear() + + # Create a function object. + function = self.prototype.CodeGen() + + # If this is a binary operator, install its precedence. + if self.prototype.IsBinaryOp(): + operator = self.prototype.GetOperatorName() + g_binop_precedence[operator] = self.prototype.precedence + + # Create a new basic block to start insertion into. + block = function.append_basic_block('entry') + global g_llvm_builder + g_llvm_builder = Builder.new(block) + + # Add all arguments to the symbol table and create their allocas. + self.prototype.CreateArgumentAllocas(function) + + # Finish off the function. + try: + return_value = self.body.CodeGen() + g_llvm_builder.ret(return_value) + + # Validate the generated code, checking for consistency. 
+ function.verify() + + # Optimize the function. + g_llvm_pass_manager.run(function) + except: + function.delete() + if self.prototype.IsBinaryOp(): + del g_binop_precedence[self.prototype.GetOperatorName()] + raise + + return function + + +################################################################################ +## Parser +################################################################################ + +class Parser(object): + + def __init__(self, tokens): + self.tokens = tokens + self.Next() + + # Provide a simple token buffer. Parser.current is the current token the + # parser is looking at. Parser.Next() reads another token from the lexer and + # updates Parser.current with its results. + def Next(self): + self.current = self.tokens.next() + + # Gets the precedence of the current token, or -1 if the token is not a binary + # operator. + def GetCurrentTokenPrecedence(self): + if isinstance(self.current, CharacterToken): + return g_binop_precedence.get(self.current.char, -1) + else: + return -1 + + # identifierexpr ::= identifier | identifier '(' expression* ')' + def ParseIdentifierExpr(self): + identifier_name = self.current.name + self.Next() # eat identifier. + + if self.current != CharacterToken('('): # Simple variable reference. + return VariableExpressionNode(identifier_name) + + # Call. + self.Next() # eat '('. + args = [] + if self.current != CharacterToken(')'): + while True: + args.append(self.ParseExpression()) + if self.current == CharacterToken(')'): + break + elif self.current != CharacterToken(','): + raise RuntimeError('Expected ")" or "," in argument list.') + self.Next() + + self.Next() # eat ')'. + return CallExpressionNode(identifier_name, args) + + # numberexpr ::= number + def ParseNumberExpr(self): + result = NumberExpressionNode(self.current.value) + self.Next() # consume the number. + return result + + # parenexpr ::= '(' expression ')' + def ParseParenExpr(self): + self.Next() # eat '('. 
+ + contents = self.ParseExpression() + + if self.current != CharacterToken(')'): + raise RuntimeError('Expected ")".') + self.Next() # eat ')'. + + return contents + + # ifexpr ::= 'if' expression 'then' expression 'else' expression + def ParseIfExpr(self): + self.Next() # eat the if. + + # condition. + condition = self.ParseExpression() + + if not isinstance(self.current, ThenToken): + raise RuntimeError('Expected "then".') + self.Next() # eat the then. + + then_branch = self.ParseExpression() + + if not isinstance(self.current, ElseToken): + raise RuntimeError('Expected "else".') + self.Next() # eat the else. + + else_branch = self.ParseExpression() + + return IfExpressionNode(condition, then_branch, else_branch) + + # forexpr ::= 'for' identifier '=' expr ',' expr (',' expr)? 'in' expression + def ParseForExpr(self): + self.Next() # eat the for. + + if not isinstance(self.current, IdentifierToken): + raise RuntimeError('Expected identifier after for.') + + loop_variable = self.current.name + self.Next() # eat the identifier. + + if self.current != CharacterToken('='): + raise RuntimeError('Expected "=" after for variable.') + self.Next() # eat the '='. + + start = self.ParseExpression() + + if self.current != CharacterToken(','): + raise RuntimeError('Expected "," after for start value.') + self.Next() # eat the ','. + + end = self.ParseExpression() + + # The step value is optional. + if self.current == CharacterToken(','): + self.Next() # eat the ','. + step = self.ParseExpression() + else: + step = None + + if not isinstance(self.current, InToken): + raise RuntimeError('Expected "in" after for variable specification.') + self.Next() # eat 'in'. + + body = self.ParseExpression() + + return ForExpressionNode(loop_variable, start, end, step, body) + + # varexpr ::= 'var' (identifier ('=' expression)?)+ 'in' expression + def ParseVarExpr(self): + self.Next() # eat 'var'. + + variables = {} + + # At least one variable name is required. 
+ if not isinstance(self.current, IdentifierToken): + raise RuntimeError('Expected identifier after "var".') + + while True: + var_name = self.current.name + self.Next() # eat the identifier. + + # Read the optional initializer. + if self.current == CharacterToken('='): + self.Next() # eat '='. + variables[var_name] = self.ParseExpression() + else: + variables[var_name] = None + + # End of var list, exit loop. + if self.current != CharacterToken(','): + break + self.Next() # eat ','. + + if not isinstance(self.current, IdentifierToken): + raise RuntimeError('Expected identifier after "," in a var expression.') + + # At this point, we have to have 'in'. + if not isinstance(self.current, InToken): + raise RuntimeError('Expected "in" keyword after "var".') + self.Next() # eat 'in'. + + body = self.ParseExpression() + + return VarExpressionNode(variables, body) + + # primary ::= + # dentifierexpr | numberexpr | parenexpr | ifexpr | forexpr | varexpr + def ParsePrimary(self): + if isinstance(self.current, IdentifierToken): + return self.ParseIdentifierExpr() + elif isinstance(self.current, NumberToken): + return self.ParseNumberExpr() + elif isinstance(self.current, IfToken): + return self.ParseIfExpr() + elif isinstance(self.current, ForToken): + return self.ParseForExpr() + elif isinstance(self.current, VarToken): + return self.ParseVarExpr() + elif self.current == CharacterToken('('): + return self.ParseParenExpr() + else: + raise RuntimeError('Unknown token when expecting an expression.') + + # unary ::= primary | unary_operator unary + def ParseUnary(self): + # If the current token is not an operator, it must be a primary expression. + if (not isinstance(self.current, CharacterToken) or + self.current in [CharacterToken('('), CharacterToken(',')]): + return self.ParsePrimary() + + # If this is a unary operator, read it. + operator = self.current.char + self.Next() # eat the operator. 
+ return UnaryExpressionNode(operator, self.ParseUnary()) + + # binoprhs ::= (binary_operator unary)* + def ParseBinOpRHS(self, left, left_precedence): + # If this is a binary operator, find its precedence. + while True: + precedence = self.GetCurrentTokenPrecedence() + + # If this is a binary operator that binds at least as tightly as the + # current one, consume it; otherwise we are done. + if precedence < left_precedence: + return left + + binary_operator = self.current.char + self.Next() # eat the operator. + + # Parse the unary expression after the binary operator. + right = self.ParseUnary() + + # If binary_operator binds less tightly with right than the operator after + # right, let the pending operator take right as its left. + next_precedence = self.GetCurrentTokenPrecedence() + if precedence < next_precedence: + right = self.ParseBinOpRHS(right, precedence + 1) + + # Merge left/right. + left = BinaryOperatorExpressionNode(binary_operator, left, right) + + # expression ::= unary binoprhs + def ParseExpression(self): + left = self.ParseUnary() + return self.ParseBinOpRHS(left, 0) + + # prototype + # ::= id '(' id* ')' + # ::= binary LETTER number? (id, id) + # ::= unary LETTER (id) + def ParsePrototype(self): + precedence = None + if isinstance(self.current, IdentifierToken): + kind = 'normal' + function_name = self.current.name + self.Next() # eat function name. + elif isinstance(self.current, UnaryToken): + kind = 'unary' + self.Next() # eat 'unary'. + if not isinstance(self.current, CharacterToken): + raise RuntimeError('Expected an operator after "unary".') + function_name = 'unary' + self.current.char + self.Next() # eat the operator. + elif isinstance(self.current, BinaryToken): + kind = 'binary' + self.Next() # eat 'binary'. + if not isinstance(self.current, CharacterToken): + raise RuntimeError('Expected an operator after "binary".') + function_name = 'binary' + self.current.char + self.Next() # eat the operator. 
+ if isinstance(self.current, NumberToken): + if not 1 <= self.current.value <= 100: + raise RuntimeError('Invalid precedence: must be in range [1, 100].') + precedence = self.current.value + self.Next() # eat the precedence. + else: + raise RuntimeError('Expected function name, "unary" or "binary" in ' + 'prototype.') + + if self.current != CharacterToken('('): + raise RuntimeError('Expected "(" in prototype.') + self.Next() # eat '('. + + arg_names = [] + while isinstance(self.current, IdentifierToken): + arg_names.append(self.current.name) + self.Next() + + if self.current != CharacterToken(')'): + raise RuntimeError('Expected ")" in prototype.') + + # Success. + self.Next() # eat ')'. + + if kind == 'unary' and len(arg_names) != 1: + raise RuntimeError('Invalid number of arguments for a unary operator.') + elif kind == 'binary' and len(arg_names) != 2: + raise RuntimeError('Invalid number of arguments for a binary operator.') + + return PrototypeNode(function_name, arg_names, kind != 'normal', precedence) + + # definition ::= 'def' prototype expression + def ParseDefinition(self): + self.Next() # eat def. + proto = self.ParsePrototype() + body = self.ParseExpression() + return FunctionNode(proto, body) + + # toplevelexpr ::= expression + def ParseTopLevelExpr(self): + proto = PrototypeNode('', []) + return FunctionNode(proto, self.ParseExpression()) + + # external ::= 'extern' prototype + def ParseExtern(self): + self.Next() # eat extern. + return self.ParsePrototype() + + # Top-Level parsing + def HandleDefinition(self): + self.Handle(self.ParseDefinition, 'Read a function definition:') + + def HandleExtern(self): + self.Handle(self.ParseExtern, 'Read an extern:') + + def HandleTopLevelExpression(self): + try: + function = self.ParseTopLevelExpr().CodeGen() + result = g_llvm_executor.run_function(function, []) + print 'Evaluated to:', result.as_real(Type.double()) + except Exception, e: + raise#print 'Error:', e + try: + self.Next() # Skip for error recovery. 
+ except: + pass + + def Handle(self, function, message): + try: + print message, function().CodeGen() + except Exception, e: + raise#print 'Error:', e + try: + self.Next() # Skip for error recovery. + except: + pass + +################################################################################ +## Main driver code. +################################################################################ + +def main(): + # Set up the optimizer pipeline. Start with registering info about how the + # target lays out data structures. + g_llvm_pass_manager.add(g_llvm_executor.target_data) + # Promote allocas to registers. + g_llvm_pass_manager.add(PASS_PROMOTE_MEMORY_TO_REGISTER) + # Do simple "peephole" optimizations and bit-twiddling optzns. + g_llvm_pass_manager.add(PASS_INSTRUCTION_COMBINING) + # Reassociate expressions. + g_llvm_pass_manager.add(PASS_REASSOCIATE) + # Eliminate Common SubExpressions. + g_llvm_pass_manager.add(PASS_GVN) + # Simplify the control flow graph (deleting unreachable blocks, etc). + g_llvm_pass_manager.add(PASS_CFG_SIMPLIFICATION) + + g_llvm_pass_manager.initialize() + + # Install standard binary operators. + # 1 is lowest possible precedence. 40 is the highest. + g_binop_precedence['='] = 2 + g_binop_precedence['<'] = 10 + g_binop_precedence['+'] = 20 + g_binop_precedence['-'] = 20 + g_binop_precedence['*'] = 40 + + # Run the main "interpreter loop". + while True: + print 'ready<', + try: + raw = raw_input() + except KeyboardInterrupt: + break + + parser = Parser(Tokenize(raw)) + while True: + # top ::= definition | external | expression | EOF + if isinstance(parser.current, EOFToken): + break + if isinstance(parser.current, DefToken): + parser.HandleDefinition() + elif isinstance(parser.current, ExternToken): + parser.HandleExtern() + else: + parser.HandleTopLevelExpression() + + # Print out all of the generated code. 
+ print '\n', g_llvm_module + +if __name__ == '__main__': + main() +{% endhighlight %} + +* * * + +**[Next: Conclusion and other useful LLVM tidbits](PythonLangImpl8.html)** + + diff --git a/docs/source/doc/kaleidoscope/PythonLangImpl8.md b/docs/source/doc/kaleidoscope/PythonLangImpl8.md new file mode 100644 index 0000000..def3ded --- /dev/null +++ b/docs/source/doc/kaleidoscope/PythonLangImpl8.md @@ -0,0 +1,275 @@ +--- +layout: page +title: "Kaleidoscope: Chapter 8" +--- + +# Conclusion and other useful LLVM tidbits + +Written by [Chris Lattner](mailto:sabre@nondot.org) + +**Chapter 8** + + +* This will become a table of contents (this text will be scraped). +{:toc} + +# Tutorial Conclusion # {#conclusion} + +Welcome to the final chapter of the +[Implementing a language with LLVM](http://www.llvm.org/docs/tutorial/index.html) +tutorial. In the course of this tutorial, we have grown +our little Kaleidoscope language from being a useless toy, to being a +semi-interesting (but probably still useless) toy. :) + +It is interesting to see how far we've come, and how little code it has +taken. We built the entire lexer, parser, AST, code generator, and an +interactive run-loop (with a JIT!) by-hand in under 540 lines of +(non-comment/non-blank) code. + +Our little language supports a couple of interesting features: it supports +user defined binary and unary operators, it uses JIT compilation for immediate +evaluation, and it supports a few control flow constructs with SSA construction. + + +Part of the idea of this tutorial was to show you how easy and fun it can be +to define, build, and play with languages. Building a compiler need not be a +scary or mystical process! Now that you've seen some of the basics, I strongly +encourage you to take the code and hack on it. 
For example, try adding: + + + +* **global variables** -- While global variables have questionable value in + modern software engineering, they are often useful when putting together quick + little hacks like the Kaleidoscope compiler itself. Fortunately, our current + setup makes it very easy to add global variables: just have value lookup check + to see if an unresolved variable is in the global variable symbol table before + rejecting it. To create a new global variable, make an instance of the LLVM + `GlobalVariable` class. + +* **typed variables** -- Kaleidoscope currently only supports variables of + type double. This gives the language a very nice elegance, because only + supporting one type means that you never have to specify types. Different + languages have different ways of handling this. The easiest way is to require + the user to specify types for every variable definition, and record the type + of the variable in the symbol table along with its Value\*. + +* **arrays, structs, vectors, etc** -- Once you add types, you can start + extending the type system in all sorts of interesting ways. Simple arrays are + very easy and are quite useful for many different applications. Adding them is + mostly an exercise in learning how the LLVM + [getelementptr](http://www.llvm.org/docs/LangRef.html#i_getelementptr) + instruction works: it is so nifty/unconventional, it + [has its own FAQ](http://www.llvm.org/docs/GetElementPtr.html)! If you + add support for recursive types (e.g. linked lists), make sure to read the + [section in the LLVM Programmer's Manual](http://www.llvm.org/docs/ProgrammersManual.html#TypeResolve) + that describes how to construct them. + +* **standard runtime** -- Our current language allows the user to access + arbitrary external functions, and we use it for things like "putchard". 
As you + extend the language to add higher-level constructs, often these constructs make + the most sense if they are lowered to calls into a language-supplied runtime. + For example, if you add hash tables to the language, it would probably make + sense to add the routines to a runtime, instead of inlining them all the way. + +* **memory management** -- Currently we can only access the stack in + Kaleidoscope. It would also be useful to be able to allocate heap memory, + either with calls to the standard libc malloc/free interface or with a garbage + collector. If you would like to use garbage collection, note that LLVM fully + supports + [Accurate Garbage Collection](http://www.llvm.org/docs/GarbageCollection.html) + including algorithms that move objects and need to + scan/update the stack. + +* **debugger support** -- LLVM supports generation of + [DWARF Debug info](http://www.llvm.org/docs/SourceLevelDebugging.html) + which is understood by common debuggers like GDB. Adding support for debug + info is fairly straightforward. The best way to understand it is to compile + some C/C++ code with "`llvm-gcc -g -O0`" and take a look at + what it produces. + +* **exception handling support** -- LLVM supports generation of + [zero cost exceptions](http://www.llvm.org/docs/ExceptionHandling.html) + which interoperate with code compiled in other languages. You could also + generate code by implicitly making every function return an error value and + checking it. You could also make explicit use of setjmp/longjmp. There are + many different ways to go here. + +* **object orientation, generics, database access, complex numbers, + geometric programming, ...** -- Really, there is + no end of crazy features that you can add to the language. + +* **unusual domains** -- We've been talking about applying LLVM to a domain + that many people are interested in: building a compiler for a specific language. 
+ However, there are many other domains that can use compiler technology that are + not typically considered. For example, LLVM has been used to implement OpenGL + graphics acceleration, translate C++ code to ActionScript, and many other + cute and clever things. Maybe you will be the first to JIT compile a regular + expression interpreter into native code with LLVM? + + +Have fun - try doing something crazy and unusual. Building a language like +everyone else always has, is much less fun than trying something a little crazy +or off the wall and seeing how it turns out. If you get stuck or want to talk +about it, feel free to email the +[llvmdev mailing list](http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev): +it has lots of people who are interested in languages and are often +willing to help out. + + +Before we end this tutorial, I want to talk about some "tips and tricks" for +generating LLVM IR. These are some of the more subtle things that may not be +obvious, but are very useful if you want to take advantage of LLVM's +capabilities. + +* * * + +# Properties of the LLVM IR # {#llvmirproperties} + +We have a couple common questions about code in the LLVM IR form - let's just +get these out of the way right now, shall we? + +## Target Independence ## {#targetindep} + +Kaleidoscope is an example of a "portable language": any program written in +Kaleidoscope will work the same way on any target that it runs on. Many other +languages have this property, e.g. LISP, Java, Haskell, Javascript, Python, etc. +(note that while these languages are portable, not all their libraries are). + +One nice aspect of LLVM is that it is often capable of preserving target +independence in the IR: you can take the LLVM IR for a Kaleidoscope-compiled +program and run it on any target that LLVM supports, even emitting C code and +compiling that on targets that LLVM doesn't support natively. 
You can trivially +tell that the Kaleidoscope compiler generates target-independent code because it +never queries for any target-specific information when generating code. + +The fact that LLVM provides a compact, target-independent, representation for +code gets a lot of people excited. Unfortunately, these people are usually +thinking about C or a language from the C family when they are asking questions +about language portability. I say "unfortunately", because there is really no +way to make (fully general) C code portable, other than shipping the source code +around (and of course, C source code is not actually portable in general +either - ever port a really old application from 32- to 64-bits?). + +The problem with C (again, in its full generality) is that it is heavily +laden with target specific assumptions. As one simple example, the preprocessor +often destructively removes target-independence from the code when it processes +the input text: + + +{% highlight c %} +#ifdef __i386__ + int X = 1; +#else + int X = 42; +#endif +{% endhighlight %} + + +While it is possible to engineer more and more complex solutions to problems +like this, it cannot be solved in full generality in a way that is better than +shipping the actual source code. + +That said, there are interesting subsets of C that can be made portable. If +you are willing to fix primitive types to a fixed size (say int = 32-bits, +and long = 64-bits), don't care about ABI compatibility with existing binaries, +and are willing to give up some other minor features, you can have portable +code. This can make sense for specialized domains such as an +in-kernel language. + + +## Safety Guarantees ## {#safety} + +Many of the languages above are also "safe" languages: it is impossible for +a program written in Java to corrupt its address space and crash the process +(assuming the JVM has no bugs). 
+Safety is an interesting property that requires a combination of language +design, runtime support, and often operating system support. + +It is certainly possible to implement a safe language in LLVM, but LLVM IR +does not itself guarantee safety. The LLVM IR allows unsafe pointer casts, +use after free bugs, buffer over-runs, and a variety of other problems. Safety +needs to be implemented as a layer on top of LLVM and, conveniently, several +groups have investigated this. Ask on the +[llvmdev mailing list](http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev) +if you are interested in more details. + + +## Language-Specific Optimizations ## {#langspecific} + +One thing about LLVM that turns off many people is that it does not solve all +the world's problems in one system (sorry 'world hunger', someone else will have +to solve you some other day). One specific complaint is that people perceive +LLVM as being incapable of performing high-level language-specific optimization: +LLVM "loses too much information". + +Unfortunately, this is really not the place to give you a full and unified +version of "Chris Lattner's theory of compiler design". Instead, I'll make a +few observations: + +First, you're right that LLVM does lose information. For example, as of this +writing, there is no way to distinguish in the LLVM IR whether an SSA-value came +from a C "int" or a C "long" on an ILP32 machine (other than debug info). Both +get compiled down to an 'i32' value and the information about what it came from +is lost. The more general issue here, is that the LLVM type system uses +"structural equivalence" instead of "name equivalence". Another place this +surprises people is if you have two types in a high-level language that have the +same structure (e.g. two different structs that have a single int field): these +types will compile down into a single LLVM type and it will be impossible to +tell what it came from. 
+ +Second, while LLVM does lose information, LLVM is not a fixed target: we +continue to enhance and improve it in many different ways. In addition to +adding new features (LLVM did not always support exceptions or debug info), we +also extend the IR to capture important information for optimization (e.g. +whether an argument is sign or zero extended, information about pointers +aliasing, etc). Many of the enhancements are user-driven: people want LLVM to +include some specific feature, so they go ahead and extend it. + +Third, it is *possible and easy* to add language-specific +optimizations, and you have a number of choices in how to do it. As one trivial +example, it is easy to add language-specific optimization passes that +"know" things about code compiled for a language. In the case of the C family, +there is an optimization pass that "knows" about the standard C library +functions. If you call "exit(0)" in main(), it knows that it is safe to +optimize that into "return 0;" because C specifies what the 'exit' +function does. + +In addition to simple library knowledge, it is possible to embed a variety of +other language-specific information into the LLVM IR. If you have a specific +need and run into a wall, please bring the topic up on the llvmdev list. At the +very worst, you can always treat LLVM as if it were a "dumb code generator" and +implement the high-level optimizations you desire in your front-end, on the +language-specific AST. + +* * * + +# Tips and Tricks # {#tipsandtricks} + +There is a variety of useful tips and tricks that you come to know after +working on/with LLVM that aren't obvious at first glance. Instead of letting +everyone rediscover them, this section talks about some of these issues. 
+ +## Implementing portable offsetof/sizeof ## {#offsetofsizeof} + +One interesting thing that comes up, if you are trying to keep the code +generated by your compiler "target independent", is that you often need to know +the size of some LLVM type or the offset of some field in an llvm structure. +For example, you might need to pass the size of a type into a function that +allocates memory. + +Unfortunately, this can vary widely across targets: for example the width of +a pointer is trivially target-specific. However, there is a +[clever way to use the getelementptr instruction](http://nondot.org/sabre/LLVMNotes/SizeOf-OffsetOf-VariableSizedStructs.txt) +that allows you to compute this in a portable way. + +## Garbage Collected Stack Frames ## {#gcstack} + +Some languages want to explicitly manage their stack frames, often so that +they are garbage collected or to allow easy implementation of closures. There +are often better ways to implement these features than explicit stack frames, +but [LLVM does support them](http://nondot.org/sabre/LLVMNotes/ExplicitlyManagedStackFrames.txt), +if you want. It requires your front-end to convert the code into +[Continuation Passing Style](http://en.wikipedia.org/wiki/Continuation-passing_style) +and the use of tail calls (which LLVM also supports). + + diff --git a/docs/source/doc/llvm-py_package.md b/docs/source/doc/llvm-py_package.md new file mode 100644 index 0000000..d314b6a --- /dev/null +++ b/docs/source/doc/llvm-py_package.md @@ -0,0 +1,92 @@ +--- +layout: page +title: The llvm-py Package +--- + +The llvm-py is a Python package, consisting of 6 modules, that wrap +over enough LLVM APIs to allow the implementation of your own +compiler/VM backend in pure Python. If you've come this far, you +probably know why this is a good idea. + +Out of the 6 modules, one is an "extension" module (i.e., it is +written in C), and another one is a small private utility module, which +leaves 4 public modules. 
These are: + +- *llvm* -- top-level package, common classes (like exceptions) +- *llvm.core* -- IR-related APIs +- *llvm.ee* -- execution engine related APIs +- *llvm.passes* -- pass manager and passes related APIs + +The modules contain only classes and (integer) constants. Mostly simple +Python constructs are used (deliberately) -- +[property()](http://docs.python.org/lib/built-in-funcs.html) and +[property decorators](http://wiki.python.org/moin/PythonDecoratorLibrary) are probably the most exotic animals around. All classes are +"new style" classes. The APIs are designed to be navigable (and +guessable!) once you know a few conventions. These conventions are +highlighted in the sections below. + +Here is a quick overview of the contents of each package: + + +## llvm + +- LLVMException -- exception class (currently the only one) + +## llvm.core +- [Module](llvm.core.Module.html) -- represents an LLVM Module +- [Type](types.html) -- represents an LLVM Type +- [Value](values.html) -- represents an LLVM Value, including: + globals, constants, variables, arguments, functions, instructions, etc. +- [BasicBlock](llvm.core.BasicBlock.html) -- another class derived from Value, + represents an LLVM basic block +- [Builder](llvm.core.Builder.html) -- used for creating instructions, + wraps LLVM IRBuilder helper + class +- constants *TYPE_\** that represent various types +- constants *CC_\** that represent calling conventions +- constants *ICMP_\** and *FCMP_\** that represent integer and real + comparison predicates (like less than, greater than etc.) +- constants *LINKAGE_\** that represent linkage of symbols (external, + internal etc.) 
+- constants *VISIBILITY_\** that represent visibility of symbols + (default, hidden, protected) +- constants *ATTR_\** that represent function parameter attributes + +## llvm.ee +- [ExecutionEngine](llvm.ee.ExecutionEngine.html) + -- represents an execution engine (which can be + either an interpreter or a JIT) +- [TargetData](llvm.ee.TargetData.html) + -- represents the ABI of the target platform (details like + sizes and alignment of primitive types, endianness etc) + +## llvm.passes +- [PassManager](llvm.passes.PassManager.html) + -- represents an LLVM pass manager +- [FunctionPassManager](llvm.passes.FunctionPassManager.html) + -- represents an LLVM function pass manager +- constants *PASS_\** that represent various passes + +## A note on the importing of these modules +Pythonically, modules are imported with the statement `import +llvm.core`. However, you might find it more convenient to import +llvm-py modules thus: + +{% highlight python %} +from llvm import * +from llvm.core import * +from llvm.ee import * +from llvm.passes import * +{% endhighlight %} + +This avoids quite some typing. Both conventions work, however. + +> **Tip** +> +> +> Python-style documentation strings (`__doc__`) are present in +> llvm-py. You can use the `help()` of the interactive Python +> interpreter or the `object?` of [IPython](http://ipython.scipy.org/moin/) +> to get online help. (Note: not complete yet!) + + diff --git a/docs/source/doc/llvm.core.Argument.md b/docs/source/doc/llvm.core.Argument.md new file mode 100644 index 0000000..d9e8890 --- /dev/null +++ b/docs/source/doc/llvm.core.Argument.md @@ -0,0 +1,57 @@ +--- +layout: page +title: Argument (llvm.core) +--- + +The `args` property of `llvm.core.Function` objects yields +`llvm.core.Argument` objects. This allows for setting attributes for +function arguments. `Argument` objects cannot be constructed from user +code, the only way to get a reference to these is from `Function` +objects. 
+ +The methods `add_attribute` and `remove_attribute` can be used to add or +remove the following attributes: + +Value| Equivalent LLVM Assembly Keyword | +-----|----------------------------------| +`ATTR_ZEXT`| `zeroext` | +`ATTR_SEXT`| `signext` | +`ATTR_IN_REG`| `inreg` | +`ATTR_BY_VAL`| `byval` | +`ATTR_STRUCT_RET`| `sret` | +`ATTR_NO_ALIAS`| `noalias` | +`ATTR_NO_CAPTURE`| `nocapture` | +`ATTR_NEST`| `nest` | + +These methods work exactly like the +[corresponding methods](functions.html#fnattr) +of the `Function` class above. Refer to the +[LLVM docs](http://www.llvm.org/docs/LangRef.html#paramattrs) +for information on what each attribute means. + +The alignment of any argument can be set via the `alignment` +property, to any power of 2. + +# llvm.core.Argument + +## Base Class + +- [llvm.core.Value](llvm.core.Value.html) + +## Properties + +### `alignment` + +The alignment of the argument. Must be a power of 2. + +## Methods + +### `add_attribute(attr)` + +Add an attribute `attr` to the argument, from the set listed above. + +### `remove_attribute(attr)` + +Remove the attribute `attr` of the argument. + + diff --git a/docs/source/doc/llvm.core.ArrayType.md b/docs/source/doc/llvm.core.ArrayType.md new file mode 100644 index 0000000..4aec7bd --- /dev/null +++ b/docs/source/doc/llvm.core.ArrayType.md @@ -0,0 +1,26 @@ +--- +layout: page +title: ArrayType (llvm.core) +--- + + +# llvm.core.ArrayType + +## Base Class + +- [llvm.core.Type](llvm.core.Type.html) + +## Properties + + +### `element` +\[read-only\] + +A `Type` object representing the type of the element of the array. + + +### `count` +\[read-only\] + +The number of elements in the array. + diff --git a/docs/source/doc/llvm.core.BasicBlock.md b/docs/source/doc/llvm.core.BasicBlock.md new file mode 100644 index 0000000..873634c --- /dev/null +++ b/docs/source/doc/llvm.core.BasicBlock.md @@ -0,0 +1,40 @@ +--- +layout: page +title: BasicBlock (llvm.core) +--- + +A basicblock is a list of instructions. 
+A wellformed basicblock should end with a terminator. +`Function.verify()` will verify that. +A terminator is either a branch instruction or return instruction. +It is not possible to have instructions after a branch or return instruction. + +# llvm.core.BasicBlock + +## Base Class + +- [llvm.core.Value][] + +## Methods + +### `delete(self)` + +Delete this basicblock from the function (`self.function`). + +### `insert_before(self, name)` + +TODO + +## Properties + +### `function` + +The parent function of this basicblock. + +### `instructions` + +A list of instructions in this basicblock. + + +[llvm.core.Value]: llvm.core.Value.html +[llvm.core.Function]: llvm.core.Function.html diff --git a/docs/source/doc/llvm.core.Builder.md b/docs/source/doc/llvm.core.Builder.md new file mode 100644 index 0000000..75fffd4 --- /dev/null +++ b/docs/source/doc/llvm.core.Builder.md @@ -0,0 +1,326 @@ +--- +layout: page +title: Builder (llvm.core) +--- + +The `Builder` class corresponds to the +[IRBuilder](http://llvm.org/docs/doxygen/html/classllvm_1_1IRBuilder.html) +in C++ llvm. It provides a uniform API for +populating [BasicBlocks][llvm.core.BasicBlock]. +Most of the methods in `Builder` correspond to the instructions in the LLVM IR. +See [LLVM documentation](http://llvm.org/docs/LangRef.html) for detail. +These methods have the `name` argument for overriding the name +of the result variable. When it is an empty string (default value), +LLVM will set a numeric ID for the result variable. + +# llvm.core.Builder + +* This will become a table of contents (this text will be scraped). +{:toc} + +## Static Factory Method + +### `new(basic_block)` + +Create an instance of `Builder` at [BasicBlock][llvm.core.BasicBlock]. + +## Methods + +### `add(self, lhs, rhs, name='')` + +Insert an instruction that computes `lhs+rhs` for integer values only. + +### `alloca(self, ty, name='')` + +Insert an instruction that allocates stack memory for a value of type `ty`. 
+ +### `alloca_array(self, ty, size, name='')` + +Insert an instruction that allocates stack memory for an array of `size` elements +of type `ty`. + +### `and_(self, lhs, rhs, name='')` + +Insert an instruction that computes `lhs & rhs`. + +### `ashr(self, lhs, rhs, name='')` + +Insert an instruction that computes `lhs >> rhs` using arithmetic shift. + +### `bitcast(self, value, dest_ty, name='')` + +Insert an instruction that casts `value` to type `dest_ty`. + +### `branch(self, bblk)` + +Insert an instruction that branches to basicblock `bblk`. + +### `call(self, fn, args, name='')` + +Insert an instruction that calls function `fn` with an iterable of +arguments `args`. + +### `cbranch(self, if_value, then_blk, else_blk)` + +Insert an instruction that conditionally branches based on +the predicate `if_value`. +If `if_value` is `True`, branch to `then_blk`; +otherwise, branch to `else_blk`. + +### `extract_element(self, vec_val, idx_val, name='')` + +Insert an instruction that extracts an element from a value `vec_val` of +[llvm.core.VectorType][] at index `idx_val`. + +### `extract_value(self, retval, idx, name='')` + +Insert an instruction that extracts an element from an aggregate value `retval` +at index `idx`. + +### `fadd(self, lhs, rhs, name='')` + +Insert an instruction that computes `lhs + rhs` for floating-point values. + +### `fcmp(self, rpred, lhs, rhs, name='')` + +Insert an instruction that compares `lhs` and `rhs` using the comparison +operation defined by `rpred`. +See [here](comparision.html#fcmp) for a list of comparators. + +### `fdiv(self, lhs, rhs, name='')` + +Insert an instruction that computes `lhs / rhs` for floating-point values. + +### `fmul(self, lhs, rhs, name='')` + +Insert an instruction that computes `lhs * rhs` for floating-point values. + +### `fpext(self, value, dest_ty, name='')` + +Insert an instruction that extends `value` to a float type `dest_ty`. 
+ +### `fptosi(self, value, dest_ty, name='')` + +Insert an instruction that converts a floating-point value `value` +to a signed integer type `dest_ty`. + +### `fptoui(self, value, dest_ty, name='')` + +Insert an instruction that converts a floating-point value `value` +to an unsigned integer type `dest_ty`. + +### `fptrunc(self, value, dest_ty, name='')` + +Insert an instruction that truncates a floating-point value `value` +to a float type `dest_ty`. + +### `free(self, ptr)` + +Insert an instruction that performs heap deallocation on pointer `ptr`. + +### `frem(self, lhs, rhs, name='')` + +Insert an instruction that computes `lhs % rhs` for floating-point values. + +### `fsub(self, lhs, rhs, name='')` + +Insert an instruction that computes `lhs - rhs` for floating-point values. + +### `gep(self, ptr, indices, name='')` + +See [GEP](http://llvm.org/docs/LangRef.html#i_getelementptr). + +### `getresult(self, retval, idx, name='')` + +Same as `extract_value`. + +### `icmp(self, ipred, lhs, rhs, name='')` + +Insert an instruction that compares `lhs` and `rhs` using the comparison +operation defined by `ipred`. +See [here](comparision.html#icmp) for a list of comparators. + +### `insert_element(self, vec_val, elt_val, idx_val, name='')` + +Insert an instruction that inserts a value `elt_val` into `vec_val` of +[llvm.core.VectorType][] at index `idx_val`. + +### `inttoptr(self, value, dest_ty, name='')` + +Insert an instruction that converts an integer `value` to pointer `dest_ty`. + +### `invoke(self, func, args, then_blk, catch_blk, name='')` + +See [invoke](http://llvm.org/docs/LangRef.html#i_invoke). + +### `load(self, ptr, name='')` + +Insert an instruction that loads a value at the memory pointed to by `ptr`. + +### `lshr(self, lhs, rhs, name='')` + +Insert an instruction that computes `lhs >> rhs` using logical shift. + +### `malloc(self, ty, name='')` + +Insert an instruction that allocates heap memory of type `ty`. 
+The instruction returns a pointer that points to a value of type `ty`. + +### `malloc_array(self, ty, size, name='')` + +Similar to `malloc` but allocates an array of `size` elements. + +### `mul(self, lhs, rhs, name='')` + +Insert an instruction that computes `lhs * rhs` for integer types. + +### `neg(self, val, name='')` + +Insert an instruction that computes `0 - val`. + +### `not_(self, val, name='')` + +Insert an instruction that computes the one's complement of `val`. + +### `or_(self, lhs, rhs, name='')` + +Insert an instruction that computes `lhs | rhs`. + +### `phi(self, ty, name='')` + +Create a PHI node of type `ty`. + +### `position_at_beginning(self, bblk)` +Position the builder at the beginning of the given block. +The next instruction inserted will be the first one in the block. + +### `position_at_end(self, bblk)` + +Position the builder at the end of the given block. +The next instruction inserted will be the last one in the block. + +### `position_before(self, instr)` + +Position the builder before the given instruction. +The instruction can belong to a basic block other than the current one. + +### `ptrtoint(self, value, dest_ty, name='')` + +Insert an instruction that converts a pointer `value` to an integer of +type `dest_ty`. + +### `ret(self, value)` + +Insert an instruction that returns `value`. + +### `ret_many(self, values)` + +Insert an instruction that returns `values`, which is an iterable of +[llvm.core.Value][]. + +### `ret_void(self)` + +Insert an instruction that returns nothing (void). + +### `sdiv(self, lhs, rhs, name='')` + +Insert an instruction that computes `lhs / rhs` for signed integers. + +### `select(self, cond, then_value, else_value, name='')` + +Insert an instruction that computes `cond ? then_value : else_value`. + +### `sext(self, value, dest_ty, name='')` + +Insert an instruction that sign extends an integer `value` to type `dest_ty`. + +### `shl(self, lhs, rhs, name='')` + +Insert an instruction that computes `lhs << rhs`. 
+ +### `shuffle_vector(self, vecA, vecB, mask, name='')` + +Insert an instruction that performs a vector shuffle of the two vectors -- +`vecA` and `vecB` -- based on a bit mask `mask`. The mask must be a constant. + +See [LLVM document](http://llvm.org/docs/LangRef.html#i_shufflevector) +for details. + +### `sitofp(self, value, dest_ty, name='')` + +Insert an instruction that converts a signed integer `value` to a floating-point +type `dest_ty`. + +### `srem(self, lhs, rhs, name='')` + +Insert an instruction that computes `lhs % rhs` for signed integers. + +### `store(self, value, ptr)` + +Insert an instruction that stores `value` into the memory pointed to by `ptr`. + +### `sub(self, lhs, rhs, name='')` + +Insert an instruction that computes `lhs - rhs`. + +### `switch(self, value, else_blk, n=10)` + +Insert an instruction that transfers control flow depending on the `value`. +`else_blk` is the default case. `n` sets the number of additional cases. + +This method returns an instance of +[SwitchInstruction](llvm.core.Instruction.html#switchinstr) +for adding cases to the switch. + +### `trunc(self, value, dest_ty, name='')` + +Insert an instruction that truncates an integer `value` to the destination +integer type `dest_ty`. + +### `udiv(self, lhs, rhs, name='')` + +Insert an instruction that computes `lhs / rhs` for unsigned integers. + +### `uitofp(self, value, dest_ty, name='')` + +Insert an instruction that converts an unsigned integer `value` to a +floating-point type `dest_ty`. + +### `unreachable(self)` + +Insert an unreachable instruction, which has no defined semantics. +See [LLVM document](http://llvm.org/docs/LangRef.html#i_unreachable) +for details. + +### `urem(self, lhs, rhs, name='')` + +Insert an instruction that computes `lhs % rhs` for unsigned integers. + +### `vaarg(self, list_val, ty, name='')` + +This is used to access variable arguments given as `list_val` of type `ty`. 
+See [LLVM document](http://llvm.org/docs/LangRef.html#int_varargs) +about variable argument intrinsics. + +### `xor(self, lhs, rhs, name='')` + +Insert an instruction that computes `lhs xor rhs`. + +### `zext(self, value, dest_ty, name='')` + +Insert an instruction that zero extends `value` to type `dest_ty`. + +## Properties + +### `basic_block` + +The [BasicBlock][llvm.core.BasicBlock] where the builder is positioned. + +### `block` + +Deprecated. Same as `basic_block`. + +[llvm.core.BasicBlock]: llvm.core.BasicBlock.html +[llvm.core.Value]: llvm.core.Value.html +[llvm.core.VectorType]: llvm.core.VectorType.html + diff --git a/docs/source/doc/llvm.core.Constant.md b/docs/source/doc/llvm.core.Constant.md new file mode 100644 index 0000000..23b6671 --- /dev/null +++ b/docs/source/doc/llvm.core.Constant.md @@ -0,0 +1,296 @@ +--- +layout: page +title: Constant (llvm.core) +--- + +`Constant`-s represent constants that appear within the code. The +values of such objects are known at creation time. Constants can be +created from Python constants. A constant expression is also a constant +-- given a `Constant` object, an operation (like addition, subtraction +etc) can be specified, to yield a new `Constant` object. Let's see some +examples: + +{% highlight python %} +#!/usr/bin/env python + +ti = Type.int() # a 32-bit int type + +k1 = Constant.int(ti, 42) # "int k1 = 42;" +k2 = k1.add( Constant.int( ti, 10 ) ) # "int k2 = k1 + 10;" + +tr = Type.float() + +r1 = Constant.real(tr, "3.141592") # create from a string +r2 = Constant.real(tr, 1.61803399) # create from a Python float +{% endhighlight %} + +# llvm.core.Constant + + +* This will become a table of contents (this text will be scraped). 
+{:toc} + + +## Static factory methods + +### `null(ty)` + +A null value (all zeros) of type `ty` + +### `all_ones(ty)` + +All 1's value of type `ty` + +### `undef(ty)` + +An undefined value of type `ty` + +### `int(ty, value)` + +Integer of type `ty`, with value `value` (a Python int or long) + +### `int_signextend(ty, value)` + +Integer of signed type `ty` (use for signed types) + +### `real(ty, value)` + +Floating point value of type `ty`, with value `value` (a Python float) + +### `stringz(value)` + +A null-terminated string. `value` is a Python string + +### `string(value)` + +As `stringz(value)`, but not null terminated + +### `array(ty, consts)` + +Array of type `ty`, initialized with `consts` (an iterable yielding `Constant` +objects of the appropriate type) + +### `struct(ty, consts)` + +Struct (unpacked) of type `ty`, initialized with `consts` (an iterable yielding +`Constant` objects of the appropriate type) + +### `packed_struct(ty, consts)` + +As `struct(ty, consts)` but packed + +### `vector(consts)` + +Vector, initialized with `consts` (an iterable yielding `Constant` objects of +the appropriate type) + +### `sizeof(ty)` + +Constant value representing the size of the type `ty` + + +## Methods + +The following operations on constants are supported. For more details on +any operation, consult the +[Constant Expressions](http://www.llvm.org/docs/LangRef.html#constantexprs) +section of the LLVM Language Reference. + +### `k.neg()` + +Negation, same as `0 - k` + +### `k.not_()` + +1's complement of `k`. Note trailing underscore. + +### `k.add(k2)` + +`k + k2`, where `k` and `k2` are integers. + + +### `k.fadd(k2)` + +`k + k2`, where `k` and `k2` are floating-point. + +### `k.sub(k2)` + +`k - k2`, where `k` and `k2` are integers. + +### `k.fsub(k2)` + +`k - k2`, where `k` and `k2` are floating-point. + +### `k.mul(k2)` + +`k * k2`, where `k` and `k2` are integers. + +### `k.fmul(k2)` + +`k * k2`, where `k` and `k2` are floating-point. 
+ +### `k.udiv(k2)` + +Quotient of unsigned division of `k` with `k2` + +### `k.sdiv(k2)` + +Quotient of signed division of `k` with `k2` + +### `k.fdiv(k2)` + +Quotient of floating point division of `k` with `k2` + +### `k.urem(k2)` + +Remainder of unsigned division of `k` with `k2` + +### `k.srem(k2)` + +Remainder of signed division of `k` with `k2` + +### `k.frem(k2)` + +Remainder of floating point division of `k` with `k2` + +### `k.and_(k2)` + +Bitwise and of `k` and `k2`. Note trailing underscore. + +### `k.or_(k2)` + +Bitwise or of `k` and `k2`. Note trailing underscore. + +### `k.xor(k2)` + +Bitwise exclusive-or of `k` and `k2`. + +### `k.icmp(icmp, k2)` + +Compare `k` with `k2` using the predicate `icmp`. +See [here](comparision.html#icmp) for the +list of predicates for integer operands. + +### `k.fcmp(fcmp, k2)` + +Compare `k` with `k2` using the predicate `fcmp`. +See [here](comparision.html#fcmp) for the list +of predicates for real operands. + +### `k.shl(k2)` + +Shift `k` left by `k2` bits. + +### `k.lshr(k2)` + +Shift `k` logically right by `k2` bits (new bits are 0s). + +### `k.ashr(k2)` + +Shift `k` arithmetically right by `k2` bits (new bits are same as previous sign bit). + +### `k.gep(indices)` + +GEP, see [LLVM docs](http://www.llvm.org/docs/GetElementPtr.html). + +### `k.trunc(ty)` + +Truncate `k` to a type `ty` of lower bitwidth. + +### `k.sext(ty)` + +Sign extend `k` to a type `ty` of higher bitwidth, while extending the sign bit. + +### `k.zext(ty)` + +Zero extend `k` to a type `ty` of higher bitwidth, all new bits are 0s. + +### `k.fptrunc(ty)` + +Truncate floating point constant `k` to floating point type `ty` of lower size +than k's. + +### `k.fpext(ty)` + +Extend floating point constant `k` to floating point type `ty` of higher size +than k's. + +### `k.uitofp(ty)` + +Convert an unsigned integer constant `k` to floating point constant of +type `ty`. 
+ +### `k.sitofp(ty)` + +Convert a signed integer constant `k` to floating point constant of type `ty`. + +### `k.fptoui(ty)` + +Convert a floating point constant `k` to an unsigned integer constant of type `ty`. + +### `k.fptosi(ty)` + +Convert a floating point constant `k` to a signed integer constant of type `ty`. + +### `k.ptrtoint(ty)` + +Convert a pointer constant `k` to an integer constant of type `ty`. + +### `k.inttoptr(ty)` + +Convert an integer constant `k` to a pointer constant of type `ty`. + +### `k.bitcast(ty)` + +Convert `k` to a (equal-width) constant of type `ty`. + +### `k.select(cond,k2,k3)` + +Replace value with `k2` if the 1-bit integer constant `cond` is 1, +else with `k3`. + +### `k.extract_element(idx)` + +Extract value at `idx` (integer constant) from a vector constant `k`. + +### `k.insert_element(k2,idx)` + +Insert value `k2` (scalar constant) at index `idx` (integer constant) of vector +constant `k`. + +### `k.shuffle_vector(k2,mask)` + +Shuffle vector constant `k` based on vector constants `k2` and `mask`. 
+ +* * * + +# Other Constant Classes + +The following subclasses of `Constant` do not provide additional +methods, **they serve only to provide richer type information.** + + +Subclass | LLVM C++ Class | Remarks | +---------|----------------|---------| +`ConstantExpr` | `llvm::ConstantExpr` | A constant expression | +`ConstantAggregateZero`| `llvm::ConstantAggregateZero`| All-zero constant | +`ConstantInt`| `llvm::ConstantInt`| An integer constant | +`ConstantFP`| `llvm::ConstantFP`| A floating-point constant | +`ConstantArray`| `llvm::ConstantArray`| An array constant | +`ConstantStruct`| `llvm::ConstantStruct`| A structure constant | +`ConstantVector`| `llvm::ConstantVector`| A vector constant | +`ConstantPointerNull`| `llvm::ConstantPointerNull`| All-zero pointer constant | +`UndefValue`| `llvm::UndefValue`| corresponds to `undef` of LLVM IR | + + +These types are helpful in `isinstance` checks, like so: + +{% highlight python %} +ti = Type.int(32) +k1 = Constant.int(ti, 42) # int32_t k1 = 42; +k2 = Constant.array(ti, [k1, k1]) # int32_t k2[] = { k1, k1 }; + +assert isinstance(k1, ConstantInt) +assert isinstance(k2, ConstantArray) +{% endhighlight %} + diff --git a/docs/source/doc/llvm.core.Function.md b/docs/source/doc/llvm.core.Function.md new file mode 100644 index 0000000..60e8db9 --- /dev/null +++ b/docs/source/doc/llvm.core.Function.md @@ -0,0 +1,129 @@ +--- +layout: page +title: Function (llvm.core) +--- + +# llvm.core.Function + +* This will become a table of contents (this text will be scraped). +{:toc} + + +## Base Class + +- [llvm.core.GlobalValue](llvm.core.GlobalValue.html) + +## Static Constructors + +### `new(module_obj, func_ty, name)` + +Create a function named `name` of type `func_ty` in the module +`module_obj` and return a `Function` object that represents it. + +### `get(module_obj, name)` + +Return a `Function` object to represent the function +named `name` in the module `module_obj` or raise `LLVMException` if +such a function does not exist. 
+ +### `get_or_insert(module_obj, func_ty, name)` + +Similar to `get`, except that if the function does not exist it +is added first, as though with `new`. + +### `intrinsic(module_obj, intrinsic_id, types)` + +Create and return a `Function` object that refers to an intrinsic +function, as described [here](functions.html#intrinsic). + +## Properties + +### `calling_convention` + +The calling convention for the function, +as listed [here](functions.html#callconv). + +### `collector` + +A string holding the name of the garbage collection algorithm. +See [LLVM docs](http://www.llvm.org/docs/LangRef.html#gc). + +### `does_not_throw` + +Setting to True sets the `ATTR_NO_UNWIND` attribute, False +removes it. Shortcut to using `f.add_attribute(ATTR_NO_UNWIND)` +and `f.remove_attribute(ATTR_NO_UNWIND)`. + +### `args` + +\[read-only\] + +List of [llvm.core.Argument][] objects representing the formal +arguments of the function. + +### `basic_block_count` + +\[read-only\] + +Number of basic blocks belonging to this function. Same as +`len(f.basic_blocks)` but faster if you just want the count. + +### `entry_basic_block` + +\[read-only\] + +The [llvm.core.BasicBlock][] object representing the entry +basic block for this function, or `None` if there are no +basic blocks. + +### `basic_blocks` + +\[read-only\] + +List of [llvm.core.BasicBlock][] objects representing the +basic blocks belonging to this function. + +### `intrinsic_id` + +\[read-only\] + +Returns the ID of the intrinsic if this object represents an +intrinsic instruction. Otherwise 0. + +## Methods + +### `delete()` + +Deletes the function from its module. Do not hold any +references to this object after calling `delete` on it. + +### `append_basic_block(name)` + +Add a new basic block named `name`, and return a corresponding +[llvm.core.BasicBlock][] object. Note that if this is not the +entry basic block, you'll have to add appropriate branch +instructions from other basic blocks yourself. 
+ +### `add_attribute(attr)` + +Add an attribute `attr` to the function, from the set listed above. + +### `remove_attribute(attr)` + +Remove the attribute `attr` of the function. + +### `viewCFG()` + +Displays the control flow graph using the GraphViz tool. + +### `viewCFGOnly()` + +Displays the control flow graph using the GraphViz tool, but +omitting function bodies. + +### `verify()` + +Verifies the function. See [LLVM docs](http://llvm.org/docs/Passes.html#verify). + +[llvm.core.Argument]: llvm.core.Argument.html +[llvm.core.BasicBlock]: llvm.core.BasicBlock.html diff --git a/docs/source/doc/llvm.core.FunctionType.md b/docs/source/doc/llvm.core.FunctionType.md new file mode 100644 index 0000000..26cf40b --- /dev/null +++ b/docs/source/doc/llvm.core.FunctionType.md @@ -0,0 +1,53 @@ +--- +layout: page +title: FunctionType (llvm.core) +--- + +# llvm.core.FunctionType + +## Base Class + +- [llvm.core.Type](llvm.core.Type.html) + +## Properties + +### `return_type` + + +\[read-only\] + +A [Type][llvm.core.Type] object, representing the return type of the function. + +### `vararg` + + +\[read-only\] + +`True` if the function is variadic. + +### `args` + + +\[read-only\] + +Returns an iterable object that yields [Type][llvm.core.Type] objects that +represent, in order, the types of the arguments accepted by the +function. Used like this: + +{% highlight python %} +func_type = Type.function( Type.int(), [ Type.int(), Type.int() ] ) +for arg in func_type.args: + assert arg.kind == TYPE_INTEGER + assert arg == Type.int() +assert func_type.arg_count == len(func_type.args) +{% endhighlight %} + + +### `arg_count` + + +\[read-only\] + +The number of arguments. Same as `len(obj.args)`, but faster. 
+ +[llvm.core.Type]: llvm.core.Type.html diff --git a/docs/source/doc/llvm.core.GlobalValue.md b/docs/source/doc/llvm.core.GlobalValue.md new file mode 100644 index 0000000..5d46a2e --- /dev/null +++ b/docs/source/doc/llvm.core.GlobalValue.md @@ -0,0 +1,99 @@ +--- +layout: page +title: GlobalValue (llvm.core) +--- + +The class `llvm.core.GlobalValue` represents module-scope aliases, variables +and functions. Global variables are represented by the sub-class +[llvm.core.GlobalVariable][] and functions by [llvm.core.Function][]. + +Global values have the read-write properties `linkage`, `section`, +`visibility` and `alignment`. Use one of the following constants (from +llvm.core) as values for `linkage` +(see [LLVM documentation](http://www.llvm.org/docs/LangRef.html#linkage) for +details on each): + + +Value | Equivalent LLVM Assembly Keyword | +------|----------------------------------| +`LINKAGE_EXTERNAL` | `externally_visible` | +`LINKAGE_AVAILABLE_EXTERNALLY` | `available_externally` | +`LINKAGE_LINKONCE_ANY` | `linkonce` | +`LINKAGE_LINKONCE_ODR` | `linkonce_odr` | +`LINKAGE_WEAK_ANY` | `weak` | +`LINKAGE_WEAK_ODR` | `weak_odr` | +`LINKAGE_APPENDING` | `appending` | +`LINKAGE_INTERNAL` | `internal` | +`LINKAGE_PRIVATE` | `private` | +`LINKAGE_DLLIMPORT` | `dllimport` | +`LINKAGE_DLLEXPORT` | `dllexport` | +`LINKAGE_EXTERNAL_WEAK` | `extern_weak` | +`LINKAGE_GHOST` | deprecated -- do not use | +`LINKAGE_COMMON` | `common` | +`LINKAGE_LINKER_PRIVATE` | `linker_private` | + +
+ +The `section` property can be assigned strings (like ".rodata"), which +will be used if the target supports it. The `visibility` property can be set +to one of these constants (from llvm.core, see also +[LLVM docs](http://www.llvm.org/docs/LangRef.html#visibility)): + + +Value | Equivalent LLVM Assembly Keyword | +------|----------------------------------| +`VISIBILITY_DEFAULT` | `default` | +`VISIBILITY_HIDDEN` | `hidden` | +`VISIBILITY_PROTECTED` | `protected` | + +
+ + +The `alignment` property can be 0 (default), or can be set to a power of 2. +The read-only property `is_declaration` can be used to check if the +global is a declaration or not. The module to which the global belongs +can be retrieved using the `module` property (read-only). + +# llvm.core.GlobalValue + + +* This will become a table of contents (this text will be scraped). +{:toc} + + +## Base Class + +- [llvm.core.Constant](llvm.core.Constant.html) + +## Properties + +### `linkage` + +The linkage type, takes one of the constants listed above (LINKAGE_\*). + +### `section` + +A string like ".rodata", indicating the section into which the +global is placed. + +### `visibility` + +The visibility type, takes one of the constants listed above (VISIBILITY_\*). + +### `alignment` + +A power-of-2 integer indicating the boundary to align to. + +### `is_declaration` +\[read-only\] + +`True` if the global is a declaration, `False` otherwise. + +### `module` +\[read-only\] + + The module object to which this global belongs. + +[llvm.core.GlobalVariable]: llvm.core.GlobalVariable.html +[llvm.core.Function]: llvm.core.Function.html + diff --git a/docs/source/doc/llvm.core.GlobalVariable.md b/docs/source/doc/llvm.core.GlobalVariable.md new file mode 100644 index 0000000..05792a0 --- /dev/null +++ b/docs/source/doc/llvm.core.GlobalVariable.md @@ -0,0 +1,94 @@ +--- +layout: page +title: GlobalVariable (llvm.core) +--- + +Global variables (`llvm.core.GlobalVariable`) are subclasses of +[llvm.core.GlobalValue][] and represent module-level variables. These can +have optional initializers and can be marked as constants. Global +variables can be created either by using the `add_global_variable` +method of the [Module][llvm.core.Module] class, or by using the static method +`GlobalVariable.new`. 
+ +{% highlight python %} +# create a global variable using add_global_variable method +gv1 = module_obj.add_global_variable(Type.int(), "gv1") + +# or equivalently, using a static constructor method +gv2 = GlobalVariable.new(module_obj, Type.int(), "gv2") +{% endhighlight %} + +Existing global variables of a module can be accessed by name using +`module_obj.get_global_variable_named(name)` or `GlobalVariable.get`. +All existing global variables can be enumerated via iterating over the +property `module_obj.global_variables`. + +{% highlight python %} +# retrieve a reference to the global variable gv1, +# using the get_global_variable_named method +gv1 = module_obj.get_global_variable_named("gv1") + +# or equivalently, using the static `get` method: +gv2 = GlobalVariable.get(module_obj, "gv2") + +# list all global variables in a module +for gv in module_obj.global_variables: + print gv.name, "of type", gv.type +{% endhighlight %} + +The initializer for a global variable can be set by assigning to the +`initializer` property of the object. The `is_global_constant` property +can be used to indicate that the variable is a global constant. + +Global variables can be deleted using the `delete` method. Do not use the +object after calling `delete` on it. + +{% highlight python %} +# add an initializer 10 (32-bit integer) +gv.initializer = Constant.int( Type.int(), 10 ) + +# delete the global +gv.delete() +# DO NOT dereference `gv' beyond this point! +gv = None +{% endhighlight %} + +# llvm.core.GlobalVariable +## Base Class + +- [llvm.core.GlobalValue](llvm.core.GlobalValue.html) + +## Static Constructors + +### `new(module_obj, ty, name)` + +Create a global variable named `name` of type `ty` in the module +`module_obj` and return a `GlobalVariable` object that represents it. 
+ +### `get(module_obj, name)` + +Return a `GlobalVariable` object to represent the global variable +named `name` in the module `module_obj` or raise `LLVMException` if +such a variable does not exist. + +## Properties + +### `initializer` + +The initializer of the variable. Set to [llvm.core.Constant][] (or +derived). Gets the initializer constant, or `None` if none exists. +`global_constant` +`True` if the variable is a global constant, `False` otherwise. + +## Methods + +### `delete()` +Deletes the global variable from its module. +**Do not hold any references to this object after calling `delete` on it.** + + +[llvm.core.Module]: llvm.core.Module.html +[llvm.core.Constant]: llvm.core.Constant.html +[llvm.core.GlobalValue]: llvm.core.GlobalValue.html + + diff --git a/docs/source/doc/llvm.core.Instruction.md b/docs/source/doc/llvm.core.Instruction.md new file mode 100644 index 0000000..cef3041 --- /dev/null +++ b/docs/source/doc/llvm.core.Instruction.md @@ -0,0 +1,215 @@ +--- +layout: page +title: Instruction (llvm.core) +--- + +An `llvm.core.Instruction` object represents an LLVM instruction. This +class is the root of a small hierarchy: + + + Instruction + CallOrInvokeInstruction + PHINode + SwitchInstruction + CompareInstruction + + +Instructions are not created directly, but via a builder. The builder +both creates instructions and adds them to a basic block at the same +time. One way of getting instruction objects is from basic blocks. + +Being derived from [llvm.core.User][], the instruction +is-a user, i.e., an instruction in turn uses other values. The values +an instruction uses are its operands. These may be accessed using +`operands` property from the [llvm.core.User][] base. + +The name of the instruction (like `add`, `mul` etc) can be got +via the `opcode_name` property. The `basic_block` property gives +the basic block to which the instruction belongs. 
Note that +llvm-py does not allow free-standing instruction objects (i.e., +all instructions are created contained within a basic block). + +Classes of instructions can be got via the properties +`is_terminator`, `is_binary_op`, `is_shift` etc. See below for +the full list. + + + +* This will become a table of contents (this text will be scraped). +{:toc} + + +# llvm.core.Instruction + +## Base Class + +- [llvm.core.User](llvm.core.User.html) + +## Properties + +### `basic_block` +\[read-only\] +The basic block to which this instruction belongs to. + +### `is_terminator` +\[read-only\] +True if the instruction is a terminator instruction. + +### `is_binary_op` +\[read-only\] +True if the instruction is a binary operator. + +### `is_shift` +\[read-only\] +True if the instruction is a shift instruction. + +### `is_cast` +\[read-only\] +True if the instruction is a cast instruction. + +### `is_logical_shift` +\[read-only\] +True if the instruction is a logical shift instruction. + +### `is_arithmetic_shift` +\[read-only\] +True if the instruction is an arithmetic shift instruction. + +### `is_associative` +\[read-only\] +True if the instruction is associative. + +### `is_commutative` +\[read-only\] +True if the instruction is commutative. + +### `is_volatile` +\[read-only\] +True if the instruction is a volatile load or store. + +### `opcode` +\[read-only\] +The numeric opcode value of the instruction. Do not rely +on the absolute value of this number, it may change with +LLVM version. + +### `opcode_name` +\[read-only\] +The name of the instruction, like `add`, `sub` etc. + +* * * + +# llvm.core.CallOrInvokeInstruction + +The `llvm.core.CallOrInvokeInstruction` is a subclass of +`llvm.core.Instruction`, and represents either a `call` or an +`invoke` instruction. + +## Base Class + +- `llvm.core.Instruction` + +## Properties + +`calling_convention` + Get or set the calling convention. See [here](functions.html#callconv) + for possible values. 
+ +## Methods + +### `add_parameter_attribute(idx, attr)` + +Add an attribute `attr` to the `idx`-th argument. See +[here](llvm.core.Argument.html) for possible values of `attr`. + +### `remove_parameter_attribute(idx, attr)` + +Remove an attribute `attr` from the `idx`-th argument. See +[here](llvm.core.Argument.html) for possible values of `attr`. + +### `set_parameter_alignment(idx, align)` + +Set the alignment of the `idx`-th argument to `align`. +`align` should be a power of two. + +* * * + +# llvm.core.PHINode + +The `llvm.core.PHINode` is a subclass of +`llvm.core.Instruction`, and represents the `phi` instruction. When +created (using `Builder.phi`) the phi node contains no incoming +blocks (nor their corresponding values). To add an incoming arc to +the phi node, use the `add_incoming` method, which takes a source +block ([llvm.core.BasicBlock][] object) and a value (object of +[llvm.core.Value][] or of a class derived from it) that the phi node +will take on if control branches in from that block. + + +## Base Class + +- `llvm.core.Instruction` + +## Properties +`incoming_count` +\[read-only\] + The number of incoming arcs for this phi node. + +## Methods + +### `add_incoming(value, block)` + +Add an incoming arc, from the [llvm.core.BasicBlock][] object +`block`, with the corresponding value `value`. `value` should +be an object of [llvm.core.Value][] (or of a descendent class). + +### `get_incoming_value(idx)` + +Returns the `idx`-th incoming arc's value. + +### `get_incoming_block(idx)` + +Returns the `idx`-th incoming arc's block. + + +# llvm.core.SwitchInstruction # {#switchinstr} + +(TODO describe) + +## Base Class + +- `llvm.core.Instruction` + +## Methods + +### `add_case(const, block)` +Add another case to the switch statement. When the expression +being evaluated equals `const`, then control branches to +`block`. Here `const` must be of type +[llvm.core.ConstantInt][llvm.core.Constant]. 
+ +* * * + +# llvm.core.CompareInstruction + +(TODO describe) + +## Base Class + +- `llvm.core.Instruction` + +## Properties + +### `predicate` +\[read-only\] + +The predicate of the compare instruction, one of the `ICMP_*` or +`FCMP_*` constants. + + +[llvm.core.User]: llvm.core.User.html +[llvm.core.BasicBlock]: llvm.core.BasicBlock.html +[llvm.core.Value]: llvm.core.Value.html +[llvm.core.Constant]: llvm.core.Constant.html + + diff --git a/docs/source/doc/llvm.core.IntegerType.md b/docs/source/doc/llvm.core.IntegerType.md new file mode 100644 index 0000000..da62f5c --- /dev/null +++ b/docs/source/doc/llvm.core.IntegerType.md @@ -0,0 +1,20 @@ +--- +layout: page +title: IntegerType (llvm.core) +--- + +# llvm.core.IntegerType + +## Base Class + +- [llvm.core.Type](llvm.core.Type.html) + +## Properties + + +### `width` +\[read-only\] + +The width of the integer type, in number of bits. + + diff --git a/docs/source/doc/llvm.core.Module.md b/docs/source/doc/llvm.core.Module.md new file mode 100644 index 0000000..80a4784 --- /dev/null +++ b/docs/source/doc/llvm.core.Module.md @@ -0,0 +1,219 @@ +--- +layout: page +title: Module (llvm.core) +--- + +Modules are top-level container objects. You need to create a module +object first, before you can add global variables, aliases or functions. +Modules are created using the static method `Module.new`: + +{% highlight python %} +#!/usr/bin/env python + +from llvm import * +from llvm.core import * + +# create a module +my_module = Module.new('my_module') +{% endhighlight %} + +The constructor of the Module class should _not_ be used to instantiate +a Module object. This is a common feature for all llvm-py classes. + +> **Convention** +> +> *All* llvm-py objects are instantiated using static methods of +> corresponding classes. Constructors _should not_ be used. +> +> The argument `my_module` is a module identifier (a plain string). 
A +> module can also be constructed via deserialization from a bit code file, +> using the static method `from_bitcode`. This method takes a file-like +> object as argument, i.e., it should have a `read()` method that returns +> the entire data in a single call, as is the case with the builtin file +> object. Here is an example: + +{% highlight python %} +# create a module from a bit code file +bcfile = file("test.bc") +my_module = Module.from_bitcode(bcfile) +{% endhighlight %} + +There is corresponding serialization method also, called `to_bitcode`: + +{% highlight python %} +# write out a bit code file from the module +bcfile = file("test.bc", "w") +my_module.to_bitcode(bcfile) +{% endhighlight %} + +Modules can also be constructed from LLVM assembly files (`.ll` files). +The static method `from_assembly` can be used for this. Similar to the +`from_bitcode` method, this one also takes a file-like object as +argument: + +{% highlight python %} +# create a module from an assembly file +llfile = file("test.ll") +my_module = Module.from_assembly(llfile) +{% endhighlight %} + +Modules can be converted into their assembly representation by +stringifying them (see below). + +* * * + + +# llvm.core.Module + +* This will become a table of contents (this text will be scraped). +{:toc} + +## Static Constructors + +### `new(module_id)` + +Create a new `Module` instance with given `module_id`. The `module_id` +should be a string. + +### `from_bitcode(fileobj)` + +Create a new `Module` instance by deserializing the bitcode file +represented by the file-like object `fileobj`. + +### `from_assembly(fileobj)` + +Create a new `Module` instance by parsing the LLVM assembly file +represented by the file-like object `fileobj`. + +## Properties + +### `data_layout` + +A string representing the ABI of the platform. + +### `target` + +A string like `i386-pc-linux-gnu` or `i386-pc-solaris2.8`. + +### `pointer_size` +\[read-only\] + +The size in bits of pointers, of the target platform. 
A value of +zero represents `llvm::Module::AnyPointerSize`. + +### `global_variables` +\[read-only\] + +An iterable that yields [GlobalVariable][llvm.core.GlobalVariable] objects, +that represent the global variables of the module. + +### `functions` +\[read-only\] + +An iterable that yields [Function][llvm.core.Function] objects, +that represent functions in the module. + +### `id` + +A string that represents the module identifier (name). + +## Methods + +### `get_type_named(name)` + +Return a [StructType][llvm.core.StructType] object for the given name. + +The definition of this method was changed to work with LLVM 3.0+, in which +the type system was rewritten. +See [LLVM Blog](http://blog.llvm.org/2011/11/llvm-30-type-system-rewrite.html). + +{% comment %} +++++++++REMOVED+++++++++++ +### `add_type_name(name, ty)` + +Add an alias (typedef) for the type `ty` with the name `name`. + +### `delete_type_name(name)` + +Delete an alias with the name `name`. +++++++++END-REMOVED+++++++++++ +{% endcomment %} + +### `add_global_variable(ty, name)` + +Add a global variable of the type `ty` with the name `name`. +Returns a [GlobalVariable][llvm.core.GlobalVariable] object. + +### `get_global_variable_named(name)` + +Get a [GlobalVariable][llvm.core.GlobalVariable] object corresponding to +the global variable with the name `name`. +Raises `LLVMException` if such a variable does not exist. + +### `add_library(name)` + +Add a dependent library to the Module. This only adds a name to a list of +dependent libraries. **No linking is performed**. + +### `add_function(ty, name)` + +Add a function named `name` with the function type `ty`. `ty` must +be an object of type [FunctionType][llvm.core.FunctionType]. + +### `get_function_named(name)` + +Get a [Function][llvm.core.Function] object corresponding to the function with +the name `name`. Raises `LLVMException` if such a function does not exist. 
+ +### `get_or_insert_function(ty, name)` + +Like `get_function_named`, but adds the function first, if not +present (like `add_function`). + +### `verify()` + +Verify the correctness of the module. Raises `LLVMException` on +errors. + +### `to_bitcode(fileobj)` + +Write the bitcode representation of the module to the file-like +object `fileobj`. + +### `link_in(other)` + +Link in another module `other` into this module. Global variables, +functions etc. are matched and resolved. The `other` module is no +longer valid and should not be used after this operation. This API +might be replaced with a full-fledged Linker class in the future. + +## Special Methods + + +### `__str__` + +`Module` objects can be stringified into their LLVM assembly language +representation. + +### `__eq__` + +`Module` objects can be compared for equality. Internally, this +converts both arguments into their LLVM assembly representations and +compares the resultant strings. + +> **Convention** +> +> *All* llvm-py objects (where it makes sense), when stringified, return +> the LLVM assembly representation. `print module_obj` for example, +> prints the LLVM assembly form of the entire module. +> +> Such objects, when compared for equality, internally compare these +> string representations.
+ +[llvm.core.Function]: llvm.core.Function.html +[llvm.core.FunctionType]: llvm.core.FunctionType.html +[llvm.core.GlobalVariable]: llvm.core.GlobalVariable.html +[llvm.core.BasicBlock]: llvm.core.BasicBlock.html +[llvm.core.Type]: llvm.core.Type.html +[llvm.core.StructType]: llvm.core.StructType.html + diff --git a/docs/source/doc/llvm.core.PointerType.md b/docs/source/doc/llvm.core.PointerType.md new file mode 100644 index 0000000..7adfbe6 --- /dev/null +++ b/docs/source/doc/llvm.core.PointerType.md @@ -0,0 +1,26 @@ +--- +layout: page +title: PointerType (llvm.core) +--- + + +# llvm.core.PointerType + +## Base Class + +- [llvm.core.Type](llvm.core.Type.html) + +## Properties + + +### `address_space` +\[read-only\] + +The address space of the pointer. + + +### `pointee` +\[read-only\] + +A [Type](llvm.core.Type.html) object representing the type of the value pointed to. + diff --git a/docs/source/doc/llvm.core.StructType.md b/docs/source/doc/llvm.core.StructType.md new file mode 100644 index 0000000..5156814 --- /dev/null +++ b/docs/source/doc/llvm.core.StructType.md @@ -0,0 +1,70 @@ +--- +layout: page +title: StructType (llvm.core) +--- + +# llvm.core.StructType + +## Base Class + +- [llvm.core.Type](llvm.core.Type.html) + +## Methods + +### `set_body(self, elems, packed=False)` + +Define the body for opaque identified structure. + +`elems` is an iterable of [llvm.core.Type](llvm.core.Type.html) +If `packed` is `True`, creates a packed structure. + +## Properties + +### `is_identified` +\[read-only\] + +`True` if this is an identified structure. + +### `is_literal` +\[read-only\] + +`True` if this is a literal structure. + +### `is_opaque` +\[read-only\] + +`True` if this is an opaque structure. +Only identified structure can be opaque. + +### `packed` +\[read-only\] + +`True` if the structure is packed (no padding between elements). + +### `name` + +Use in identified structure. +If set to empty, the identified structure is removed from the global context. 
+ +### `elements` +\[read-only\] + +Returns an iterable object that yields [Type](llvm.core.Type.html) objects that +represent, in order, the types of the elements of the structure. +Used like this: + +{% highlight python %} +struct_type = Type.struct( [ Type.int(), Type.int() ] ) +for elem in struct_type.elements: + assert elem.kind == TYPE_INTEGER + assert elem == Type.int() +assert struct_type.element_count == len(struct_type.elements) +{% endhighlight %} + + +### `element_count` +\[read-only\] + +The number of elements. Same as `len(obj.elements)`, but faster. + + diff --git a/docs/source/doc/llvm.core.Type.md b/docs/source/doc/llvm.core.Type.md new file mode 100644 index 0000000..2616b7e --- /dev/null +++ b/docs/source/doc/llvm.core.Type.md @@ -0,0 +1,148 @@ +--- +layout: page +title: Type (llvm.core) +--- + +# llvm.core.Type + +* This will become a table of contents (this text will be scraped). +{:toc} + +## Static Constructors + +### `int(n)` + +Create an integer type of bit width `n`. + +### `float()` + +Create a 32-bit floating point type. + + +### `double()` + +Create a 64-bit floating point type. + + +### `x86_fp80()` + +Create an 80-bit x87-style floating point type. + + +### `fp128()` + +Create a 128-bit floating point type (112-bit mantissa). + + +### `ppc_fp128()` + +Create a 128-bit float (two 64-bits). + + +### `function(ret, params, vararg=False)` + +Create a function type, having the return type `ret` (must be a +`Type`), accepting the parameters `params`, where `params` is an +iterable, that yields `Type` objects representing the type of +each function argument in order. If `vararg` is `True`, function is +variadic. + + +### `struct(eltys, name='')` + +Create an unpacked structure. `eltys` is an iterable, that yields +`Type` objects representing the type of each element in order. + +If `name` evaluates to `True` (not empty), create +an *identified structure*; otherwise, create a *literal structure* +by default.
+ + +### `packed_struct(eltys, name='')` + +Like `struct(eltys)`, but creates a packed struct. + + +### `array(elty, count)` + +Creates an array type, holding `count` elements, each of type `elty` +(which should be a `Type`). + + +### `pointer(pty, addrspc=0)` + +Create a pointer to type `pty` (which should be a `Type`). `addrspc` +is an integer that represents the address space of the pointer (see +LLVM docs or ask on llvm-dev for more info). + + +### `void()` + +Creates a void type. Used for function return types. + + +### `label()` + +Creates a label type. + + +### `opaque(name)` + +Opaque [StructType](llvm.core.StructType.html), used for creating self-referencing types. + +## Properties + + +### `kind` +\[read-only\] + +A value (enum) representing the "type" of the object. It will be +one of the following constants defined in `llvm.core`: + +{% highlight python %} +# Warning: do not rely on actual numerical values! +TYPE_VOID = 0 +TYPE_FLOAT = 1 +TYPE_DOUBLE = 2 +TYPE_X86_FP80 = 3 +TYPE_FP128 = 4 +TYPE_PPC_FP128 = 5 +TYPE_LABEL = 6 +TYPE_INTEGER = 7 +TYPE_FUNCTION = 8 +TYPE_STRUCT = 9 +TYPE_ARRAY = 10 +TYPE_POINTER = 11 +TYPE_OPAQUE = 12 +TYPE_VECTOR = 13 +TYPE_METADATA = 14 +TYPE_UNION = 15 +{% endhighlight %} + +#### Example: +{% highlight python %} +assert Type.int().kind == TYPE_INTEGER +assert Type.void().kind == TYPE_VOID +{% endhighlight %} + +## Methods + +### `refine` + +Used for constructing self-referencing types. See the documentation +of [TypeHandle](llvm.core.TypeHandle.html) objects. + +## Special Methods + +### `__str__` + +`Type` objects can be stringified into it's LLVM assembly language +representation. + +### `__eq__` + +`Type` objects can be compared for equality. Internally, this +converts both arguments into their LLVM assembly representations and +compares the resultant strings. 
+ + diff --git a/docs/source/doc/llvm.core.User.md b/docs/source/doc/llvm.core.User.md new file mode 100644 index 0000000..8a2353f --- /dev/null +++ b/docs/source/doc/llvm.core.User.md @@ -0,0 +1,34 @@ +--- +layout: page +title: User (llvm.core) +--- + +`User`-s are values that refer to other values. The values so referred to +can be retrieved by the properties of `User`. This is the reverse of +the `Value.uses`. Together these can be used to traverse the use-def +chains of the SSA. + +* * * + + +# llvm.core.User # {#user} +## Base Class +- [llvm.core.Value][] + +## Properties + +### `operands` +\[read-only\] + +The list of operands (values, of type [llvm.core.Value][]) that this +value refers to. + +### `operand_count` +\[read-only\] + +The number of operands that this value refers to. Same as +`len(user.operands)` but faster if you just want the count. + + +[llvm.core.Value]: llvm.core.Value.html + diff --git a/docs/source/doc/llvm.core.Value.md b/docs/source/doc/llvm.core.Value.md new file mode 100644 index 0000000..7c0555e --- /dev/null +++ b/docs/source/doc/llvm.core.Value.md @@ -0,0 +1,50 @@ +--- +layout: page +title: Value (llvm.core) +--- + +# llvm.core.Value + +* This will become a table of contents (this text will be scraped). +{:toc} + +## Properties +### `name` + +The name of the value. + +### `type` +\[read-only\] + +An `llvm.core.Type` object representing the type of the value. + +### `uses` +\[read-only\] + +The list of values (`llvm.core.Value`) that use this value. + +### `use_count` +\[read-only\] + +The number of values that use (refer to) this value. Same as `len(val.uses)` +but faster if you just want the count. + +### `value_id` +\[read-only\] + +Returns `llvm::Value::getValueID()`. Refer to the LLVM documentation +for more info. + +## Special Methods + +### `__str__` + +`Value` objects can be stringified into their LLVM assembly language +representation. + +### `__eq__` + +`Value` objects can be compared for equality.
Internally, this +converts both arguments into their LLVM assembly representations and +compares the resultant strings. + diff --git a/docs/source/doc/llvm.core.VectorType.md b/docs/source/doc/llvm.core.VectorType.md new file mode 100644 index 0000000..b227077 --- /dev/null +++ b/docs/source/doc/llvm.core.VectorType.md @@ -0,0 +1,26 @@ +--- +layout: page +title: VectorType (llvm.core) +--- + +# llvm.core.VectorType + +## Base Class + +- [llvm.core.Type](llvm.core.Type.html) + +## Properties + + +### `element` +\[read-only\] + +A [Type](llvm.core.Type.html) object representing the type of the element of +the vector. + + +### `count` +\[read-only\] + +The number of elements in the vector. + diff --git a/docs/source/doc/llvm.ee.EngineBuilder.md b/docs/source/doc/llvm.ee.EngineBuilder.md new file mode 100644 index 0000000..a7f6c36 --- /dev/null +++ b/docs/source/doc/llvm.ee.EngineBuilder.md @@ -0,0 +1,49 @@ +--- +layout: page +title: EngineBuilder (llvm.ee) +--- + +# llvm.ee.EngineBuilder + +A convenient class for building [llvm.ee.ExecutionEngine][]. +Each `EngineBuilder` instance can only create one `ExecutionEngine`. + + +## Methods + +### `create(self)` + +Create and return a new [ExecutionEngine][llvm.ee.ExecutionEngine] instance. + +Raise `llvm.LLVMException` if the builder cannot create an `ExecutionEngine` +based on the given configuration. + +### `force_interpreter(self)` + +Force the output `ExecutionEngine` to be an LLVM IR interpreter. + +### `force_jit(self)` + +Force the output `ExecutionEngine` to be a JIT engine. + +### `opt(self, level)` + +Set the code generation optimization level for a JIT engine. +Valid values of `level` are 0-3, inclusive. +The default setting is 2. +To use vector instructions, such as SSE on Intel processors, +`level` must be 3 (aggressive). + +## Static Factory Methods + +### `new(module)` + +Create a new EngineBuilder. `module` must be a [llvm.core.Module][] instance.
+Its ownership is transferred to the resulting +[ExecutionEngine][llvm.ee.ExecutionEngine]. +Therefore, it is impossible to create more than one `ExecutionEngine` with +a single `EngineBuilder`. + + +[llvm.core.Module]: llvm.core.Module.html +[llvm.ee.ExecutionEngine]: llvm.ee.ExecutionEngine.html diff --git a/docs/source/doc/llvm.ee.ExecutionEngine.md b/docs/source/doc/llvm.ee.ExecutionEngine.md new file mode 100644 index 0000000..39ef661 --- /dev/null +++ b/docs/source/doc/llvm.ee.ExecutionEngine.md @@ -0,0 +1,52 @@ +--- +layout: page +title: ExecutionEngine (llvm.ee) +--- + +# llvm.ee.ExecutionEngine + +## Methods + +### `add_module(self, module)` + +Add a new module to the ExecutionEngine. +The ownership of `module` is transferred. +When the `ExecutionEngine` is destroyed, the module is destroyed. + +### `free_machine_code_for(self, fn)` + +Release memory used for the machine code generated for +the function `fn`. + +### `get_pointer_to_function(self, fn)` + +Obtain the pointer to the function `fn`. +This forces the ExecutionEngine to generate the machine code +in lazy mode. + +If `fn` is not defined, `ExecutionEngine` will look up the +symbol through `dlsym`. + +The returned function pointer can be wrapped as a `ctypes` function. + +### `remove_module(self, module)` + +Remove the `module`. + +### `run_function(self, fn, args)` + +Execute the function `fn` with an iterable of arguments `args` +which are `GenericValue` instances. This method returns whatever +is returned by `fn` as a `GenericValue`. + +### `run_static_ctors(self)` + +### `run_static_dtors(self)` + +## Properties + +### `target_data` + +Access the [TargetData](llvm.ee.TargetData.html) +instance associated with the `ExecutionEngine`.
+ diff --git a/docs/source/doc/llvm.ee.GenericValue.md b/docs/source/doc/llvm.ee.GenericValue.md new file mode 100644 index 0000000..eca6975 --- /dev/null +++ b/docs/source/doc/llvm.ee.GenericValue.md @@ -0,0 +1,55 @@ +--- +layout: page +title: GenericValue (llvm.ee) +--- + +# llvm.ee.GenericValue + +* This will become a table of contents (this text will be scraped). +{:toc} + +## Methods + +### `as_int(self)` + +Return the value of this `GenericValue` instance as an unsigned integer. + +### `as_int_signed(self)` + +Return the value of this `GenericValue` instance as a signed integer. + +### `as_pointer(self)` + +Return the value of this `GenericValue` instance as a pointer. +The type of the return value is `int`. + +### `as_real(self, ty)` + +Return the value of this `GenericValue` instance as a real number +whose type is specified by `ty`. `ty` must be a [Type][] instance +of a real number type. + +## Static Factory Methods + +### `int(ty, intval)` + +Create a `GenericValue` instance with an `int` value, +which is zero-extended if necessary. +The type of the value is specified by `ty`, which is a [Type][] instance. + +### `int_signed(ty, intval)` + +Create a `GenericValue` instance with an `int` value, +which is sign-extended if necessary. +The type of the value is specified by `ty`, which is a [Type][] instance. + +### `pointer(ty, addr)` or `pointer(addr)` + +Create a `GenericValue` instance with an `int` value, +which represents a pointer value. + +The two-argument version is **deprecated**. +The old code never used `ty` anyway. + + +[Type]: llvm.core.Type.html diff --git a/docs/source/doc/llvm.ee.TargetData.md b/docs/source/doc/llvm.ee.TargetData.md new file mode 100644 index 0000000..537d680 --- /dev/null +++ b/docs/source/doc/llvm.ee.TargetData.md @@ -0,0 +1,48 @@ +--- +layout: page +title: TargetData (llvm.ee) +--- + +# llvm.ee.TargetData + +* This will become a table of contents (this text will be scraped).
+{:toc} + +## Methods + +### `abi_alignment(self, ty)` + +Returns the minimum ABI-required alignment for the specified type `ty`. + +### `abi_size(self, ty)` + +### `callframe_alignment(self, ty)` + +Returns the minimum ABI-required alignment for the specified type `ty` when it is part of a call frame. + +### `element_at_offset(self, ty, ofs)` + +### `offset_of_element(self, ty, el)` + +### `preferred_alignment(self, ty_or_gv)` + +### `size(self, ty)` + +### `store_size(self, ty)` + +### `__str__(self)` +Returns the string representation. + +## Static Factory Methods + +### `new(strrep)` +Construct a new `TargetData` instance from the string representation `strrep`. + +## Properties + +### `byte_order` + +### `pointer_size` + +### `target_integer_type` + diff --git a/docs/source/doc/llvm.passes.FunctionPassManager.md b/docs/source/doc/llvm.passes.FunctionPassManager.md new file mode 100644 index 0000000..e1bc068 --- /dev/null +++ b/docs/source/doc/llvm.passes.FunctionPassManager.md @@ -0,0 +1,35 @@ +--- +layout: page +title: FunctionPassManager (llvm.passes) +--- + +# llvm.passes.FunctionPassManager + +## Base Classes + +- [llvm.passes.PassManager](llvm.passes.PassManager.html) + +## Methods + +### `finalize(self)` + +Finalizes all associated function passes in the LLVM system. + +Beware that this destroys all associated passes even if another +pass manager is using those passes. This may result in a segfault. + +### `initialize(self)` + +Initializes all associated function passes in the LLVM system. + +### `run(self, fn)` + +Run all passes on the given function `fn`. + +## Static Factory Methods + +### `new(module)` + +Create a `FunctionPassManager` instance for a given `module`.
+ + diff --git a/docs/source/doc/llvm.passes.PassManager.md b/docs/source/doc/llvm.passes.PassManager.md new file mode 100644 index 0000000..1883516 --- /dev/null +++ b/docs/source/doc/llvm.passes.PassManager.md @@ -0,0 +1,23 @@ +--- +layout: page +title: PassManager (llvm.passes) +--- + +# llvm.passes.PassManager + +## Methods + +### `add(self, tgt_data_or_pass_id)` + +Add a pass by its ID. Pass IDs are defined as `PASS_*`. + +### `run(self, module)` + +Run all passes on the given `module`. + +## Static Factory Methods + +### `new()` + +Creates a new `PassManager` instance. + diff --git a/docs/source/doc/llvm.passes.PassManagerBuilder.md b/docs/source/doc/llvm.passes.PassManagerBuilder.md new file mode 100644 index 0000000..7992de4 --- /dev/null +++ b/docs/source/doc/llvm.passes.PassManagerBuilder.md @@ -0,0 +1,59 @@ +--- +layout: page +title: PassManagerBuilder (llvm.passes) +--- + +# llvm.passes.PassManagerBuilder + +Provides a simple API to populate pass managers for languages like C/C++. +Refer to +[LLVM API Documentation](http://llvm.org/docs/doxygen/html/classllvm_1_1PassManagerBuilder.html) +for details. + +## Methods + +### `populate(self, pm)` + +Populate a [FunctionPassManager](llvm.passes.FunctionPassManager.html) +or [PassManager](llvm.passes.PassManager.html) given as `pm`. + +### `use_inliner_with_threshold(self, threshold)` + +Use an inliner pass with the given `threshold`. + +## Properties + +The following properties can be overridden to customize how pass managers +are populated. + +### `disable_simplify_lib_calls` + +Boolean. Default is `False`. + +### `disable_unit_at_a_time` + +Boolean. Default is `False`. + +### `disable_unroll_loops` + +Boolean. Default is `False`. + +### `opt_level` + +Default is `2`. Valid values are 0-3. Corresponds to O0, O1, O2, O3 as in C/C++ +optimization options. + +### `size_level` + +Default is `0`. + +### `vectorize` + +Default is `False`.
+ +## Static Factory Methods + +### `new()` + +Creates a new `PassManagerBuilder` instance. + diff --git a/docs/source/doc/llvm_concepts.md b/docs/source/doc/llvm_concepts.md new file mode 100644 index 0000000..acd27ce --- /dev/null +++ b/docs/source/doc/llvm_concepts.md @@ -0,0 +1,242 @@ +--- +layout: page +title: LLVM Concepts +--- + +This section explains a few concepts related to LLVM, not specific +to llvm-py. + + +# Intermediate Representation + +The intermediate representation, or IR for short, is an in-memory data +structure that represents executable code. The IR data structures allow +for creation of types, constants, functions, function arguments, +instructions, global variables and so on. For example, to create a +function _sum_ that takes two integers and returns their sum, we need to +follow these steps: + +- create an integer type _ti_ of required bitwidth +- create a function type _tf_ which takes two _ti_ -s and returns + another _ti_ +- create a function of type _tf_ named _sum_ +- add a _basic block_ to the function +- using a helper object called an _instruction builder_, add two + instructions into the basic block: + . an instruction to add the two arguments and store the result into + a temporary variable + . a return instruction to return the value of the temporary variable + +(A basic block is a block of instructions.) + +LLVM has its own instruction set; the instructions used above (*add* +and *ret*) are from this set. The LLVM instructions are at a higher +level than the usual assembly language; for example there are +instructions related to variable argument handling, exception handling, +and garbage collection. These allow high-level languages to be +represented cleanly in the IR. + +* * * + +# SSA Form and PHI Nodes + +All LLVM instructions are represented in the _Static Single Assignment_ +(SSA) form. Essentially, this means that any variable can be assigned to +only once.
Such a representation facilitates better optimization, among +other benefits. + +A consequence of single assignment are PHI (Φ) nodes. These +are required when a variable can be assigned a different value based on +the path of control flow. For example, the value of *b* at the end of +execution of the snippet below: + +{% highlight c %} +a = 1; +if (v < 10) + a = 2; +b = a; +{% endhighlight %} + +cannot be determined statically. The value of '2' cannot be assigned to +the 'original' *a*, since *a* can be assigned to only once. There are +two *a* 's in there, and the last assignment has to choose between which +version to pick. This is accomplished by adding a PHI node: + +{% highlight c %} +a1 = 1; +if (v < 10) + a2 = 2; +b = PHI(a1, a2); +{% endhighlight %} + +The PHI node selects *a1* or *a2*, depending on where the control +reached the PHI node. The argument *a1* of the PHI node is associated +with the block *"a1 = 1;"* and *a2* with the block *"a2 = 2;"*. + +PHI nodes have to be explicitly created in the LLVM IR. Accordingly the +LLVM instruction set has an instruction called *phi*. + + +* * * + +# LLVM Assembly Language + +The LLVM IR can be represented offline in two formats + +- a textual, human-readable form, similar to assembly language, + called the LLVM assembly language (files with .ll extension) +- a binary form, called the LLVM bitcode (files with .bc extension) + +All three formats +(the in-memory IR, the LLVM assembly language and the LLVM bitcode) +represent the _same_ information. Each format can be +converted into the other two formats (using LLVM APIs). + +The [LLVM demo page](http://www.llvm.org/demo/) lets you type in C or C++ +code, converts it into LLVM IR and outputs the IR as LLVM assembly +language code. 
+ +Just to get a feel of the LLVM assembly language, here's a function in C, +and the corresponding LLVM assembly (as generated by the demo page): + +{% highlight c %} +/* compute sum of 1..n */ +unsigned sum(unsigned n) +{ + if (n == 0) + return 0; + else + return n + sum(n-1); +} +{% endhighlight %} + +The corresponding LLVM assembly: + +{% highlight llvm %} +; ModuleID = '/tmp/webcompile/_7149_0.bc' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-linux-gnu" + +define i32 @sum(i32 %n) nounwind readnone { +entry: + %0 = icmp eq i32 %n, 0 ; [#uses=1] + br i1 %0, label %bb2, label %bb1 + +bb1: ; preds = %entry + %1 = add i32 %n, -1 ; [#uses=2] + %2 = icmp eq i32 %1, 0 ; [#uses=1] + br i1 %2, label %sum.exit, label %bb1.i + +bb1.i: ; preds = %bb1 + %3 = add i32 %n, -2 ; [#uses=1] + %4 = tail call i32 @sum(i32 %3) nounwind ; [#uses=1] + %5 = add i32 %4, %1 ; [#uses=1] + br label %sum.exit + +sum.exit: ; preds = %bb1.i, %bb1 + %6 = phi i32 [ %5, %bb1.i ], [ 0, %bb1 ] ; [#uses=1] + %7 = add i32 %6, %n ; [#uses=1] + ret i32 %7 + +bb2: ; preds = %entry + ret i32 0 +} +{% endhighlight %} + +Note the usage of SSA form. The long string called `target datalayout` is a +specification of the platform ABI (like endianness, sizes of types, +alignment etc.). + +The [LLVM Language Reference](http://www.llvm.org/docs/LangRef.html) +defines the LLVM assembly language including the entire instruction set. + + +* * * + +# Modules + +[Modules](./llvm.core.Module.html), in the LLVM IR, are similar to a single *C* language source +file (.c file). A module contains: + +- functions (declarations and definitions) +- global variables and constants +- global type aliases for structures + +Modules are top-level containers; all executable code representation is +contained within modules. 
Modules may be combined (linked) together to +give a bigger resultant module. During this process LLVM attempts to +reconcile the references between the combined modules. + + +* * * + +# Optimization and Passes + + +LLVM provides quite a few optimization algorithms that work on the IR. +These algorithms are organized as _passes_. Each pass does something +specific, like combining redundant instructions. Passes need not always +optimize the IR; they can also do other operations like inserting +instrumentation code, or analyzing the IR (the result of which can be +used by passes that do optimizations) or even printing call graphs. + +This LLVM [documentation page](http://www.llvm.org/docs/Passes.html) +describes all the available passes, and what they do. + +LLVM does not automatically choose to run any passes, anytime. Passes +have to be explicitly selected and run on each module. This gives you +the flexibility to choose transformations and optimizations that are +most suitable for the code in the module. + +There is an LLVM binary called [opt](http://www.llvm.org/cmds/opt.html), +which lets you run passes on bitcode files from the command line. You +can write your own passes (in C/C++, as a shared library). This can be +loaded and executed by `opt`. (Although llvm-py does not allow you to +write your own passes, it does allow you to navigate the entire IR at +any stage, and perform any transforms on it as you like.) + +A "pass manager" is responsible for loading passes, selecting the +correct objects to run them on (for example, a pass may work only +on functions, individually) and actually running them. `opt` is a +command-line wrapper for the pass manager. + +LLVM defines two kinds of pass managers: + +* The [FunctionPassManager](http://llvm.org/docs/doxygen/html/classllvm_1_1FunctionPassManager.html) + manages function or basic-block passes. These lighter weight passes + can be used immediately after each generated function to reduce memory + footprint.
+ +* The [PassManager](http://llvm.org/docs/doxygen/html/classllvm_1_1PassManager.html) + manages module passes for optimizing the entire module. + +* * * + + +# Bitcode + +LLVM IR can be represented as a bitcode format for disk storage. It is [suitable for fast loading by JIT compiler](http://llvm.org/docs/LangRef.html#introduction). +See [LLVM documentation](http://llvm.org/docs/BitCodeFormat.html) +for details about the bitcode format. + + +* * * + +# Execution Engine, JIT and Interpreter + +The *execution engine* implements execution of LLVM IR through an +interpreter or a JIT dynamic compiler. An *execution engine* can +contain multiple modules. + +> **Note** +> +> +> Inter-module reference is not possible. That is, module `A` +> cannot call a function in module `B`, directly. + + +* * * + +**Next** -- [llvm-py Package](./llvm-py_package.html) + + diff --git a/docs/source/doc/types.md b/docs/source/doc/types.md new file mode 100644 index 0000000..c414061 --- /dev/null +++ b/docs/source/doc/types.md @@ -0,0 +1,146 @@ +--- +layout: page +title: Types +--- + +Types are what you think they are. An instance of [llvm.core.Type][], or +one of its derived classes, represents a type. llvm-py does not use as +many classes to represent types as does LLVM itself. Some types are +represented using [llvm.core.Type][] itself and the rest are represented +using derived classes of [llvm.core.Type][]. As usual, an instance is created +via one of the static methods of [Type][llvm.core.Type]. These methods return an +instance of either [llvm.core.Type][] itself or one of its derived +classes. + +The following table lists all the available types along with the static +method which has to be used to construct it and the name of the class whose +object is actually returned by the static method.
+ + +Name | Constructor Method | Class | +-----|:------------------:|:-----:| +integer of bitwidth *n* | Type.int(n) | [IntegerType][llvm.core.IntegerType] | +32-bit float | Type.float() | [Type][llvm.core.Type] | +64-bit double | Type.double() | [Type][llvm.core.Type] | +80-bit float | Type.x86_fp80() | [Type][llvm.core.Type] | +128-bit float (112-bit mantissa) | Type.fp128() | [Type][llvm.core.Type] | +128-bit float (two 64-bits) | Type.ppc_fp128() | [Type][llvm.core.Type] | +function | Type.function(r, p, v) | [FunctionType][llvm.core.FunctionType] | +unpacked struct | Type.struct(eltys, name) | [StructType][llvm.core.StructType] | +packed struct | Type.packed_struct(eltys, name) | [StructType][llvm.core.StructType] | +opaque struct | Type.opaque(name) | [StructType][llvm.core.StructType] | +array | Type.array(elty, count) | [ArrayType][llvm.core.ArrayType] | +pointer to value of type *pty* | Type.pointer(pty, addrspc) | [PointerType][llvm.core.PointerType] | +vector | Type.vector(elty, count) | [VectorType][llvm.core.VectorType] | +void | Type.void() | [Type][llvm.core.Type] | +label | Type.label() | [Type][llvm.core.Type] | + + +
+ + +The class hierarchy is: + + + Type + IntegerType + FunctionType + StructType + ArrayType + PointerType + VectorType + + + +* * * + +## An Example + +Here is an example that demonstrates the creation of types: + +{% highlight python %} +#!/usr/bin/env python + +# integers +int_ty = Type.int() +bool_ty = Type.int(1) +int_64bit = Type.int(64) + +# floats +sprec_real = Type.float() +dprec_real = Type.double() + +# arrays and vectors +intar_ty = Type.array( int_ty, 10 ) # "typedef int intar_ty[10];" +twodim = Type.array( intar_ty , 10 ) # "typedef int twodim[10][10];" +vec = Type.vector( int_ty, 10 ) + +# structures +s1_ty = Type.struct( [ int_ty, sprec_real ] ) + # "struct s1_ty { int v1; float v2; };" + +# pointers +intptr_ty = Type.pointer(int_ty) # "typedef int *intptr_ty;" + +# functions +f1 = Type.function( int_ty, [ int_ty ] ) + # functions that take 1 int_ty and return 1 int_ty + +f2 = Type.function( Type.void(), [ int_ty, int_ty ] ) + # functions that take 2 int_tys and return nothing + +f3 = Type.function( Type.void(), ( int_ty, int_ty ) ) + # same as f2; any iterable can be used + +fnargs = [ Type.pointer( Type.int(8) ) ] +printf = Type.function( Type.int(), fnargs, True ) # variadic function +{% endhighlight %} + +* * * + +## Another Example: Recursive Type + +The type system was rewritten in LLVM 3.0. +The old opaque type was removed. +Instead, identified `StructType` can now be defined without a body. +Doing so creates an opaque structure. +One can then set the body after the construction of a structure. + + +(See [LLVM Blog](http://blog.llvm.org/2011/11/llvm-30-type-system-rewrite.html) +for details about the new type system.) + +The following code defines an opaque structure, named "mystruct". +The body is defined after the construction using `StructType.set_body`. +The second subtype is a pointer to a "mystruct" type.
+ +{% highlight python %} +ts = Type.opaque('mystruct') +ts.set_body([Type.int(), Type.pointer(ts)]) +{% endhighlight %} + +* * * + +**Related Links** +[llvm.core.Type][], +[llvm.core.IntegerType][], +[llvm.core.FunctionType][], +[llvm.core.StructType][], +[llvm.core.ArrayType][], +[llvm.core.PointerType][], +[llvm.core.VectorType][], +[llvm.core.TypeHandle][] + + + + + +[llvm.core.Type]: llvm.core.Type.html +[llvm.core.IntegerType]: llvm.core.IntegerType.html +[llvm.core.FunctionType]: llvm.core.FunctionType.html +[llvm.core.StructType]: llvm.core.StructType.html +[llvm.core.ArrayType]: llvm.core.ArrayType.html +[llvm.core.PointerType]: llvm.core.PointerType.html +[llvm.core.VectorType]: llvm.core.VectorType.html +[llvm.core.TypeHandle]: llvm.core.TypeHandle.html + diff --git a/docs/source/doc/userguide.md b/docs/source/doc/userguide.md new file mode 100644 index 0000000..16c0bff --- /dev/null +++ b/docs/source/doc/userguide.md @@ -0,0 +1,159 @@ +--- +layout: page +title: User Guide +--- + +llvm-py provides Python bindings for LLVM. This document explains how you can set up and use it. A working knowledge of Python and a basic idea of LLVM is assumed. + +# Introduction + +[LLVM](http://www.llvm.org/) (Low-Level Virtual Machine) provides enough +infrastructure to use it as the backend for your compiled, or +JIT-compiled language. It provides extensive optimization support, and +static and dynamic (JIT) backends for many platforms. See the website at +<http://www.llvm.org/> to discover more. + +Python bindings for LLVM provide a gentler learning curve for working +with the LLVM APIs. It should also be easier to create working +prototypes and experimental languages using this medium. + +Together with [clang](http://clang.llvm.org/) or +[llvm-gcc](http://llvm.org/cmds/llvmgcc.html) it also provides a means +to quickly instrument C and C++ sources.
For example, llvm-gcc can be used to +generate the LLVM assembly for a given C source file, which can then be +loaded and manipulated (adding profiling code to every function, say) using +an llvm-py-based Python script. + +## License +Both LLVM and llvm-py are distributed under (different) permissive +open source licenses. llvm-py uses the +[new BSD license](http://opensource.org/licenses/bsd-license.php). More +information is available [here](https://github.com/numba/llvm-py/blob/master/LICENSE). + +## Platforms +llvm-py has been built/tested/reported to work on various GNU/Linux +flavours, BSD, Mac OS X; on i386 and amd64 architectures. Windows is not +supported, for a variety of reasons. + +## Versions +llvm-py 0.8.2 requires version 3.1 of LLVM. It may not work with previous +versions. + +llvm-py has been built and tested with Python 2.7. It should work with +earlier versions. It has not been tried with Python 3.x (patches welcome). + + +* * * + + +# Installation + +The Git repo of llvm-py is at <https://github.com/numba/llvm-py>. You'll +need to build and install it before it can be used. +At least the following will be required for this: + +- C and C++ compilers (gcc/g++) +- Python itself +- Python development files (headers and libraries) +- LLVM, either installed or built + +On Debian-based systems, the first three can be installed with the +command `sudo apt-get install gcc g++ python python-dev`. Ensure that your +distro's repository has the appropriate version of LLVM! + +It does not matter which compiler LLVM itself was built with (`g++`, +`llvm-g++` or any other); llvm-py can be built with any compiler. It has +been tried only with gcc/g++ though. + +## LLVM and `--enable-pic` + +The result of an LLVM build is a set of static libraries and object +files. llvm-py contains an extension package that is built into a +shared object (_core.so) which links to these static libraries and +object files.
It is therefore required that the LLVM libraries and +object files be built with the `-fPIC` option (generate position +independent code). Be sure to use the `--enable-pic` option while +configuring LLVM (default is no PIC), like this: + +{% highlight bash %} +~/llvm$ ./configure --enable-pic --enable-optimized +{% endhighlight %} + + +## llvm-config + +In order to build llvm-py, its build script needs to know from where it +can invoke the LLVM helper program, `llvm-config`. If you've installed +LLVM, then this will be available in your `PATH`, and nothing further +needs to be done. If you've built LLVM yourself, or for any reason +`llvm-config` is not in your `PATH`, you'll need to pass the full path +of `llvm-config` to the build script. + +You'll need to be 'root' to install llvm-py. Remember that your `PATH` +is different from that of 'root', so even if `llvm-config` is in your +`PATH`, it may not be available when you do `sudo`. + + + +## Steps + +Get version 3.1 of LLVM and build it. Make sure `--enable-pic` is passed to LLVM's `configure`. + +Get llvm-py and install it: + +{% highlight bash %} +$ git clone git@github.com:numba/llvm-py.git +$ cd llvm-py +$ python setup.py install +{% endhighlight %} + +If you need to tell the build script where `llvm-config` is, do it this +way: + +{% highlight bash %} +$ python setup.py install --user --llvm-config=/home/mdevan/llvm/Release/bin/llvm-config +{% endhighlight %} + +To build a debug version of llvm-py that links against the debug +libraries of LLVM, use this: + +{% highlight bash %} +$ python setup.py build -g --llvm-config=/home/mdevan/llvm/Debug/bin/llvm-config +$ python setup.py install --user --llvm-config=/home/mdevan/llvm/Debug/bin/llvm-config +{% endhighlight %} + +Be warned that debug binaries will be huge (100MB+)! They are required +only if you need to debug into LLVM also. + +`setup.py` is a standard Python distutils script.
See the Python +documentation regarding +[Installing Python Modules](http://docs.python.org/inst/inst.html) and +[Distributing Python Modules](http://docs.python.org/dist/dist.html) +for more information on such scripts. + +* * * + +# Uninstall # {#uninstall} + +If you installed llvm-py with the `--user` option, then llvm-py +will be present under `~/.local/lib/python2.7/site-packages`. +Otherwise, it might be under `/usr/lib/python2.7/site-packages` +or `/usr/local/lib/python2.7/site-packages`. The directory will +vary with your Python version and OS flavour. Look around. + +Once you've located the site-packages directory, the modules and +the "egg" can be removed like so: + +{% highlight bash %} +$ rm -rf <site-packages>/llvm <site-packages>/llvm_py-<version>.egg-info +{% endhighlight %} + + +See the [Python documentation](http://docs.python.org/install/index.html) +for more information. + +* * * + +**Next** -- [LLVM concepts](./llvm_concepts.html) + + diff --git a/docs/source/doc/values.md b/docs/source/doc/values.md new file mode 100644 index 0000000..39ba6b2 --- /dev/null +++ b/docs/source/doc/values.md @@ -0,0 +1,88 @@ +--- +layout: page +title: Values +--- + +[llvm.core.Value][] is the base class of all values computed by a program +that may be used as operands to other values. A value has a type +associated with it (an object of [llvm.core.Type][]). + +The class hierarchy is: + + + Value + User + Constant + ConstantExpr + ConstantAggregateZero + ConstantInt + ConstantFP + ConstantArray + ConstantStruct + ConstantVector + ConstantPointerNull + UndefValue + GlobalValue + GlobalVariable + Function + Instruction + CallOrInvokeInstruction + PHINode + SwitchInstruction + CompareInstruction + Argument + BasicBlock + + +The [Value][llvm.core.Value] class is abstract; it is not meant to be +instantiated. [User][llvm.core.User] is a [Value][llvm.core.Value] + that in turn uses (i.e., can refer to) other values (for +example, a constant expression 1+2 refers to two constant values 1 and 2).
+ +[Constant][llvm.core.Constant]-s represent constants that appear within code or +as initializers of globals. They are constructed using static methods of +[Constant][llvm.core.Constant]. Various types of constants are represented by +various subclasses of [Constant][llvm.core.Constant]. +However, most of them are empty and do not provide any additional attributes or methods over [Constant][llvm.core.Constant]. + +The [Function][functions] object represents an instance of a +function type. Such objects contain [Argument][llvm.core.Argument] objects, +which represent the actual, +local-variable-like arguments of the function (not to be confused with +the arguments returned by a function _type_ object -- these represent +the _type_ of the arguments). + +The various [Instruction][llvm.core.Instruction]-s are created +by the [Builder][llvm.core.Builder] class. Most +instructions are represented by [Instruction][llvm.core.Instruction] itself, +but there are a few subclasses that represent interesting instructions. + +[Value][llvm.core.Value] objects have a type (read-only), +and a name (read-write). 
+ +**Related Links** +[functions][], +[comparision][], +[llvm.core.Value][], +[llvm.core.User][], +[llvm.core.Constant][], +[llvm.core.GlobalValue][], +[llvm.core.GlobalVariable][], +[llvm.core.Argument][], +[llvm.core.Instruction][], +[llvm.core.Builder][], +[llvm.core.BasicBlock][] + + +[llvm.core.Type]: types.html +[functions]: functions.html +[comparision]: comparision.html +[llvm.core.Value]: llvm.core.Value.html +[llvm.core.User]: llvm.core.User.html +[llvm.core.Constant]: llvm.core.Constant.html +[llvm.core.GlobalValue]: llvm.core.GlobalValue.html +[llvm.core.GlobalVariable]: llvm.core.GlobalVariable.html +[llvm.core.Argument]: llvm.core.Argument.html +[llvm.core.Instruction]: llvm.core.Instruction.html +[llvm.core.Builder]: llvm.core.Builder.html +[llvm.core.BasicBlock]: llvm.core.BasicBlock.html diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..1ef8eaf --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,22 @@ +.. llvmpy documentation master file, created by + sphinx-quickstart on Wed Aug 8 17:33:58 2012. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to llvmpy's documentation! +================================== + +Contents: + +.. toctree:: + :maxdepth: 2 + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` +