diff --git a/documentation-tools/doc-converter/DocBook.hs b/documentation-tools/doc-converter/DocBook.hs new file mode 100644 index 0000000..151ae0c --- /dev/null +++ b/documentation-tools/doc-converter/DocBook.hs @@ -0,0 +1,1127 @@ +{-# LANGUAGE NoImplicitPrelude #-} +{- +Copyright (C) 2006-2018 John MacFarlane + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.DocBook + Copyright : Copyright (C) 2006-2018 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane + Stability : alpha + Portability : portable + +Conversion of DocBook XML to 'Pandoc' document. +-} +module Text.Pandoc.Readers.DocBook ( readDocBook ) where +import Prelude +import Control.Monad.State.Strict +import Data.Char (isSpace, toUpper) +import Data.Default +import Data.Either (rights) +import Data.Foldable (asum) +import Data.Generics +import Data.List (intersperse) +import Data.Maybe (fromMaybe) +import Data.Text (Text) +import qualified Data.Text as T +import Text.HTML.TagSoup.Entity (lookupEntity) +import Text.Pandoc.Builder +import Text.Pandoc.Class (PandocMonad) +import Text.Pandoc.Options +import Text.Pandoc.Shared (crFilter, safeRead) +import Text.TeXMath (readMathML, writeTeX) +import Text.XML.Light + +{- + +List of all DocBook tags, with [x] indicating implemented, +[o] meaning intentionally left unimplemented (pass through): + +[o] abbrev - An abbreviation, especially one followed by a period +[x] abstract - A summary +[o] accel - A graphical user interface (GUI) keyboard shortcut +[x] ackno - Acknowledgements in an Article +[o] acronym - An often pronounceable word made from the initial +[o] action - A response to a user event +[o] address - A real-world address, generally a postal address +[ ] affiliation - The institutional affiliation of an individual +[ ] alt - Text representation for a graphical element +[o] anchor - A spot in the document +[x] answer - An answer to a question posed in a QandASet +[x] appendix - An appendix in a Book or Article +[x] appendixinfo - Meta-information for an Appendix +[o] application - The name of a software program +[x] area - A region defined for a Callout in a graphic or code example +[x] areaset - A set of related areas in a graphic or code example +[x] areaspec - A collection of regions in a graphic or code example +[ ] arg - An argument in a CmdSynopsis +[x] article - An article +[x] articleinfo - Meta-information for an Article +[ ] artpagenums - The page numbers of an article as published +[x] attribution - The source of a block quote or epigraph +[ ] audiodata - Pointer to external audio data +[ ] audioobject - A wrapper for audio data and its associated meta-information +[x] author - The name of an individual author +[ ] authorblurb - A short description or note about an author +[x] authorgroup - Wrapper for author information when a document has + multiple authors or collaborators +[x] authorinitials - The initials or other short identifier for an author +[o] beginpage - The location of a page break in a print version of the document +[ ] bibliocoverage - The spatial or temporal coverage of a document +[x] bibliodiv - A section of a Bibliography +[x] biblioentry - An entry in a Bibliography +[x] bibliography - A bibliography +[ ] bibliographyinfo - Meta-information for a Bibliography +[ ] biblioid - An identifier for a document +[o] bibliolist - A wrapper for a set of bibliography entries +[ ] bibliomisc - Untyped bibliographic information +[x] bibliomixed - An entry in a Bibliography +[ ] bibliomset - A cooked container for related bibliographic information +[ ] biblioref - A cross reference to a bibliographic entry +[ ] bibliorelation - The relationship of a document to another +[ ] biblioset - A raw container for related bibliographic information +[ ] bibliosource - The source of a document +[ ] blockinfo - Meta-information for a block element +[x] blockquote - A quotation set off from the main text +[x] book - A book +[x] bookinfo - Meta-information for a Book +[x] bridgehead - A free-floating heading +[x] callout - A “called out” description of a marked Area +[x] calloutlist - A list of Callouts +[x] caption - A caption +[x] caution - A note of caution +[x] chapter - A chapter, as of a book +[x] chapterinfo - Meta-information for a Chapter +[ ] citation - An inline bibliographic reference to another published work +[ ] citebiblioid - A citation of a bibliographic identifier +[ ] citerefentry - A citation to a reference page +[ ] citetitle - The title of a cited work +[ ] city - The name of a city in an address +[x] classname - The name of a class, in the object-oriented programming sense +[ ] classsynopsis - The syntax summary for a class definition +[ ] classsynopsisinfo - Information supplementing the contents of + a ClassSynopsis +[ ] cmdsynopsis - A syntax summary for a software command +[ ] co - The location of a callout embedded in text +[x] code - An inline code fragment +[x] col - Specifications for a column in an HTML table +[x] colgroup - A group of columns in an HTML table +[ ] collab - Identifies a collaborator +[ ] collabname - The name of a collaborator +[ ] colophon - Text at the back of a book describing facts about its production +[x] colspec - Specifications for a column in a table +[x] command - The name of an executable program or other software command +[x] computeroutput - Data, generally text, displayed or presented by a computer +[ ] confdates - The dates of a conference for which a document was written +[ ] confgroup - A wrapper for document meta-information about a conference +[ ] confnum - An identifier, frequently numerical, associated with a conference for which a document was written +[ ] confsponsor - The sponsor of a conference for which a document was written +[ ] conftitle - The title of a conference for which a document was written +[x] constant - A programming or system constant +[ ] constraint - A constraint in an EBNF production +[ ] constraintdef - The definition of a constraint in an EBNF production +[ ] constructorsynopsis - A syntax summary for a constructor +[ ] contractnum - The contract number of a document +[ ] contractsponsor - The sponsor of a contract +[ ] contrib - A summary of the contributions made to a document by a + credited source +[ ] copyright - Copyright information about a document +[ ] coref - A cross reference to a co +[ ] corpauthor - A corporate author, as opposed to an individual +[ ] corpcredit - A corporation or organization credited in a document +[ ] corpname - The name of a corporation +[ ] country - The name of a country +[ ] database - The name of a database, or part of a database +[x] date - The date of publication or revision of a document +[ ] dedication - A wrapper for the dedication section of a book +[ ] destructorsynopsis - A syntax summary for a destructor +[ ] edition - The name or number of an edition of a document +[ ] editor - The name of the editor of a document +[x] email - An email address +[x] emphasis - Emphasized text +[x] entry - A cell in a table +[ ] entrytbl - A subtable appearing in place of an Entry in a table +[ ] envar - A software environment variable +[x] epigraph - A short inscription at the beginning of a document or component + note: also handle embedded attribution tag +[x] equation - A displayed mathematical equation +[ ] errorcode - An error code +[ ] errorname - An error name +[ ] errortext - An error message. +[ ] errortype - The classification of an error message +[ ] example - A formal example, with a title +[ ] exceptionname - The name of an exception +[ ] fax - A fax number +[ ] fieldsynopsis - The name of a field in a class definition +[x] figure - A formal figure, generally an illustration, with a title +[x] filename - The name of a file +[ ] firstname - The first name of a person +[ ] firstterm - The first occurrence of a term +[x] footnote - A footnote +[ ] footnoteref - A cross reference to a footnote (a footnote mark) +[x] foreignphrase - A word or phrase in a language other than the primary + language of the document +[x] formalpara - A paragraph with a title +[ ] funcdef - A function (subroutine) name and its return type +[ ] funcparams - Parameters for a function referenced through a function + pointer in a synopsis +[ ] funcprototype - The prototype of a function +[ ] funcsynopsis - The syntax summary for a function definition +[ ] funcsynopsisinfo - Information supplementing the FuncDefs of a FuncSynopsis +[x] function - The name of a function or subroutine, as in a + programming language +[x] glossary - A glossary +[x] glossaryinfo - Meta-information for a Glossary +[x] glossdef - A definition in a GlossEntry +[x] glossdiv - A division in a Glossary +[x] glossentry - An entry in a Glossary or GlossList +[x] glosslist - A wrapper for a set of GlossEntrys +[x] glosssee - A cross-reference from one GlossEntry to another +[x] glossseealso - A cross-reference from one GlossEntry to another +[x] glossterm - A glossary term +[ ] graphic - A displayed graphical object (not an inline) + Note: in DocBook v5 `graphic` is discarded +[ ] graphicco - A graphic that contains callout areas + Note: in DocBook v5 `graphicco` is discarded +[ ] group - A group of elements in a CmdSynopsis +[ ] guibutton - The text on a button in a GUI +[ ] guiicon - Graphic and/or text appearing as a icon in a GUI +[ ] guilabel - The text of a label in a GUI +[x] guimenu - The name of a menu in a GUI +[x] guimenuitem - The name of a terminal menu item in a GUI +[x] guisubmenu - The name of a submenu in a GUI +[ ] hardware - A physical part of a computer system +[ ] highlights - A summary of the main points of the discussed component +[ ] holder - The name of the individual or organization that holds a copyright +[o] honorific - The title of a person +[ ] html:form - An HTML form +[x] imagedata - Pointer to external image data (only `fileref` attribute + implemented but not `entityref` which would require parsing of the DTD) +[x] imageobject - A wrapper for image data and its associated meta-information +[ ] imageobjectco - A wrapper for an image object with callouts +[x] important - An admonition set off from the text +[x] index - An index +[x] indexdiv - A division in an index +[x] indexentry - An entry in an index +[x] indexinfo - Meta-information for an Index +[x] indexterm - A wrapper for terms to be indexed +[x] info - A wrapper for information about a component or other block. (DocBook v5) +[x] informalequation - A displayed mathematical equation without a title +[x] informalexample - A displayed example without a title +[ ] informalfigure - A untitled figure +[ ] informaltable - A table without a title +[ ] initializer - The initializer for a FieldSynopsis +[x] inlineequation - A mathematical equation or expression occurring inline +[ ] inlinegraphic - An object containing or pointing to graphical data + that will be rendered inline +[x] inlinemediaobject - An inline media object (video, audio, image, and so on) +[ ] interface - An element of a GUI +[ ] interfacename - The name of an interface +[ ] invpartnumber - An inventory part number +[ ] isbn - The International Standard Book Number of a document +[ ] issn - The International Standard Serial Number of a periodical +[ ] issuenum - The number of an issue of a journal +[x] itemizedlist - A list in which each entry is marked with a bullet or + other dingbat +[ ] itermset - A set of index terms in the meta-information of a document +[ ] jobtitle - The title of an individual in an organization +[x] keycap - The text printed on a key on a keyboard +[ ] keycode - The internal, frequently numeric, identifier for a key + on a keyboard +[x] keycombo - A combination of input actions +[ ] keysym - The symbolic name of a key on a keyboard +[ ] keyword - One of a set of keywords describing the content of a document +[ ] keywordset - A set of keywords describing the content of a document +[ ] label - A label on a Question or Answer +[ ] legalnotice - A statement of legal obligations or requirements +[ ] lhs - The left-hand side of an EBNF production +[ ] lineage - The portion of a person's name indicating a relationship to + ancestors +[ ] lineannotation - A comment on a line in a verbatim listing +[x] link - A hypertext link +[x] listitem - A wrapper for the elements of a list item +[x] literal - Inline text that is some literal value +[x] literallayout - A block of text in which line breaks and white space are + to be reproduced faithfully +[ ] lot - A list of the titles of formal objects (as tables or figures) in + a document +[ ] lotentry - An entry in a list of titles +[ ] manvolnum - A reference volume number +[x] markup - A string of formatting markup in text that is to be + represented literally +[x] mathphrase - A mathematical phrase, an expression that can be represented + with ordinary text and a small amount of markup +[ ] medialabel - A name that identifies the physical medium on which some + information resides +[x] mediaobject - A displayed media object (video, audio, image, etc.) +[ ] mediaobjectco - A media object that contains callouts +[x] member - An element of a simple list +[x] menuchoice - A selection or series of selections from a menu +[ ] methodname - The name of a method +[ ] methodparam - Parameters to a method +[ ] methodsynopsis - A syntax summary for a method +[x] mml:math - A MathML equation +[ ] modespec - Application-specific information necessary for the + completion of an OLink +[ ] modifier - Modifiers in a synopsis +[ ] mousebutton - The conventional name of a mouse button +[ ] msg - A message in a message set +[ ] msgaud - The audience to which a message in a message set is relevant +[ ] msgentry - A wrapper for an entry in a message set +[ ] msgexplan - Explanatory material relating to a message in a message set +[ ] msginfo - Information about a message in a message set +[ ] msglevel - The level of importance or severity of a message in a message set +[ ] msgmain - The primary component of a message in a message set +[ ] msgorig - The origin of a message in a message set +[ ] msgrel - A related component of a message in a message set +[ ] msgset - A detailed set of messages, usually error messages +[ ] msgsub - A subcomponent of a message in a message set +[ ] msgtext - The actual text of a message component in a message set +[ ] nonterminal - A non-terminal in an EBNF production +[x] note - A message set off from the text +[ ] objectinfo - Meta-information for an object +[ ] olink - A link that addresses its target indirectly, through an entity +[ ] ooclass - A class in an object-oriented programming language +[ ] ooexception - An exception in an object-oriented programming language +[ ] oointerface - An interface in an object-oriented programming language +[x] option - An option for a software command +[x] optional - Optional information +[x] orderedlist - A list in which each entry is marked with a sequentially + incremented label +[ ] orgdiv - A division of an organization +[ ] orgname - The name of an organization other than a corporation +[ ] otheraddr - Uncategorized information in address +[ ] othercredit - A person or entity, other than an author or editor, + credited in a document +[ ] othername - A component of a persons name that is not a first name, + surname, or lineage +[ ] package - A package +[ ] pagenums - The numbers of the pages in a book, for use in a bibliographic + entry +[x] para - A paragraph +[ ] paramdef - Information about a function parameter in a programming language +[x] parameter - A value or a symbolic reference to a value +[ ] part - A division in a book +[ ] partinfo - Meta-information for a Part +[ ] partintro - An introduction to the contents of a part +[ ] personblurb - A short description or note about a person +[ ] personname - The personal name of an individual +[ ] phone - A telephone number +[ ] phrase - A span of text +[ ] pob - A post office box in an address +[ ] postcode - A postal code in an address +[x] preface - Introductory matter preceding the first chapter of a book +[ ] prefaceinfo - Meta-information for a Preface +[ ] primary - The primary word or phrase under which an index term should be + sorted +[ ] primaryie - A primary term in an index entry, not in the text +[ ] printhistory - The printing history of a document +[ ] procedure - A list of operations to be performed in a well-defined sequence +[ ] production - A production in a set of EBNF productions +[ ] productionrecap - A cross-reference to an EBNF production +[ ] productionset - A set of EBNF productions +[ ] productname - The formal name of a product +[ ] productnumber - A number assigned to a product +[x] programlisting - A literal listing of all or part of a program +[ ] programlistingco - A program listing with associated areas used in callouts +[x] prompt - A character or string indicating the start of an input field in + a computer display +[ ] property - A unit of data associated with some part of a computer system +[ ] pubdate - The date of publication of a document +[ ] publisher - The publisher of a document +[ ] publishername - The name of the publisher of a document +[ ] pubsnumber - A number assigned to a publication other than an ISBN or ISSN + or inventory part number +[x] qandadiv - A titled division in a QandASet +[o] qandaentry - A question/answer set within a QandASet +[o] qandaset - A question-and-answer set +[x] question - A question in a QandASet +[x] quote - An inline quotation +[ ] refclass - The scope or other indication of applicability of a + reference entry +[ ] refdescriptor - A description of the topic of a reference page +[ ] refentry - A reference page (originally a UNIX man-style reference page) +[ ] refentryinfo - Meta-information for a Refentry +[ ] refentrytitle - The title of a reference page +[ ] reference - A collection of reference entries +[ ] referenceinfo - Meta-information for a Reference +[ ] refmeta - Meta-information for a reference entry +[ ] refmiscinfo - Meta-information for a reference entry other than the title + and volume number +[ ] refname - The name of (one of) the subject(s) of a reference page +[ ] refnamediv - The name, purpose, and classification of a reference page +[ ] refpurpose - A short (one sentence) synopsis of the topic of a reference + page +[x] refsect1 - A major subsection of a reference entry +[x] refsect1info - Meta-information for a RefSect1 +[x] refsect2 - A subsection of a RefSect1 +[x] refsect2info - Meta-information for a RefSect2 +[x] refsect3 - A subsection of a RefSect2 +[x] refsect3info - Meta-information for a RefSect3 +[x] refsection - A recursive section in a refentry +[x] refsectioninfo - Meta-information for a refsection +[ ] refsynopsisdiv - A syntactic synopsis of the subject of the reference page +[ ] refsynopsisdivinfo - Meta-information for a RefSynopsisDiv +[x] releaseinfo - Information about a particular release of a document +[ ] remark - A remark (or comment) intended for presentation in a draft + manuscript +[ ] replaceable - Content that may or must be replaced by the user +[ ] returnvalue - The value returned by a function +[ ] revdescription - A extended description of a revision to a document +[ ] revhistory - A history of the revisions to a document +[ ] revision - An entry describing a single revision in the history of the + revisions to a document +[ ] revnumber - A document revision number +[ ] revremark - A description of a revision to a document +[ ] rhs - The right-hand side of an EBNF production +[x] row - A row in a table +[ ] sbr - An explicit line break in a command synopsis +[x] screen - Text that a user sees or might see on a computer screen +[o] screenco - A screen with associated areas used in callouts +[o] screeninfo - Information about how a screen shot was produced +[ ] screenshot - A representation of what the user sees or might see on a + computer screen +[ ] secondary - A secondary word or phrase in an index term +[ ] secondaryie - A secondary term in an index entry, rather than in the text +[x] sect1 - A top-level section of document +[x] sect1info - Meta-information for a Sect1 +[x] sect2 - A subsection within a Sect1 +[x] sect2info - Meta-information for a Sect2 +[x] sect3 - A subsection within a Sect2 +[x] sect3info - Meta-information for a Sect3 +[x] sect4 - A subsection within a Sect3 +[x] sect4info - Meta-information for a Sect4 +[x] sect5 - A subsection within a Sect4 +[x] sect5info - Meta-information for a Sect5 +[x] section - A recursive section +[x] sectioninfo - Meta-information for a recursive section +[x] see - Part of an index term directing the reader instead to another entry + in the index +[x] seealso - Part of an index term directing the reader also to another entry + in the index +[ ] seealsoie - A See also entry in an index, rather than in the text +[ ] seeie - A See entry in an index, rather than in the text +[x] seg - An element of a list item in a segmented list +[x] seglistitem - A list item in a segmented list +[x] segmentedlist - A segmented list, a list of sets of elements +[x] segtitle - The title of an element of a list item in a segmented list +[ ] seriesvolnums - Numbers of the volumes in a series of books +[ ] set - A collection of books +[ ] setindex - An index to a set of books +[ ] setindexinfo - Meta-information for a SetIndex +[ ] setinfo - Meta-information for a Set +[ ] sgmltag - A component of SGML markup +[ ] shortaffil - A brief description of an affiliation +[ ] shortcut - A key combination for an action that is also accessible through + a menu +[ ] sidebar - A portion of a document that is isolated from the main + narrative flow +[ ] sidebarinfo - Meta-information for a Sidebar +[x] simpara - A paragraph that contains only text and inline markup, no block + elements +[x] simplelist - An undecorated list of single words or short phrases +[ ] simplemsgentry - A wrapper for a simpler entry in a message set +[ ] simplesect - A section of a document with no subdivisions +[ ] spanspec - Formatting information for a spanned column in a table +[ ] state - A state or province in an address +[ ] step - A unit of action in a procedure +[ ] stepalternatives - Alternative steps in a procedure +[ ] street - A street address in an address +[ ] structfield - A field in a structure (in the programming language sense) +[ ] structname - The name of a structure (in the programming language sense) +[ ] subject - One of a group of terms describing the subject matter of a + document +[ ] subjectset - A set of terms describing the subject matter of a document +[ ] subjectterm - A term in a group of terms describing the subject matter of + a document +[x] subscript - A subscript (as in H2O, the molecular formula for water) +[ ] substeps - A wrapper for steps that occur within steps in a procedure +[x] subtitle - The subtitle of a document +[x] superscript - A superscript (as in x2, the mathematical notation for x + multiplied by itself) +[ ] surname - A family name; in western cultures the last name +[ ] svg:svg - An SVG graphic +[x] symbol - A name that is replaced by a value before processing +[ ] synopfragment - A portion of a CmdSynopsis broken out from the main body + of the synopsis +[ ] synopfragmentref - A reference to a fragment of a command synopsis +[ ] synopsis - A general-purpose element for representing the syntax of + commands or functions +[ ] systemitem - A system-related item or term +[ ] table - A formal table in a document +[ ] task - A task to be completed +[ ] taskprerequisites - The prerequisites for a task +[ ] taskrelated - Information related to a task +[ ] tasksummary - A summary of a task +[x] tbody - A wrapper for the rows of a table or informal table +[x] td - A table entry in an HTML table +[x] term - The word or phrase being defined or described in a variable list +[ ] termdef - An inline term definition +[ ] tertiary - A tertiary word or phrase in an index term +[ ] tertiaryie - A tertiary term in an index entry, rather than in the text +[ ] textdata - Pointer to external text data +[ ] textobject - A wrapper for a text description of an object and its + associated meta-information +[ ] tfoot - A table footer consisting of one or more rows +[x] tgroup - A wrapper for the main content of a table, or part of a table +[x] th - A table header entry in an HTML table +[x] thead - A table header consisting of one or more rows +[x] tip - A suggestion to the user, set off from the text +[x] title - The text of the title of a section of a document or of a formal + block-level element +[x] titleabbrev - The abbreviation of a Title +[x] toc - A table of contents +[x] tocback - An entry in a table of contents for a back matter component +[x] tocchap - An entry in a table of contents for a component in the body of + a document +[x] tocentry - A component title in a table of contents +[x] tocfront - An entry in a table of contents for a front matter component +[x] toclevel1 - A top-level entry within a table of contents entry for a + chapter-like component +[x] toclevel2 - A second-level entry within a table of contents entry for a + chapter-like component +[x] toclevel3 - A third-level entry within a table of contents entry for a + chapter-like component +[x] toclevel4 - A fourth-level entry within a table of contents entry for a + chapter-like component +[x] toclevel5 - A fifth-level entry within a table of contents entry for a + chapter-like component +[x] tocpart - An entry in a table of contents for a part of a book +[ ] token - A unit of information +[x] tr - A row in an HTML table +[ ] trademark - A trademark +[x] type - The classification of a value +[x] ulink - A link that addresses its target by means of a URL + (Uniform Resource Locator) +[x] uri - A Uniform Resource Identifier +[x] userinput - Data entered by the user +[x] varargs - An empty element in a function synopsis indicating a variable + number of arguments +[x] variablelist - A list in which each entry is composed of a set of one or + more terms and an associated description +[x] varlistentry - A wrapper for a set of terms and the associated description + in a variable list +[x] varname - The name of a variable +[ ] videodata - Pointer to external video data +[ ] videoobject - A wrapper for video data and its associated meta-information +[ ] void - An empty element in a function synopsis indicating that the + function in question takes no arguments +[ ] volumenum - The volume number of a document in a set (as of books in a set + or articles in a journal) +[x] warning - An admonition set off from the text +[x] wordasword - A word meant specifically as a word and not representing + anything else +[x] xref - A cross reference to another part of the document +[ ] year - The year of publication of a document +[x] ?asciidoc-br? - line break from asciidoc docbook output +-} + +type DB m = StateT DBState m + +data DBState = DBState{ dbSectionLevel :: Int + , dbQuoteType :: QuoteType + , dbMeta :: Meta + , dbBook :: Bool + , dbFigureTitle :: Inlines + , dbContent :: [Content] + } deriving Show + +instance Default DBState where + def = DBState{ dbSectionLevel = 0 + , dbQuoteType = DoubleQuote + , dbMeta = mempty + , dbBook = False + , dbFigureTitle = mempty + , dbContent = [] } + + +readDocBook :: PandocMonad m => ReaderOptions -> Text -> m Pandoc +readDocBook _ inp = do + let tree = normalizeTree . parseXML . handleInstructions + $ T.unpack $ crFilter inp + (bs, st') <- flip runStateT (def{ dbContent = tree }) $ mapM parseBlock tree + return $ Pandoc (dbMeta st') (toList . mconcat $ bs) + +-- We treat specially (issue #1236), converting it +-- to
, since xml-light doesn't parse the instruction correctly. +-- Other xml instructions are simply removed from the input stream. +handleInstructions :: String -> String +handleInstructions ('<':'?':'a':'s':'c':'i':'i':'d':'o':'c':'-':'b':'r':'?':'>':xs) = '<':'b':'r':'/':'>': handleInstructions xs +handleInstructions xs = case break (=='<') xs of + (ys, []) -> ys + ([], '<':zs) -> '<' : handleInstructions zs + (ys, zs) -> ys ++ handleInstructions zs + +getFigure :: PandocMonad m => Element -> DB m Blocks +getFigure e = do + tit <- case filterChild (named "title") e of + Just t -> getInlines t + Nothing -> return mempty + modify $ \st -> st{ dbFigureTitle = tit } + res <- getBlocks e + modify $ \st -> st{ dbFigureTitle = mempty } + return res + +-- normalize input, consolidating adjacent Text and CRef elements +normalizeTree :: [Content] -> [Content] +normalizeTree = everywhere (mkT go) + where go :: [Content] -> [Content] + go (Text (CData CDataRaw _ _):xs) = xs + go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) = + Text (CData CDataText (s1 ++ s2) z):xs + go (Text (CData CDataText s1 z):CRef r:xs) = + Text (CData CDataText (s1 ++ convertEntity r) z):xs + go (CRef r:Text (CData CDataText s1 z):xs) = + Text (CData CDataText (convertEntity r ++ s1) z):xs + go (CRef r1:CRef r2:xs) = + Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs + go xs = xs + +convertEntity :: String -> String +convertEntity e = Data.Maybe.fromMaybe (map toUpper e) (lookupEntity e) + +-- convenience function to get an attribute value, defaulting to "" +attrValue :: String -> Element -> String +attrValue attr elt = + fromMaybe "" (lookupAttrBy (\x -> qName x == attr) (elAttribs elt)) + +-- convenience function +named :: String -> Element -> Bool +named s e = qName (elName e) == s + +-- + +addMetadataFromElement :: PandocMonad m => Element -> DB m Blocks +addMetadataFromElement e = do + case filterChild (named "title") e of + Nothing -> return () + Just z -> do + getInlines z >>= addMeta "title" + addMetaField "subtitle" z + case filterChild (named "authorgroup") e of + Nothing -> return () + Just z -> addMetaField "author" z + addMetaField "subtitle" e + addMetaField "author" e + addMetaField "date" e + addMetaField "release" e + return mempty + where addMetaField fieldname elt = + case filterChildren (named fieldname) elt of + [] -> return () + [z] -> getInlines z >>= addMeta fieldname + zs -> mapM getInlines zs >>= addMeta fieldname + +addMeta :: PandocMonad m => ToMetaValue a => String -> a -> DB m () +addMeta field val = modify (setMeta field val) + +instance HasMeta DBState where + setMeta field v s = s {dbMeta = setMeta field v (dbMeta s)} + deleteMeta field s = s {dbMeta = deleteMeta field (dbMeta s)} + +isBlockElement :: Content -> Bool +isBlockElement (Elem e) = qName (elName e) `elem` blocktags + where blocktags = ["toc","index","para","formalpara","simpara", + "ackno","epigraph","blockquote","bibliography","bibliodiv", + "biblioentry","glossee","glosseealso","glossary", + "glossdiv","glosslist","chapter","appendix","preface", + "bridgehead","sect1","sect2","sect3","sect4","sect5","section", + "refsect1","refsect2","refsect3","refsection", + "important","caution","note","tip","warning","qandadiv", + "question","answer","abstract","itemizedlist","orderedlist", + "variablelist","article","book","table","informaltable", + "informalexample", "linegroup", + "screen","programlisting","example","calloutlist"] +isBlockElement _ = False + +-- Trim leading and trailing newline characters +trimNl :: String -> String +trimNl = reverse . go . reverse . go + where go ('\n':xs) = xs + go xs = xs + +-- meld text into beginning of first paragraph of Blocks. +-- assumes Blocks start with a Para; if not, does nothing. +addToStart :: Inlines -> Blocks -> Blocks +addToStart toadd bs = + case toList bs of + (Para xs : rest) -> para (toadd <> fromList xs) <> fromList rest + _ -> bs + +-- function that is used by both mediaobject (in parseBlock) +-- and inlinemediaobject (in parseInline) +-- A DocBook mediaobject is a wrapper around a set of alternative presentations +getMediaobject :: PandocMonad m => Element -> DB m Inlines +getMediaobject e = do + (imageUrl, attr) <- + case filterChild (named "imageobject") e of + Nothing -> return (mempty, nullAttr) + Just z -> case filterChild (named "imagedata") z of + Nothing -> return (mempty, nullAttr) + Just i -> let atVal a = attrValue a i + w = case atVal "width" of + "" -> [] + d -> [("width", d)] + h = case atVal "depth" of + "" -> [] + d -> [("height", d)] + atr = (atVal "id", words $ atVal "role", w ++ h) + in return (atVal "fileref", atr) + let getCaption el = case filterChild (\x -> named "caption" x + || named "textobject" x + || named "alt" x) el of + Nothing -> return mempty + Just z -> mconcat <$> + mapM parseInline (elContent z) + figTitle <- gets dbFigureTitle + let (caption, title) = if isNull figTitle + then (getCaption e, "") + else (return figTitle, "fig:") + fmap (imageWith attr imageUrl title) caption + +getBlocks :: PandocMonad m => Element -> DB m Blocks +getBlocks e = mconcat <$> + mapM parseBlock (elContent e) + + +parseBlock :: PandocMonad m => Content -> DB m Blocks +parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE +parseBlock (Text (CData _ s _)) = if all isSpace s + then return mempty + else return $ plain $ trimInlines $ text s +parseBlock (CRef x) = return $ plain $ str $ map toUpper x +parseBlock (Elem e) = + case qName (elName e) of + "remark"-> return mempty + "toc" -> return mempty -- skip TOC, since in pandoc it's autogenerated + "index" -> return mempty -- skip index, since page numbers meaningless + "para" -> parseMixed para (elContent e) + "formalpara" -> do + tit <- case filterChild (named "title") e of + Just t -> (para . strong . (<> str ".")) <$> + getInlines t + Nothing -> return mempty + (tit <>) <$> parseMixed para (elContent e) + "simpara" -> parseMixed para (elContent e) + "ackno" -> parseMixed para (elContent e) + "epigraph" -> parseBlockquote + "blockquote" -> parseBlockquote + "attribution" -> return mempty + "titleabbrev" -> return mempty + "authorinitials" -> return mempty + "bibliography" -> sect 0 + "bibliodiv" -> sect 1 + "biblioentry" -> parseMixed para (elContent e) + "bibliomixed" -> parseMixed para (elContent e) + "equation" -> para <$> equation e displayMath + "informalequation" -> para <$> equation e displayMath + "glosssee" -> para . (\ils -> text "See " <> ils <> str ".") + <$> getInlines e + "glossseealso" -> para . (\ils -> text "See also " <> ils <> str ".") + <$> getInlines e + "glossary" -> sect 0 + "glossdiv" -> definitionList <$> + mapM parseGlossEntry (filterChildren (named "glossentry") e) + "glosslist" -> definitionList <$> + mapM parseGlossEntry (filterChildren (named "glossentry") e) + "chapter" -> sect 0 + "appendix" -> sect 0 + "preface" -> sect 0 + "bridgehead" -> para . strong <$> getInlines e + "sect1" -> sect 1 + "sect2" -> sect 2 + "sect3" -> sect 3 + "sect4" -> sect 4 + "sect5" -> sect 5 + "section" -> gets dbSectionLevel >>= sect . (+1) + "refsect1" -> sect 1 + "refsect2" -> sect 2 + "refsect3" -> sect 3 + "refsection" -> gets dbSectionLevel >>= sect . (+1) + "important" -> divWith ("", ["important"], []) <$> + getBlocks e + "caution" -> blockQuote . (para (strong $ str "Caution") <>) + <$> getBlocks e + "note" -> divWith ("", ["note"], []) <$> + getBlocks e + "tip" -> blockQuote . (para (strong $ str "Tip") <>) + <$> getBlocks e + "warning" -> divWith ("", ["warning"], []) <$> + getBlocks e + "area" -> return mempty + "areaset" -> return mempty + "areaspec" -> return mempty + "qandadiv" -> gets dbSectionLevel >>= sect . (+1) + "question" -> addToStart (strong (str "Q:") <> str " ") <$> getBlocks e + "answer" -> addToStart (strong (str "A:") <> str " ") <$> getBlocks e + "abstract" -> blockQuote <$> getBlocks e + "calloutlist" -> bulletList <$> callouts + "itemizedlist" -> bulletList <$> listitems + "orderedlist" -> do + let listStyle = case attrValue "numeration" e of + "arabic" -> Decimal + "loweralpha" -> LowerAlpha + "upperalpha" -> UpperAlpha + "lowerroman" -> LowerRoman + "upperroman" -> UpperRoman + _ -> Decimal + let start = fromMaybe 1 $ + (attrValue "override" <$> filterElement (named "listitem") e) + >>= safeRead + orderedListWith (start,listStyle,DefaultDelim) + <$> listitems + "variablelist" -> definitionList <$> deflistitems + "figure" -> getFigure e + "mediaobject" -> para <$> getMediaobject e + "caption" -> return mempty + "info" -> addMetadataFromElement e + "articleinfo" -> addMetadataFromElement e + "sectioninfo" -> return mempty -- keywords & other metadata + "refsectioninfo" -> return mempty -- keywords & other metadata + "refsect1info" -> return mempty -- keywords & other metadata + "refsect2info" -> return mempty -- keywords & other metadata + "refsect3info" -> return mempty -- keywords & other metadata + "sect1info" -> return mempty -- keywords & other metadata + "sect2info" -> return mempty -- keywords & other metadata + "sect3info" -> return mempty -- keywords & other metadata + "sect4info" -> return mempty -- keywords & other metadata + "sect5info" -> return mempty -- keywords & other metadata + "chapterinfo" -> return mempty -- keywords & other metadata + "glossaryinfo" -> return mempty -- keywords & other metadata + "appendixinfo" -> return mempty -- keywords & other metadata + "bookinfo" -> addMetadataFromElement e + "article" -> modify (\st -> st{ dbBook = False }) >> + addMetadataFromElement e >> getBlocks e + "book" -> modify (\st -> st{ dbBook = True }) >> + addMetadataFromElement e >> getBlocks e + "table" -> parseTable + "informaltable" -> parseTable + "informalexample" -> divWith ("", ["informalexample"], []) <$> + getBlocks e + "linegroup" -> lineBlock <$> lineItems + "literallayout" -> codeBlockWithLang + "screen" -> codeBlockWithLang + "programlisting" -> codeBlockWithLang + "synopsis" -> synopsis + "?xml" -> return mempty + "title" -> return mempty -- handled in parent element + "subtitle" -> return mempty -- handled in parent element + _ -> getBlocks e + where parseMixed container conts = do + let (ils,rest) = break isBlockElement conts + ils' <- (trimInlines . mconcat) <$> mapM parseInline ils + let p = if ils' == mempty then mempty else container ils' + case rest of + [] -> return p + (r:rs) -> do + b <- parseBlock r + x <- parseMixed container rs + return $ p <> b <> x + codeBlockWithLang = do + let classes' = case attrValue "language" e of + "" -> [] + x -> [x] + return $ codeBlockWith (attrValue "id" e, classes', []) + $ trimNl $ strContentRecursive e + + synopsis = do + return $ codeBlockWith (attrValue "id" e, ["synopsis"], []) + $ trimNl $ strContentRecursive e + + parseBlockquote = do + attrib <- case filterChild (named "attribution") e of + Nothing -> return mempty + Just z -> (para . (str "— " <>) . mconcat) + <$> + mapM parseInline (elContent z) + contents <- getBlocks e + return $ blockQuote (contents <> attrib) + listitems = mapM getBlocks $ filterChildren (named "listitem") e + callouts = mapM getBlocks $ filterChildren (named "callout") e + deflistitems = mapM parseVarListEntry $ filterChildren + (named "varlistentry") e + parseVarListEntry e' = do + let terms = filterChildren (named "term") e' + let items = filterChildren (named "listitem") e' + terms' <- mapM getInlines terms + items' <- mapM getBlocks items + return (mconcat $ intersperse (str "; ") terms', items') + parseGlossEntry e' = do + let terms = filterChildren (named "glossterm") e' + let items = filterChildren (named "glossdef") e' + terms' <- mapM getInlines terms + items' <- mapM getBlocks items + return (mconcat $ intersperse (str "; ") terms', items') + parseTable = do + let isCaption x = named "title" x || named "caption" x + caption <- case filterChild isCaption e of + Just t -> getInlines t + Nothing -> return mempty + let e' = fromMaybe e $ filterChild (named "tgroup") e + let isColspec x = named "colspec" x || named "col" x + let colspecs = case filterChild (named "colgroup") e' of + Just c -> filterChildren isColspec c + _ -> filterChildren isColspec e' + let isRow x = named "row" x || named "tr" x + headrows <- case filterChild (named "thead") e' of + Just h -> case filterChild isRow h of + Just x -> parseRow x + Nothing -> return [] + Nothing -> return [] + bodyrows <- case filterChild (named "tbody") e' of + Just b -> mapM parseRow + $ filterChildren isRow b + Nothing -> mapM parseRow + $ filterChildren isRow e' + let toAlignment c = case findAttr (unqual "align") c of + Just "left" -> AlignLeft + Just "right" -> AlignRight + Just "center" -> AlignCenter + _ -> AlignDefault + let toWidth c = case findAttr (unqual "colwidth") c of + Just w -> fromMaybe 0 + $ safeRead $ '0': filter (\x -> + (x >= '0' && x <= '9') + || x == '.') w + Nothing -> 0 :: Double + let numrows = case bodyrows of + [] -> 0 + xs -> maximum $ map length xs + let aligns = case colspecs of + [] -> replicate numrows AlignDefault + cs -> map toAlignment cs + let widths = case colspecs of + [] -> replicate numrows 0 + cs -> let ws = map toWidth cs + tot = sum ws + in if all (> 0) ws + then map (/ tot) ws + else replicate numrows 0 + let headrows' = if null headrows + then replicate numrows mempty + else headrows + return $ table caption (zip aligns widths) + headrows' bodyrows + isEntry x = named "entry" x || named "td" x || named "th" x + parseRow = mapM (parseMixed plain . elContent) . filterChildren isEntry + sect n = do isbook <- gets dbBook + let n' = if isbook || n == 0 then n + 1 else n + headerText <- case filterChild (named "title") e `mplus` + (filterChild (named "info") e >>= + filterChild (named "title")) of + Just t -> getInlines t + Nothing -> return mempty + modify $ \st -> st{ dbSectionLevel = n } + b <- getBlocks e + let ident = attrValue "id" e + modify $ \st -> st{ dbSectionLevel = n - 1 } + return $ headerWith (ident,[],[]) n' headerText <> b + lineItems = mapM getInlines $ filterChildren (named "line") e + +getInlines :: PandocMonad m => Element -> DB m Inlines +getInlines e' = (trimInlines . mconcat) <$> + mapM parseInline (elContent e') + +strContentRecursive :: Element -> String +strContentRecursive = strContent . + (\e' -> e'{ elContent = map elementToStr $ elContent e' }) + +elementToStr :: Content -> Content +elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing +elementToStr x = x + +parseInline :: PandocMonad m => Content -> DB m Inlines +parseInline (Text (CData _ s _)) = return $ text s +parseInline (CRef ref) = + return $ maybe (text $ map toUpper ref) text $ lookupEntity ref +parseInline (Elem e) = + case qName (elName e) of + "equation" -> equation e displayMath + "informalequation" -> equation e displayMath + "inlineequation" -> equation e math + "subscript" -> subscript <$> innerInlines + "superscript" -> superscript <$> innerInlines + "inlinemediaobject" -> getMediaobject e + "quote" -> do + qt <- gets dbQuoteType + let qt' = if qt == SingleQuote then DoubleQuote else SingleQuote + modify $ \st -> st{ dbQuoteType = qt' } + contents <- innerInlines + modify $ \st -> st{ dbQuoteType = qt } + return $ if qt == SingleQuote + then singleQuoted contents + else doubleQuoted contents + "simplelist" -> simpleList + "segmentedlist" -> segmentedList + "classname" -> codeWithLang + "code" -> codeWithLang + "filename" -> codeWithLang + "literal" -> codeWithLang + "computeroutput" -> codeWithLang + "prompt" -> codeWithLang + "parameter" -> codeWithLang + "option" -> codeWithLang + "optional" -> do x <- getInlines e + return $ str "[" <> x <> str "]" + "markup" -> codeWithLang + "wordasword" -> emph <$> innerInlines + "command" -> codeWithLang + "varname" -> codeWithLang + "function" -> codeWithLang + "type" -> codeWithLang + "symbol" -> codeWithLang + "constant" -> codeWithLang + "userinput" -> codeWithLang + "varargs" -> return $ code "(...)" + "keycap" -> return (str $ strContent e) + "keycombo" -> keycombo <$> + mapM parseInline (elContent e) + "menuchoice" -> menuchoice <$> + mapM parseInline ( + filter isGuiMenu $ elContent e) + "xref" -> do + content <- dbContent <$> get + let linkend = attrValue "linkend" e + let title = case attrValue "endterm" e of + "" -> maybe "???" xrefTitleByElem + (findElementById linkend content) + endterm -> maybe "???" strContent + (findElementById endterm content) + return $ link ('#' : linkend) "" (text title) + "email" -> return $ link ("mailto:" ++ strContent e) "" + $ str $ strContent e + "uri" -> return $ link (strContent e) "" $ str $ strContent e + "ulink" -> link (attrValue "url" e) "" <$> innerInlines + "link" -> do + ils <- innerInlines + let href = case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of + Just h -> h + _ -> '#' : attrValue "linkend" e + let ils' = if ils == mempty then str href else ils + let attr = (attrValue "id" e, words $ attrValue "role" e, []) + return $ linkWith attr href "" ils' + "foreignphrase" -> emph <$> innerInlines + "emphasis" -> case attrValue "role" e of + "bold" -> strong <$> innerInlines + "strong" -> strong <$> innerInlines + "strikethrough" -> strikeout <$> innerInlines + _ -> emph <$> innerInlines + "footnote" -> (note . mconcat) <$> + mapM parseBlock (elContent e) + "title" -> return mempty + "affiliation" -> return mempty + -- Note: this isn't a real docbook tag; it's what we convert + -- to in handleInstructions, above. A kludge to + -- work around xml-light's inability to parse an instruction. + "br" -> return linebreak + _ -> innerInlines + where innerInlines = (trimInlines . mconcat) <$> + mapM parseInline (elContent e) + codeWithLang = do + let classes' = case attrValue "language" e of + "" -> [] + l -> [l] + return $ codeWith (attrValue "id" e,classes',[]) $ strContentRecursive e + simpleList = (mconcat . intersperse (str "," <> space)) <$> mapM getInlines + (filterChildren (named "member") e) + segmentedList = do + tit <- maybe (return mempty) getInlines $ filterChild (named "title") e + segtits <- mapM getInlines $ filterChildren (named "segtitle") e + segitems <- mapM (mapM getInlines . filterChildren (named "seg")) + $ filterChildren (named "seglistitem") e + let toSeg = mconcat . zipWith (\x y -> strong (x <> str ":") <> space <> + y <> linebreak) segtits + let segs = mconcat $ map toSeg segitems + let tit' = if tit == mempty + then mempty + else strong tit <> linebreak + return $ linebreak <> tit' <> segs + keycombo = spanWith ("",["keycombo"],[]) . + mconcat . intersperse (str "+") + menuchoice = spanWith ("",["menuchoice"],[]) . + mconcat . intersperse (text " > ") + isGuiMenu (Elem x) = named "guimenu" x || named "guisubmenu" x || + named "guimenuitem" x + isGuiMenu _ = False + + findElementById idString content + = asum [filterElement (\x -> attrValue "id" x == idString) el | Elem el <- content] + + -- Use the 'xreflabel' attribute for getting the title of a xref link; + -- if there's no such attribute, employ some heuristics based on what + -- docbook-xsl does. + xrefTitleByElem el + | not (null xrefLabel) = xrefLabel + | otherwise = case qName (elName el) of + "chapter" -> descendantContent "title" el + "section" -> descendantContent "title" el + "sect1" -> descendantContent "title" el + "sect2" -> descendantContent "title" el + "sect3" -> descendantContent "title" el + "sect4" -> descendantContent "title" el + "sect5" -> descendantContent "title" el + "cmdsynopsis" -> descendantContent "command" el + "funcsynopsis" -> descendantContent "function" el + _ -> qName (elName el) ++ "_title" + where + xrefLabel = attrValue "xreflabel" el + descendantContent name = maybe "???" strContent + . filterElementName (\n -> qName n == name) + +-- | Extract a math equation from an element +-- +-- asciidoc can generate Latex math in CDATA sections. +-- +-- Note that if some MathML can't be parsed it is silently ignored! +equation + :: Monad m + => Element + -- ^ The element from which to extract a mathematical equation + -> (String -> Inlines) + -- ^ A constructor for some Inlines, taking the TeX code as input + -> m Inlines +equation e constructor = + return $ mconcat $ map constructor $ mathMLEquations ++ latexEquations + where + mathMLEquations :: [String] + mathMLEquations = map writeTeX $ rights $ readMath + (\x -> qName (elName x) == "math" && qPrefix (elName x) == Just "mml") + (readMathML . showElement) + + latexEquations :: [String] + latexEquations = readMath (\x -> qName (elName x) == "mathphrase") + (concat . fmap showVerbatimCData . elContent) + + readMath :: (Element -> Bool) -> (Element -> b) -> [b] + readMath childPredicate fromElement = + ( map (fromElement . everywhere (mkT removePrefix)) + $ filterChildren childPredicate e + ) + +-- | Get the actual text stored in a verbatim CData block. 'showContent' +-- returns the text still surrounded by the [[CDATA]] tags. +-- +-- Returns 'showContent' if this is not a verbatim CData +showVerbatimCData :: Content -> String +showVerbatimCData (Text (CData CDataVerbatim d _)) = d +showVerbatimCData c = showContent c + +-- | Set the prefix of a name to 'Nothing' +removePrefix :: QName -> QName +removePrefix elname = elname { qPrefix = Nothing } diff --git a/documentation-tools/doc-converter/remarkup.lua b/documentation-tools/doc-converter/remarkup.lua new file mode 100644 index 0000000..c907592 --- /dev/null +++ b/documentation-tools/doc-converter/remarkup.lua @@ -0,0 +1,323 @@ +-- (WIP) Pandoc Remarkup Writer +-- +-- Invoke with: pandoc -t remarkup.lua +-- +-- Note: you need not have lua installed on your system to use this +-- custom writer. However, if you do have lua installed, you can +-- use it to test changes to the script. 'lua remarkup.lua' will +-- produce informative error messages if your code contains +-- syntax errors. + +local function hasArg (attrs, a) + for _, i in pairs(attrs) do + if i == a then + return true + end + end + return false +end + +local hex_to_char = function(x) + return string.char(tonumber(x, 16)) +end + +local url_unescape = function(url) + return url:gsub("%%(%x%x)", hex_to_char) +end + +local function escape(s, in_attribute) + return s +end + +-- Helper function to convert an attributes table into +-- a string that can be put into HTML tags. +local function attributes(attr) + local attr_table = {} + for x,y in pairs(attr) do + if y and y ~= "" then + table.insert(attr_table, ' ' .. x .. '="' .. escape(y,true) .. '"') + end + end + return table.concat(attr_table) +end + +-- Run cmd on a temporary file containing inp and return result. +local function pipe(cmd, inp) + local tmp = os.tmpname() + local tmph = io.open(tmp, "w") + tmph:write(inp) + tmph:close() + local outh = io.popen(cmd .. " " .. tmp,"r") + local result = outh:read("*all") + outh:close() + os.remove(tmp) + return result +end + +-- Table to store footnotes, so they can be included at the end. +local notes = {} + +-- Blocksep is used to separate block elements. +function Blocksep() + return "\n\n" +end + +-- This function is called once for the whole document. Parameters: +-- body is a string, metadata is a table, variables is a table. +-- This gives you a fragment. You could use the metadata table to +-- fill variables in a custom lua template. Or, pass `--template=...` +-- to pandoc, and pandoc will add do the template processing as +-- usual. +function Doc(body, metadata, variables) + return body + -- local buffer = {} + -- local function add(s) + -- table.insert(buffer, s) + -- end + -- add(body) + -- if #notes > 0 then + -- add('\n') + -- for _,note in pairs(notes) do + -- add(note) + -- end + -- end + -- return table.concat(buffer,'\n') +end + +-- The functions that follow render corresponding pandoc elements. +-- s is always a string, attr is always a table of attributes, and +-- items is always an array of strings (the items in a list). +-- Comments indicate the types of other variables. + +function Str(s) + return escape(s) +end + +function Space() + return " " +end + +function LineBreak() + return "\n" +end + +function SoftBreak() + return " " +end + +function Emph(s) + return "//" .. s .. "//" +end + +function Strong(s) + return "**" .. s .. "**" +end + +function Subscript(s) + return "_" .. s .. "" +end + +function Superscript(s) + return "^" .. s .. "" +end + +function SmallCaps(s) + return s +end + +function Strikeout(s) + return '~~' .. s .. '~~' +end + +function Link(s, src, tit) + if s == nil or s == '' then + return "[[" .. url_unescape(src) .. "]]" + else + return "[[" .. url_unescape(src) .. "|" .. s .. "]]" + end +end + +function Image(s, src, tit) + return "IMAGE: " .. Link(s, src, tit) +end + +function RawInline(lang, s, attr) + return "`" .. s .. "`" +end + +function Code(s, attr) + return "`" .. s .. "`" +end + +function InlineMath(s) + return "`" .. s .. "`" +end + +function DisplayMath(s) + return "`" .. s .. "`" +end + +function Note(s) + local num = #notes + 1 + -- insert the back reference right before the final closing tag. + s = string.gsub(s, + '(.*)' .. s .. '') + -- return the footnote reference, linked to the note. + return '' .. num .. '' +end + +function Span(s, attr) + return s +end + +function Cite(s, cs) + return s +end + +function Plain(s) + return s +end + +function Para(s) + return s +end + +-- lev is an integer, the header level. +function Header(lev, s, attr) + local marker = string.rep("=", lev) + return "" .. marker .. " " .. s .. " " .. marker +end + +function BlockQuote(s) + local new_s = string.gsub(s, "\n", "\n> ") + return "> " .. new_s +end + +function HorizontalRule() + return string.rep("-", 5) +end + +function RawBlock(lang, s, attr) + return "```lang=" .. lang .. "\n" .. s .. "\n```" +end + +function CodeBlock(s, attr) + if (hasArg(attr, "synopsis")) then + return "(SYNOPSIS)\n" .. s .. "\n" + end + + return "```\n" .. s .. "\n```" +end + +function BulletList(items) + local buffer = {} + for _, item in pairs(items) do + table.insert(buffer, "* " .. item) + end + return table.concat(buffer, "\n") +end + +function OrderedList(items) + local buffer = {} + for _, item in pairs(items) do + table.insert(buffer, "# " .. item) + end + return table.concat(buffer, "\n") +end + +-- Revisit association list STackValue instance. +function DefinitionList(items) + local buffer = {} + for _,item in pairs(items) do + for k, v in pairs(item) do + table.insert(buffer,"
" .. k .. "
\n
" .. + table.concat(v,"
\n
") .. "
") + end + end + return "
\n" .. table.concat(buffer, "\n") .. "\n
" +end + +-- Convert pandoc alignment to something HTML can use. +-- align is AlignLeft, AlignRight, AlignCenter, or AlignDefault. +function html_align(align) + if align == 'AlignLeft' then + return 'left' + elseif align == 'AlignRight' then + return 'right' + elseif align == 'AlignCenter' then + return 'center' + else + return 'left' + end +end + +-- Caption is a string, aligns is an array of strings, +-- widths is an array of floats, headers is an array of +-- strings, rows is an array of arrays of strings. +function Table(caption, aligns, widths, headers, rows) + local buffer = {} + local function add(s) + table.insert(buffer, s) + end + add("") + if caption ~= "" then + add("") + end + local header_row = {} + local empty_header = true + for i, h in pairs(headers) do + table.insert(header_row,'') + empty_header = empty_header and h == "" + end + if empty_header then + head = "" + else + add('') + for _,h in pairs(header_row) do + add(h) + end + add('') + end + for _, row in pairs(rows) do + add('') + for i,c in pairs(row) do + add('') + end + add('') + end + add('
" .. caption .. "
' .. h .. '
' .. c .. '
') + if caption ~= "" then + add("//" .. caption .. "//") + end + return table.concat(buffer,'\n') +end + +function Div(s, attr) + if(hasArg(attr, "note")) then + return "NOTE:".. s .. "\n" + end + + if(hasArg(attr, "warning")) then + return "WARNING:".. s .. "\n" + end + + if(hasArg(attr, "important")) then + return "IMPORTANT:".. s .. "\n" + end + + return "\n" .. s .. "" +end + +-- The following code will produce runtime warnings when you haven't defined +-- all of the functions you need for the custom writer, so it's useful +-- to include when you're working on a writer. +local meta = {} +meta.__index = + function(_, key) + io.stderr:write(string.format("WARNING: Undefined function '%s'\n",key)) + return function() return "" end + end +setmetatable(_G, meta) diff --git a/documentation/Concepts/containers.xml b/documentation/Concepts/containers.xml index c6e8384..eea9377 100644 --- a/documentation/Concepts/containers.xml +++ b/documentation/Concepts/containers.xml @@ -1,336 +1,339 @@ <?xxe-sn 2ahi4rjnvuo 2i?>Containers
<?xxe-sn 2alsjvonojk 1h?>Syntax Container operations: access(serial) Denotes operation's type as a sequential access. More... access(rand) Denotes operation's type as a random access. More... Container implementations: container(onthefly) Lazy data structure that generates elements on the fly when needed. More... container(solid) Array or data structure that occupies contiguous memory region More...
<?xxe-sn 2ahi4rjnvuo 2k?>Introduction Containers is a general term referring to the data structures that contain a group of elements of certain type. Considering that virtually every program use containers to store, retrieve, search or otherwise process aggregate data, obviously efficiency of containers implementation is a priority for Xreate design. There are many different data structures invented to serve as a containers, each of them having different characteristics and peculiarities with no obvious winner but rather each one suits the best in appropriate situations. Usually it is software developer's knowledge and responsibility to be able to select the most appropriate container's implementation for a particular use case. That said, Xreate goes further and gathers information on how containers are used, by analysing the program sources. On this ground it's possible to choose semi-automatically the most appropriate data structure for container implementation to efficiently fulfil particular needs in a particular situation. In order to do this, the following approach is used. The way a container is defined is associated with one or more possible implementations it supports. On the other side, operations over container demand certain "good" implementations to efficiently process container data. Viewing it as supply and demand setting, with a list of several acceptable implementations from either side, the most appropriate tradeoff is chosen as implementation for a given container to fulfill both sides with regard to defaults, preferences, constraints and other ways to guide inference process. In short example below name = "tests/containers.cpp: Containers.Doc_Intr_1" //container definition: a= {1, 2, 3, 4, 5}:: [num]; container(solid). //container's operation x= a[0]:: num; access(rand). container's offer container(solid) and operation's demand access(rand) are explicitly expressed by annotations for clarity purposes. Annotation container(solid) depicts that container a supports implementation solid, that is plain contiguous memory region or array. On the other side, annotation access(rand) expresses nature of retrieving by index operation (variable x) and it requires selected data structure to support random access to be efficiently executed. Obviously, implementation solid allows efficient random access and so it's assumed as a container a's implementation by inference process. Semi-automatic, guided container's implementation selection has some advantages, such as: Less manual work. Inferring adequate implementations with little to no additional input from developer saves time to concentrate on more important aspects. Other approach to achieve the same, namely to assign default one-size-fits-all implementation with an average performance, is simpler but can not compete with more careful strategy. Rapid development and optimization antagonism. It is important to keep in mind, that rapid development and frequent code changes somewhat contradict optimization efforts. Each round of optimization is leveraged by relying on concrete and particular program properties, overt or hidden connections and other observations. Once program undergo further development most of the previously sound optimization techniques became obsolete, irrelevant or plainly wrong. Selecting(as often as needed) the most efficient data structures keeps reasonable efficiency level automatically and does not impede possibly fast development pace. Regression resistance. Xreate encourages frequent changes, adjusting and recombination in software components, libraries and modules by automatically reevaluating and reassigning most appropriate data structures in the new conditions or signalling error if it's impossible. This somewhat alleviates problem of fragile software and gives more confidence for refactoring.
<?xxe-sn 2ahi4rjnvuo 33?>Container Implementations Xreate supports container implementations presented below:
<?xxe-sn 2ahi4rjnvuo 5h?>Implementation 'container(onthefly)' Source: range list operator [from .. to]. Supported operations: access(serial). This is elementary implementation that represents lazy data structure — sequence of elements is generated by recurrence equation applied to a current element to compute next element of the sequence. It does not keep actual data in the memory, but instead computes necessary elements when accessed. This kind of implementation is rather memory - efficient since occupied memory does not depend on count of - elements. + efficient since occupied memory does not depend on container's + size. For example, range list [1..10] supports onthefly implementation by using internally recurrent function x[i+1] = x[i] + 1, 1<= x <= 10, that generates successive element x[i+1] given x[i]. Recurrent elements generation is suited for sequential access and can't serve random access operations.
<?xxe-sn 2ahi4rjnvuo 5r?>Implementation 'container(solid)' Source: list operator. Supported operations: access(serial), access(rand). This is implementation from the opposite side of the memory/computation space. It stores all the container's data in memory occupying contiguous region, known as array. As opposed to the implementation onthefly, it's computationally efficient for there is no need for any additional computations apart from simple offset calculation to get an element requested by an index. Due to the fact that all elements are present in the memory, the implementation supports sequential access as well as random access operations.
<?xxe-sn 2ahi4rjnvuo 3g?>Container Operations In order to describe requirements for a container all the operations are broken down into several categories as presented below.
<?xxe-sn 2alsjvonojk 2r?>Operation 'access(serial)' Operators: loop map, loop fold. Annotation denotes sequential access operation, such as map loop or map fold. Example: name = "tests/containers.cpp: Containers.Doc_OpAccessSeq_1" import raw("scripts/containers/containers.lp"). test = function :: int; entry { range = [1..5] :: [int]; container(onthefly). sum = loop fold(range->el:: int, 0->acc):: [int]; access(serial) { acc + el }. sum }
<?xxe-sn 2alsjvonojk 2u?>Operation 'access(rand)' Operators: index Annotation denotes random access operation. Example: name = "tests/containers.cpp: Containers.Doc_OpAccessRand_1" import raw("scripts/containers/containers.lp"). test = function:: num; entry { a = {1, 2, 3, 4, 5}:: [num]; container(solid). a[1]:: num; access(rand) }
<?xxe-sn 2apiasqubk0 6?>AST Attachments In order to bypass tedious writing of necessary annotations for each line of code that works with containers there are appropriate annotations already defined for common operations. All it takes for a client's code is to include transcend script scripts/dfa/ast-attachments.lp that allows to assign predefined annotations for syntactic constructs and operators. Example below includes ast-attachments.lp that feeds compiler with default annotations sparing developer of - specifying them manually. + specifying them manually. name = "tests/containers.cpp: Containers.Doc_ASTAttach_1" import raw("scripts/containers/containers.lp"). import raw("scripts/dfa/ast-attachments.lp"). test = function :: int; entry { range = [1..5] :: [int]. sum = loop fold(range->el:: int, 0->acc):: [int] { acc + el }. sum }
+ \ No newline at end of file diff --git a/documentation/communication.xml b/documentation/communication.xml index f4293f6..1547364 100644 --- a/documentation/communication.xml +++ b/documentation/communication.xml @@ -1,440 +1,443 @@ <?xxe-sn 29tvny21340 2?>Communication The chapter discusses safe usage of non-local variables, that is variables accessible by different components or threads with global variables as a particular case.
<?xxe-sn 29xq7jt0wzk 2?>Syntax Annotations: SYNTAX: **commop(send)** (1) **commop(receive)** (2) annotation (1) marks SEND communication event. annotation (2) marks RECEIVE communication event. Specializations: SYNTAX: **commDirect** **commGuarded** Communication reasoning able to assign following specializations: commDirect — specialization is expected to provide direct access to raw variable's content. commGaurded — specialization is expected to do internal consistency checks at run time.
<?xxe-sn 29tvny21340 4?>Background One of the major concepts that support writing of safe programs is a notion of immutability. Immutability tremendously simplifies many kinds of analyses; using immutable structures is a practical way to write multithreaded applications and has many other benefits beyond that. However in its most basic form it comes with a price of disastrous, in many cases, memory overhead, since property of immutability stipulates for each change of variable to make an independent copy of it occupying different memory region. Unwise using of immutable structures lead to the situation such that CPU is mostly occupied with unnecessary variables copying to and fro as well as with extensive garbage collection, irrelevant of actual algorithm's complexity at hand. Thus it is one of the central highlights of proper programming language design to provide techniques to overcome the shortcomings by relaxing immutability requirements keeping nevertheless safety benefits. There are many ways to approach the problem, and one such technique, namely communication model is discussed next.
<?xxe-sn 29xq7jt0wzk 6?>Communication Model Communication model is a way to capture and express what's going on with variables in a program as well as to define rules that describe valid operations over variables. Within the framework writing value to a variable is viewed as sending, and conversely reading variable's value is viewed as receiving. Variables that are accessed from different components or threads are referred to as non-local variables. This chapter is focused on a on-local variables, global variables particularly, since exactly for them it's hard to manually check exhaustively where and how they are used in order to catch any errors. It is natural to view them as the means of interaction between different parts of a program, in other words, interaction between sender and receiver, where sender and receiver are different components. The same terms comprise rules that express valid ways of interacting. The abstraction is named communication model due to similarity with the network communication. Reasoning based on working with a communication path, i.e chain of communication events(e.g. sending/receiving) occurred during program execution. Let's consider small example: a = init():: int; commop(send). //(1) b = a + 1 :: int; commop(receive). //(2) It shows computing of variable b. Variable b depends on a so a is calculated first. Variables a, b are annotated with comm(send) and comm(receive), denoting sending and receiving events, respectively. Communication path in this case is an ordered list {<begin>, SEND, RECEIVE, <end>} where <begin>, <end> — are special events that denote first and last events in the path, respectively. The gist of using communication model is to ensure that every sent value is properly received. It relies on the compiler to gather all possible communication paths in the program as an input for processing. There are two supported modes of reasoning: Validation. In this mode all communication paths are checked against communication rules to confirm that the program is valid. Otherwise compilation error is raised. Planning. In this mode reasoning assigns proper implementation for variables in efforts to ensure validity.
<?xxe-sn 29xq7jt0wzk k?>Validation To perform validation, every communication path is checked against number of communication rules that express which communication path are valid. Default behaviour expressed by "every sent value being properly received" produce next possible cases: Valid. Path that consists of pairs of events {SEND, RECEIVE} are valid meaning that each sent value is properly received. Undefined and expired value. Paths that have parts {<begin>, RECEIVE} or {RECEIVE, RECEIVE} are invalid meaning possibly undefined value is received in the first case or duplication i.e. expired value is used in the second's one. Lost value. Paths that have parts {SEND, SEND} or {SEND, <end>} indicate possibly lost change since consequent sender replaces value in the former case and sent value is not used at all in the latter case. Traditional immutability validation is based on the idea that once valid value is valid as long it is unmodified. In this regards communication model can be viewed as an extension and more expressive tool since it also captures value expiration after it was used as well as value loss, if it was not used at all.
<?xxe-sn 29xq7jt0wzk 2a?>Planning Reasoning in the communication model aside of performing validation, also assigns appropriate specialization for sending and receiving operations, as appropriate. At the moment there are two specializations the operations are expected to support: Direct. Direct specialization commDirect is expected to provide direct access to variable's value. This specialization is assigned in case of fully statically validated communication path. Guarded. In case if there are possible communication path inconsistencies that can not be completely ruled out at compile time, checking logic should be embedded into compiled code. Specialization commGaurded is expected to hold variable state and check usage consistency.
<?xxe-sn 2a3uy8rr2f4 9?>Planning Horizon Reasoning implements algorithm that is bounded by the maximal path length it can process. The parameter is called planning horizon. Any variable that it can not check due to exceedingly large path's length is assigned default implementation commGaurded that performs necessary checks during runtime. Thus the parameter regulates trade off between static analysis extensiveness and runtime checks overhead.
<?xxe-sn 2a7t1hxqqyo 2?>Example: Direct Implementation name="tests/effects-communication.cpp: Doc_DirImpl", lines=15 import raw("scripts/dfa/propagation.lp"). import raw("scripts/dfa/polymorphism.lp"). import raw("scripts/effects-communication/communication.lp"). import raw("scripts/effects-communication/config.lp"). CommDirect = type { value:: int }. guard:: commDirect { init = function::CommDirect { {value = 0} } read = function(vault1:: CommDirect):: int { (vault1:: *;commop(receive))["value"] } write = function(vault2:: CommDirect, valueNew:: int)::CommDirect { (vault2:: *; dfa_pseudo(vault2)) + {value = valueNew}:: int; commop(send); dfa_uppy(vault2) } } main = function::int; entry { x1 = init()::*; dfa_polym(ret). x2 = write(x1, 1)::*; dfa_polym(arg). val = read(x2)::int; dfa_polym(arg). val } In this example, basic workflow is presented in main — the function write(x1, 1) is invoked following by invocation of read(x2). Functions write() and read() are annotated with commop(send) and commop(receive) respectively in order to enable communication reasoning. Analyzer gathers and validates observed communication path and since there is no ambiguity, it's possible to assign specialization CommDirect allowing direct access to the variables avoiding any additional overhead. Note, there are no any other specializations defined and if reasoning was not enable to conclude that it is the case the compilation error would be raised.
<?xxe-sn 2a7t1hxqqyo a?>Example: Guarded Implementation name="tests/effects-communication.cpp: Doc_GuardedImpl", lines=15 import raw ("scripts/effects-communication/communication.lp"). import raw ("scripts/dfa/propagation.lp"). import raw ("scripts/dfa/polymorphism.lp"). import raw ("scripts/effects-communication/config.lp"). CommState = type variant{Invalid, Valid, Outdated}. CommDirect = type { value:: int }. CommGuarded = type { value:: int, state:: CommState }. guard:: commDirect { init=function::CommDirect{ {value = 0} } read= function(vault1:: CommDirect):: int{ (vault1::CommDirect; commop(receive))["value"] } write= function(vault2:: CommDirect, valueNew1:: int)::CommDirect{ (vault2::CommDirect;dfa_pseudo(vault2)) + {value = valueNew1}:: int; commop(send); dfa_uppy(vault2) } } errorRead = function:: int { -1 } errorWrite = function:: CommGuarded{ { value = -1, state = Invalid() } } guard:: commGuarded{ init=function::CommGuarded{ { value = 0, state = Invalid() } } read=function(vault3:: CommGuarded):: int { switch variant (vault3["state"]->whatever::CommState;commop(receive)):: int case (Invalid) { errorRead() } case (Outdated) { errorRead() } case (Valid) { vault3["value"] } } write=function(vault4:: CommGuarded, valueNew2:: int)::CommGuarded{ switch variant (vault4["state"]->whatever::CommState;commop(send); dfa_pseudo(vault4))::int case (Invalid) { {value = valueNew2, state = Valid()}:: CommGuarded; dfa_uppy(vault4) } case (Outdated) { {value = valueNew2, state = Valid()}:: CommGuarded; dfa_uppy(vault4) } case (Valid) { errorWrite():: CommGuarded; dfa_uppy(vault4) } } } main=function(cmd:: num)::int; entry { x1 = init():: *; dfa_polym(ret). x2 = write(x1, 1)::*; dfa_polym(arg). x3 = if (cmd > 0)::int { y = read(x2):: int; dfa_polym(arg). y } else { z = write(x2, 2)::*; dfa_polym(arg). a = read(z):: int; dfa_polym(arg). a }. x3 } Here example of slightly more complicated workflow. Function main contains branching that depends on argument known at run time only. Analyzer is presented with two possible communication paths and one of them(false branch) leads to a possibly lost value for it contains two consequent SEND events. In this situation the analyzer unable to statically validate correctness and assigns specialization commGuarded to embed checking logic into compiled code as an intermediary layer between variable's content and client's code. Implementation commGuarded along with a variable access also tracks the variable status and returns error if the value is inconsistent.
+ \ No newline at end of file diff --git a/documentation/exploitation.xml b/documentation/exploitation.xml index 984d077..39bd002 100644 --- a/documentation/exploitation.xml +++ b/documentation/exploitation.xml @@ -1,488 +1,490 @@ <?xxe-sn 26yv439af40 2?>Exploitation This chapter discusses exploiting external resources, such as files, as a particular instance of a side effects problem that inevitably stems from an interaction with the outside world. Unlike virtualization, an another documentation's topic that tackles I/O, exploitation approaches subject from a different angle — it is concerned with an order of operations, sequence in which different clients jointly use the same resource and it deals with corresponding difficulties, e.g. ensures proper resource initialization before actual usage.
<?xxe-sn 29je46abuev -wunr7fl0rw8u?>Syntax
<?xxe-sn 29je46abuev -wunr7fl0rw8r?>Annotations SYNTAX: **use**(//resource-id//) **init**(//resource-id//) resource-id — user-defined resource identifier Annotates function or code block as such that exploits resource resource-id.
<?xxe-sn 29je46abuev -wunr7fl0rw8i?>Guards SYNTAX: **exploitation(init)** **exploitation(none)** Specializations that are recognized by exploitation reasoning. Each specialization corresponds to an initialization strategy: exploitation(init) is expected to perform actual resource initialization. exploitation(none) is expected to do nothing as initialization isn't necessary or done elsewhere.
<?xxe-sn 26yv439af40 4?>Background In software engineering, the idea to avoid side effects have received considerable traction. Indeed, side effects is something that is hard to take into account and thus programs that have side effects are inherently unsafe, thus best coding practices are rightfully suggest to isolate side effects producing code as much as possible. It's so called pure functional languages whose philosophy goes even further and frames side effects as something opposite of "pure", and everything is built around effectless computations to the point that some languages' design itself includes side effects producing constructs, such as I/O, as an afterthought, as something almost unnecessary. However, in reality the opposite is true, most applications' sole responsibility is to communicate with "outside world", reacting to the external events and change "world state" accordingly. As a consequence, side effects usually are the only important effects the program produce and surely deserve first class support from a programming language and justify efforts to develop approach to alleviate related safety and performance concerns.
<?xxe-sn 26yv439af40 10?>Exploitation Plan One complexity of taking side effects into account is the fact that final result depends on an exact operations order. This harshly impacts both performance and safety, for many techniques, e.g. caching, parallelization can neither be automatically performed nor validated since they are based on various degrees of reordering or deal with possibly undetermined beforehand order of execution. In this chapter, it is assumed, that final effects of execution fully defined by exploitation path — for a particular code path that can occur during execution, it is its part consisting of only relevant code blocks., i.e. those that deal with an exploited resource. Other code blocks do not influence exploitation effects and so are excluded from consideration. Thus reasoning about effects is reduced to considering all possible exploitation paths, checking do they meet certain requirements that define valid exploitation and making corrections if needed and possible. Result of the reasoning is called exploitation plan — specification that defines exact order and strategy of using a given resource in order to comply with imposed requirements. With all above said, the discussed approach can be presented as follows: Annotations are used to express some aspects of side effects to enable further reasoning. They indicate code blocks that deal with resource as well as provide additional information about how exactly it is exploited, e.g. use, initialize or deinitialize resource. Existing code paths, extracted during source code processing, coupled with relevant annotations is enough to construct all possible exploitation paths and analyze them. Analysis determines possible(weak) paths that can occur or not during particular execution as well as certain paths(strong) that occur always no matter what. Also it checks are exploitation paths valid against certain rules, e.g. initialization occurs always before actual usage and is it possible to correct invalid paths. Reasoning's result is an exploitation plan that dictates order and strategy of exploitation is presented in form of appropriate specialization for polymorphic functions that deal with resources in order to ensure safe exploitation to the extent based on provided annotations. Exploitation related side effects are viewed as a set of additional restrictions over operations order. Only subset of possible reorders is still valid w.r.t. side effects. Transcend's task is to find out refined set of valid orders. Thus techniques that rely on reordering enjoy additional information to make safe optimizations. ... and it serves three major goals: Safety. Validates existing exploitation plan or is it possible to safely exploit given resource at all. Compiler signals error if a given exploitation plan is invalid, i.e. does not satisfy requirements w.r.t. side effects as expressed by annotations. Regression Resilience. When it comes to using external resources, some spurious dependencies usually occur between otherwise isolated, independent components of a program. Sometimes refactoring and other code changes break those dependencies inevitably introducing regressions. Exploitation catches this sort of regressions and automatically regenerates exploitation plan suited for a changed conditions. Performance. Generated exploitation plans are optimal in a sense that they cut off superfluous operations, for example, removing resource initialization in several places if it can be done safely in a single one, thus reducing overall overhead.
<?xxe-sn 27ay8x1a5mo 2?>Domination Analysis When it comes to a reasoning about order of execution flow and possible code paths, crucial vehicle for that is domination analysis producing dominator tree as an output. Unlike the usual function-bounded domination analysis, when separate domination tree is produced for each function defined in a program, Exploitation requires program bound analysis, that is to take into account control flow across all functions in a program. It is computationally intensive task to perform analysis over a whole program, however it is compensated by the fact that Exploitation only takes into account code blocks that deal with, or in other words, exploit external resources. Thus there is no necessity to build full dominator tree, only the relevant parts are constructed, just enough to make sound exploitation plan decisions.
<?xxe-sn 28h47d43thc j?>Empty Exploitation Plan. Effect Free Computations Validation of exploitation path is done against some predefined constraints. Depending on complexity of a constraints, i.e. number of different exploitation events that are seeking for in each path, reasoning goals categorized into several groups: Zero Order Exploitation. Meaning that all paths are checked in terms is there exploitation at all or no, is there at least a single exploitation event along the path. First Order Exploitation. Deals with a situations when it's enough to check only two different exploitation event occur in a required order. It can be useful for example, to check whether all resource uses occur after it is initialized. Higher Order Exploitation. Expresses constraints involving several(more than two) exploitation events and relations between them. Empty Exploitation is an important instance of zero order constraint. It useful mechanism for developer to annotate function or part of a program as effect free in terms of exploitation. Thus, efectless, clean or - pure code can be clearly separated from effectfull part and compiler - raises compilation error in case of accidental mixing or using "wrong" - type of code in non appropriate environment. + pure code can be clearly separated from effectful part and compiler raises + compilation error in case of accidental mixing or using "wrong" type of + code in non appropriate environment.
<?xxe-sn 26yv439af40 v?>Resource Initialization One important problem related to an exploitation order is to ensure that a given resource is properly initialized before its first usage and additionally it is not initialized more then once during exploitation session. This is instance of first order exploitation since in a validation mode it is enough to check exploitation plan to ensure that every resource usage preceded by resource initialization at some point in the past, i.e. previously in the exploitation path. For planning mode, the problem is addressed as follows: Central idea of the algorithm is to consider candidates for initialization only among code blocks that dominate given usage site. Obviously, initialization in dominating block precedes usage for any possible code path. One or more dominator blocks are chosen for actual initialization in such way that they are cover all found usage sites. For code blocks chosen for initialization specialization exploitation(init) is set, for the rest specialization exploitation(none) is used. Look at the example below: name="tests/exploitation.cpp: Doc_ResourceInit_1", lines=15 import raw("scripts/cfa/payload.lp"). import raw("scripts/exploitation/exploitation.lp"). //exploitation reasoning import raw("scripts/exploitation/test1.assembly.lp"). guard:: exploitation(init) { openFile = function(filePrev:: FILE_P):: FILE_P; init(file) { fopen("/tmp/test", "w")::FILE_P } } guard:: exploitation(none) { openFile = function(filePrev:: FILE_P):: FILE_P { filePrev::int } } test = function:: int; entry { seq { f0 = undef:: FILE_P. f0 } { //Scope #1: f1 = openFile(f0):: FILE_P. f1 } { //Scope #2: f2 = openFile(f1):: FILE_P. f2 } { //Scope #3: sizeWritten = fwrite("Attempt to write..", 12, 1, f2):: int; use(file). sizeWritten } { //Scope #4: fclose(f2):: int; use(file) } { sizeWritten :: int} } There is the function test that executes sequentially next commands: open a file(scopes #1, #2), write some text(scope #3) and finally, close the file(scope #4). It represents simple work flow with an external resource. In order to connect the code to the exploitation the functions fwrite and fclose in scopes #3 and #4 respectively are annotated with annotation use(file). This information is used by reasoning to look whether it is possible to initialize given resource before actual usage as well as where and when exactly to initialize it. Function openFile is annotated as init(file) meaning it can initialize depending on chosen strategy. The function is invoked both in scope #1 and scope #2. Both scopes are executed strictly before scopes #3, #4. Thus it is indeed possible to initialize resource before usage. Next task for exploitation is to choose correct exploitation plan, i.e. to assign strategies for all possible initialization places in the effort to initialize resource only once. Here, it means that only one invocation of openFile is assigned with exploitation(init) to actually initialize the file. Other one is automatically marked with exploitation(none) to invoke different specialization of openFile - that does nothing since the files is already initialized. + that does nothing since the file is already initialized.
\ No newline at end of file diff --git a/documentation/virtualization.xml b/documentation/virtualization.xml index 8876674..fea27c3 100644 --- a/documentation/virtualization.xml +++ b/documentation/virtualization.xml @@ -1,497 +1,541 @@ <?xxe-sn 26yv439af40 1l?>Virtualization - The chapter expands on a usage of context + The chapter expands on the usage of context based polymorphism, in other words, reasoning over CFG, as a ground to implement application level virtualization. - One way to approach virtualization problem is - to model it in terms of satisfying constraints imposed by environment over - agents that operate within it, as presented below: + Желательно расшифровать + аббревиатуру CFG, до этого она нигде не встречалась, и мне не ясно что она + означает + + One way to approach the virtualization + problem is to model it in terms of satisfying constraints imposed by + environment over agents that operate within it, as presented below: sizo - (distorted abbreviation of SEcurity ZOne) — logical entity introduced to - represent environment and describe desired virtualization + (distorted abbreviation for SEcurity ZOne): logical entity introduced to + represent an environment and describe the desired virtualization outcome. zek - (distorted abbreviation of SEcurity aGent) — represents behaviour of the + (distorted abbreviation for SEcurity aGent): represents behaviour of the code in terms of virtualized resources access. + + По поводу длинных тире: + насколько я знаю, они практически не используются в английском языке + (как и наши «типографские» кавычки). Только короткие. Но эту информацию + нужно проверить (PS: сегодня читал книгу на английском, и таки увидел + там длинные тире). Basic idea is to automatically reason over information defined by sizos and zeks and produce virtualization plan as a solution that dictates which parts of code should be virtualized and how exactly. Such reasoning enables two features: Optimization. Allows choosing - virtualization technique with a least performance penalty nevertheless - satisfying necessary requirements. + virtualization technique with the smallest performance penalty + nevertheless satisfying necessary requirements. Safety. Validates manually chosen - virtualization plan to be safe and sound by checking that it indeed - satisfies requirements. + virtualization plan to ensure it is solid and operable by checking that + it indeed satisfies requirements. + + Я бы вообще сократил: "to + ensure it is solid, operable and satisfying the necessary + requirements" In other words, context based reasoning - provides improvements by virtualizing only necessary sections of code, only - for necessary type of resources and by employing as lightweight as possible - virtualization strategy just enough to comply with safety and security - requirements expressed by annotations in the code. + provides improvements by virtualizing only necessary sections of code and + only for necessary type of resources by employing as lightweight + virtualization strategy as possible – just enough to comply with the safety + and security requirements expressed by annotations in the code.
<?xxe-sn 26yv439af40 25?>Background Virtualization - refers to an abstracting code from underlying resources - used by it. Here term resource depicts any external entity such as - devices, files, network connections, etc, for which it is desirable to + refers to abstracting code from the underlying resources + used by it. Here the term 'resource' depicts any external entity such as a + device, file, network connection etc, for which it is desirable to regulate access. Virtualization is a vast area and broad term that includes number of techniques on different levels to achieve several important goals such as: - Shared access — to allow several - clients use the same resource while behaving as if each client is the - sole resource user simplifying development and testing. + Shared access — allowing several + clients to use the same resource while behaving as if each client were + sole resource user, to simplify development and testing. Isolation — cornerstone of safety and behaviour repeatability achieved by minimizing influence of isolated clients between each other and external environment. - Adaptation — to allow client - application work within an unexpected environment it was not developed + Adaptation — allowing client + application work in an unexpected environment it was not developed for, by emulating "native" familiar environment thus reducing adaptation and support costs. - Due to importance of goals achievable with - virtualization, it is unavoidable in a long run. That being said, basic - virtualization techniques have performance penalties arising from indirect - and regulated access to underlying resources. + Due to the importance of goals achievable + with virtualization, it is unavoidable in a long run. That being said, + basic virtualization techniques have performance penalties arising from + indirect and regulated access to underlying resources. Further discussion is concerned with what can be done to alleviate major virtualization inefficiencies by - fine-grained control over what, when, and how should be - virtualized. + fine-grained control over what should be virtualized, when, and + how.
<?xxe-sn 26yv439af40 2k?>Access Control - Whole program can be broken down into one + A whole program can be broken down into one or more virtualization zones, each having different appropriate type of virtualization strategy. Such approach allows to model hybrid - virtualization, i.e. different parts of program are virtualized - differently depending on some conditions. To capture this concept, term - sizo is introduced and - refers to a logical entity that holds information about particular zone - necessary to find best suited virtualization strategy. + virtualization, i.e. different parts of a program are + virtualized differently depending on certain conditions. To capture this + concept, the term sizo is + introduced, that refers to a logical entity that holds information about a + particular zone necessary to find best suited virtualization + strategy. Sizo is associated with context, i.e. each zone spans over one or more code blocks. - There is a annotation assign_sizo + There is an annotation assign_sizo to specify sizo a code block is assigned to: SYNTAX: **assign_sizo**(//sizo-ref//). sizo-ref unique sizo's identifier Next thing is to specify which resources a - particular sizo controls access to, as below: + particular sizo controls access to, as demonstrated below: SYNTAX: **assign_sizo_control**(//resource-type//). - It indicates that current sizo(sizo that - spans over code block the annotation located within) regulates all access - to a resources of a given type resource-type. - Conversely, If for a particular environment there is no need to control, - for example, file system access, no virtualization for file operations is + It indicates that the current sizo (sizo + that spans over the code block wherein the annotation is located) + regulates all access to resources of a given type, resource-type. + Conversely, if for a particular environment there is no need to control + e.g. file system access, no virtualization for file operations is applied. On the other hand, there is an annotation - to mark function that accesses one or another resource: + to mark a function that accesses one or another resource: SYNTAX: **assign_zek_access**(//resource-type//). Let's consider an example to demonstrate - all above: + all the above: name="tests/virtualization.cpp: Virtualization.Doc_AccControl_1", lines=15 import raw ("scripts/cfa/context.lp"). //enable context reasoning import raw ("scripts/virtualization/virtualization.lp"). //enable virtualization reasoning guard:: strategy(direct) { openFile = function(filename::string):: int { printf("direct file access") } } guard:: strategy(common) { openFile = function(filename::string):: int; assign_zek_access(files) { printf("virtualized file access") } } main = function:: int; entry { context:: assign_sizo(zoneA); assign_sizo_control(files). openFile("/some/file") } - Example outlines dummy function + The example outlines a dummy function openFile to model file system - access. The function has two specializations with guards strategy(direct) - to model direct access, and strategy(common) - to be invoked if virtualization is enabled. It also annotated with + access. The function includes two specializations with the strategy(direct) + guard to model direct access, and the strategy(common)guard + to be invoked if virtualization is enabled. It is also annotated with assign_zek_access(files) to indicate - that it accesses file system. On the other hand, context of function - main defines sizo zoneA - and enables control over file operations. - - Reasoning works with provided information - and decides whether it is necessary to enable virtualization. In this - case, the answer is yes for zoneA, - because of the fact that sizo controls file operations and there is - actually function within the sizo that requires files access. - Consequently, example outputs:virtualized file accessconfirming + that it accesses the file system. On the other hand, the context of the + function main defines the + zoneA sizo and enables control over + file operations. + + Reasoning works with the provided + information and decides whether it is necessary to enable virtualization. + In this case, the answer is yes for zoneA, + because of the fact that the sizo controls file operations and that there + is actually a function within the sizo that requires file access. + Consequently, the example outputs the following:virtualized file accessconfirming that specifically virtualized specialization of openFile was invoked.
<?xxe-sn 26yv439af40 3i?>Isolation As shown in the previous section, it is - possible to enable(or disable) virtualization on per resource basis. - However such functionality is limited in a sense that if several sizos + possible to enable (or disable) virtualization on a per resource basis. + However, such functionality is limited in a sense that if several sizos allow access to the same resource they can interfere with each other. Thus, next step to consider is isolation, - i.e. zeks in different sizos should not have ability to access the same - resource, but rather work with their own set of resources associated with - particular sizo. As previously, following examples are focused on file - operations as most ubiquitous type of resources. + i.e. zeks in different sizos should not have the ability to access the + same resource, but rather work with their own set of resources associated + with a particular sizo. The following examples, just as the previous one, + are focused on file operations as the most ubiquitous type of + resources. One way to isolate file access is to - associate unique file prefix with each sizo. If virtualization enabled, - all filenames in the sizo are silently transformed on the fly by adding - assigned prefix. This way, all the file operations from one sizo are - confined within specific directory allocated solely for that particular - sizo or simply have unique prefix if the same directory contains files - belonging to a different sizos. + associate a unique file prefix with each sizo. If virtualization is + enabled, all filenames in the sizo are silently transformed on the fly by + adding an assigned prefix. This way, all the file operations from one sizo + are confined within a specific directory allocated solely for that + particular sizo, or simply have a unique prefix if the same directory + contains files belonging to a different sizo(s). name="tests/virtualization.cpp: Doc_Isolation_1", lines=15 main = function:: int; entry { seq { context:: assign_sizo(domainA); assign_sizo_control(files). openFile("test") } { context:: assign_sizo(domainA). openFile("test") } { context:: assign_sizo(domainB); assign_sizo_control(files). openFile("test") } } - In this example file test + In this example, the file test is accessed from different sizos domainA - and domanB. As there are several - "competing" sizos are declared, they are isolated and openFile + and domanB. As several "competing" + sizos are declared, they are isolated, and openFile resolves test to a different - filename depending on from which sizo it is called. One possible way to - implement discussed strategy shown below: + filename depending on which sizo it was called from. One possible way to + implement the discussed strategy is shown below: name="tests/virtualization.cpp: Doc_Isolation_1", lines=15 import raw ("scripts/cfa/context.lp"). //enable context reasoning import raw ("scripts/virtualization/virtualization.lp"). //enable virtualization reasoning import raw ("scripts/virtualization/test-Isolation_1.assembly.lp"). //additional configuration DictSizo = type slave dict_sizo. Sizo = type slave virt_sizo. guard:: strategy(direct) { resolveFilename = function(filename:: string):: string; assign_zek_access(files) { filename } } guard:: strategy(prefix) { resolveFilename = function(filename:: string):: string; assign_zek_access(files) { dictSizo = intrinsic query("dict_sizo")::[DictSizo]. sizoId = intrinsic query late("sizo_current"->sizoCurrent:: Sizo):: int; demand(sizo) { loop fold(dictSizo->entry::DictSizo, 0->id):: int { if(entry[0] == sizoCurrent):: int { entry[1] } else { id } } }. buf = "00"::string. seq { sprintf(buf, "%d/%s", sizoId, filename) } { buf } } } openFile = function(filename:: string):: int { filenameReal = resolveFilename(filename):: string. printf("File opened: '%s'%c", filenameReal, 10) } Example outputs: File opened: '0/test' File opened: '0/test' File opened: '1/test' - In this example function openFile - calls resolveFilename to find out - real filename. It can be said, that resolveFilername + In this example, the function + openFile calls resolveFilename + to find out the real filename. It can be said that resolveFilename serves as hypervisor - dereferencing file pseudonym into real filename. In order to do that, - resolveFilername consists of two + dereferencing file pseudonym into a real filename. For this purpose, + resolveFilename consists of two specializations: specialization strategy(direct) - serves non virtualized environmentб leaving filename without any - processing, and the other specialization strategy(prefix) - implements resolving strategy by adding sizo-associated prefix to each - file. More specifically, unique index is assigned to each sizo and - resolveFilename uses the index as + serves the non-virtualized environment leaving the filename without any + processing, while the other specialization strategy(prefix) + implements the resolving strategy by adding a sizo-associated prefix to + each file. More specifically, a unique index is assigned to each sizo, and + resolveFilename uses the index as a file name prefix. Resolution function resolveFilename has only one parameter filename, - deriving required prefix from late - context associated with particular sizo. + deriving the required prefix from late + context associated with a particular sizo. Client code has no way to influence - resolving process and force to use unapproved prefix, thus accessing and - interfering with files that belong to other sizos. + resolving process and force it to use an unapproved prefix, thus + accessing and interfering with files that belong to other sizos.
<?xxe-sn 26yv439af40 4c?>Isolation Categories - Every optimization technique applicable - only if some specific preconditions are met. Indeed, only general approach - can handle general task. However, for practical instances there are always - some improvements possible by tailoring to the particular use case - specifics and subtle details. In other words, the more information - available the more space for improvements is there. And first step on this - road is the very ability to express and reason about such additional + Every optimization technique is applicable + only if certain preconditions are met. Indeed, only a general approach can + handle a general task. However, for practical instances some improvements + are always possible by tailoring to a particular use case specifics and + subtle details. In other words, the more information available, the more + space there is for improvements. And the first step along this road is the + very ability to express and reason about such additional information. As a demonstration, in order to improve - reasoning to find out optimal virtualization strategy for particular use - case, different sizo + reasoning to find out the optimal virtualization strategy for a particular + use case, different sizo categories can be introduced, as below: Inward Isolation. The category describes sizo that prohibits - access from other sizos to its internal resources, but able to access + access of other sizos to its internal resources, but is able to access external resources freely. For example, monitoring and supervision - software may have been assigned this type of isolation — freely - accesses subordinate zones but can't be influenced from the + software may have been assigned this type of isolation, where it + freely accesses subordinate zones but cannot be influenced from the outside. Outward isolation. The exact opposite of inward isolation. Allows - access from external sizos but is only allowed to use its own internal - resources, so no influence to the outside world is possible. - Appropriate for various sandboxes and testing environments to run - possibly insecure code. + access from external sizos, but is only allowed to use its own + internal resources, so no influence over the outside world is + possible. Appropriate for various sandboxes and testing environments + to run a potentially insecure code. - For file operations inward isolation may be - implemented as virtualization strategy that requires from + For file operations, inward isolation may + be implemented as a virtualization strategy that requires from other sizos compulsory usage - of file prefixes so no other sizo can access internal data of inwardly - isolated sizo. Conversely, outward isolation is compatible with strategy - that assigns prefix for this very sizo, so it can in no way access any - external data, being at the same time exposed to the outside world for any - sizo that have permission to know unique assigned prefix able to access - internal data of sizo in question. To put it simply, strategy for these - types can be described with following points: + of file prefixes so that no other sizo could access internal data of + inwardly isolated sizo. Conversely, outward isolation is compatible with + the strategy that involves assigning a prefix for this very sizo, so it + can in no way access any external data, being at the same time exposed to + the outside world and any sizo that has a permission to know a unique + assigned prefix able to access the internal data of the sizo in question. + To put it simply, strategy for these types can be described with following + points: Inward isolation — requires prefixes for other sizos. - Outward isolation — requires prefix for - itself. + Outward isolation — requires a prefix + for itself. There is an annotation introduced to - declare category for the current sizo: + declare a category for a current sizo: SYNTAX: **assign_sizo_category(inward)**. **assign_sizo_category(outward)**. Consider the example below: name="tests/virtualization.cpp: Doc_IsolationCat_1", lines=15 test = function:: int; entry { seq { context:: assign_sizo(zoneA); assign_sizo_control(files); assign_sizo_category(inward). openFile("test1") } { context:: assign_sizo(zoneB); assign_sizo_control(files); assign_sizo_category(outward). openFile("test1") } } There are two sizos declared in the code - above. Using reasoning apparatus developed in previous sections, both - sizos activate virtualization, for both of them control file resources and - both contain openFile that actually - requires file access. However, this time additional bits of information - are available, namely zoneA and - zoneB declared as inward and - outward, respectively. By strategy outlines above, zoneA - enables prefix based isolation strategy for zoneB, - and zoneB enables isolation for - itself as well. As a result, it's enough to virtualize only one - zone(zoneB) leaving zoneA - to enjoy direct access to file resources. Example output's is shown below, - confirming that direct file access is granted for zoneA: + above. Using the reasoning apparatus developed in the previous sections, + both sizos activate virtualization, for both of them control some file + resources and both contain openFile + that actually requires file access. However, this time additional bits of + information are available, namely zoneA + and zoneB are declared as inward and + outward, respectively. According to the strategy outlined above, + zoneA enables prefix based isolation + strategy for zoneB, and + zoneB enables isolation for itself + as well. As a result, it is enough to virtualize only one zone + (zoneB) leaving zoneA + to enjoy a direct access to file resources. Example outputs are shown + below, confirming that the direct file access is granted to + zoneA: File opened: 'test1' File opened: '1/test1'
\ No newline at end of file diff --git a/pandoctl b/pandoctl index 61a06bf..005ceed 100755 --- a/pandoctl +++ b/pandoctl @@ -1,13 +1,13 @@ #!/bin/bash case $1 in convert) Name=$(basename $2) sed -e '// d' -e 's///g' $2 | \ /opt/pandoc/dist/build/pandoc/pandoc \ -f docbook \ -t ./documentation-tools/doc-converter/remarkup.lua \ - -o /tmp/docs/$Name.remarkup + -o $3/$Name.remarkup ;; - *) echo "usage: $0 convert ";; + *) echo "usage: $0 convert ";; esac diff --git a/vendors/coco/generator/Copyright.frame b/vendors/coco/generator/Copyright.frame new file mode 120000 index 0000000..708b5d5 --- /dev/null +++ b/vendors/coco/generator/Copyright.frame @@ -0,0 +1 @@ +/usr/share/coco-cpp/Copyright.frame \ No newline at end of file diff --git a/vendors/coco/generator/Scanner.frame b/vendors/coco/generator/Scanner.frame new file mode 120000 index 0000000..d1ee132 --- /dev/null +++ b/vendors/coco/generator/Scanner.frame @@ -0,0 +1 @@ +/usr/share/coco-cpp/Scanner.frame \ No newline at end of file