kohana-markdown/classes/Kohana/Markdown.php

1598 lines
44 KiB
PHP

<?php defined('SYSPATH') or die('No direct script access.');
/**
* A text-to-HTML conversion tool for web writers for Kohana 3.
*
* Based on the PHP implementation created by Michel Fortin under the GPL
* license. For more information, please refer to:
* <http://michelf.com/projects/php-markdown/>
*
* Original Markdown concept by John Gruber. Please refer to:
* <http://daringfireball.net/projects/markdown/>
*
* Markdown is a text-to-HTML filter; it translates an easy-to-read /
* easy-to-write structured text format into HTML. Markdown's text format
* is most similar to that of plain text email, and supports features such
* as headers, *emphasis*, code blocks, blockquotes, and links.
*
* Markdown's syntax is designed not as a generic markup language, but
* specifically to serve as a front-end to (X)HTML. You can use span-level
* HTML tags anywhere in a Markdown document, and you can use block level
* HTML tags (like <div> and <table>) as well.
*
* For more information about Markdown's syntax, see:
* <http://daringfireball.net/projects/markdown/>
*
* @author Gary Stidston-Broadbent <kohana_api@stroppytux.net>
* @package Markdown
* @copyright (c) 2010 Unmagnify team
* @license http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt
* @version $id$
* @link http://www.stroppytux.net/projects/kohana-markdown/
* @since Available since Release 1.0
*/
class Kohana_Markdown
{
/* Parameters used for the singleton registry: instances are cached by parser
 * name, and $default names the parser used when none is requested.
 */
public static $instances = array();
public static $default = 'default';
/* Configuration array handed to the constructor */
protected $_config;
/* Regex to use in the Validation class */
public static $validate = '/[[:punct:][:space:]]{0,}/';
/* Maximum nesting depths for balanced (parentheses) in URLs and balanced
 * [brackets]; the matching regexes are built from them in the constructor.
 */
protected $nested_url_parenthesis_depth = 4;
protected $nested_url_parenthesis_re;
protected $nested_brackets_depth = 6;
protected $nested_brackets_re;
/* Characters that may be backslash-escaped, and the character class regex
 * derived from them in the constructor.
 */
protected $escape_chars = '\`*_{}[]()>#+-.!';
protected $escape_chars_re;
/* Empty-element tag suffix. Change to ">" for HTML output. */
protected $suffix = ' />';
protected $tab_width = 4;
/* Change to `true` to disallow markup or entities. */
protected $no_entities = false;
protected $no_markup = false;
/* Predefined urls and titles for reference links and images. */
protected $predef_urls = array();
protected $predef_titles = array();
/* Internal hashes used during transformation. */
protected $urls = array();
protected $titles = array();
protected $html_hashes = array();
/* Status flags to avoid invalid nesting. */
protected $in_anchor = false;
protected $list_level = 0;
/* Document-level passes (method name => priority): strip link definitions
 * and store them in hashes, then run the block gamut.
 */
protected $document_gamut = array(
"strip_link_definitions"=> 20,
"run_basic_block_gamut" => 30,
);
/* Transformations to form block-level tags. eg. p, h1,h2,h3, ol,ul. */
protected $block_gamut = array(
"do_headers" => 10,
"do_horizontal_rules" => 20,
"do_lists" => 40,
"do_code_blocks" => 50,
"do_block_quotes" => 60,
);
/* Transformations that occur *within* block-level tags.
 * NOTE(review): "parseSpan" and "doAutoLinks" are camelCase while every other
 * gamut entry is snake_case -- confirm methods with exactly these names exist
 * in the part of the class not shown here.
 */
protected $span_gamut = array(
/* Process char escapes, code spans, and inline HTML in one shot. */
"parseSpan" => -30,
/* Process anchor and image tags. Images must come first, because
* ![foo][f] looks like an anchor.
*/
"do_images" => 10,
"do_anchors" => 20,
/* Make links out of things like `<http://example.com/>`. Must come
* after do_anchors, because you can use < and > delimiters in inline
* links like [this](<url>).
*/
"doAutoLinks" => 30,
"encode_amps_and_angles" => 40,
"do_italics_and_bold" => 50,
"do_hard_breaks" => 60,
);
/* Emphasis token regexes, keyed by the marker that is currently open
 * ('' = nothing open, so the entry matches an opening marker).
 */
protected $em_relist = array(
'' => '(?:(?<!\*)\*(?!\*)|(?<!_)_(?!_))(?=\S|$)(?![.,:;]\s)',
'*' => '(?<=\S|^)(?<!\*)\*(?!\*)',
'_' => '(?<=\S|^)(?<!_)_(?!_)',
);
/* Same layout as $em_relist, for two-character (strong) markers. */
protected $strong_relist = array(
'' => '(?:(?<!\*)\*\*(?!\*)|(?<!_)__(?!_))(?=\S|$)(?![.,:;]\s)',
'**' => '(?<=\S|^)(?<!\*)\*\*(?!\*)',
'__' => '(?<=\S|^)(?<!_)__(?!_)',
);
/* Same layout as $em_relist, for three-character (em + strong) markers. */
protected $em_strong_relist = array(
'' => '(?:(?<!\*)\*\*\*(?!\*)|(?<!_)___(?!_))(?=\S|$)(?![.,:;]\s)',
'***' => '(?<=\S|^)(?<!\*)\*\*\*(?!\*)',
'___' => '(?<=\S|^)(?<!_)___(?!_)',
);
/* Combined token regexes, filled in by prepare_italics_and_bold(). */
protected $em_strong_prepared_relist;
/* String length function for detab. `init_detab` will create a function to
* handle UTF-8 if the default function does not exist.
*/
protected $utf8_strlen = 'mb_strlen';
/**
* Constructs the markdown parser. This method cannot be invoked externally.
* The markdown parser must be instantiated using `Markdown::instance()`.
*
* Copies the configuration into the parser's properties, builds the
* regular expressions that depend on the configured nesting depths and
* escape characters, and sorts the three gamuts by priority.
*
* @param array config Parser settings; the keys read here are `tab_width`,
*                     `no_entities` and `no_markup`.
* @throws Kohana_Markdown_Exception
*/
protected function __construct(array $config)
{
/* Load our configuration options */
$this->_config = $config;
/* NOTE(review): this compares the `tab_width` setting with 'html', which
 * looks like the wrong config key -- a numeric tab width will not equal
 * 'html', so the suffix effectively always stays ' />'. Confirm which key
 * is meant to select HTML vs. XHTML output before changing it.
 */
$this->suffix = ($this->_config['tab_width'] == 'html') ? '>' : ' />';
$this->tab_width = $this->_config['tab_width'];
$this->no_entities = $this->_config['no_entities'];
$this->no_markup = $this->_config['no_markup'];
$this->init_detab();
$this->prepare_italics_and_bold();
/* Regex matching balanced [brackets] up to $nested_brackets_depth levels */
$this->nested_brackets_re =
str_repeat('(?>[^\[\]]+|\[', $this->nested_brackets_depth).
str_repeat('\])*', $this->nested_brackets_depth);
/* Regex matching balanced (parentheses) inside a URL, without whitespace */
$this->nested_url_parenthesis_re =
str_repeat('(?>[^()\s]+|\(', $this->nested_url_parenthesis_depth).
str_repeat('(?>\)))*', $this->nested_url_parenthesis_depth);
/* Character class matching any single escapable character */
$this->escape_chars_re = '['.preg_quote($this->escape_chars).']';
/* Sort document, block, and span gamut in ascending priority order. */
asort($this->document_gamut);
asort($this->block_gamut);
asort($this->span_gamut);
}
/**
 * Returns the markdown parser registered under the given name, creating and
 * caching it on first use. When no name is supplied, the name stored in
 * `Markdown::$default` is used.
 *
 *     // Create an instance of the default parser
 *     $default_parser = Markdown::instance();
 *
 *     // Create an instance of a parser
 *     $foo_parser = Markdown::instance('foo');
 *
 *     // Access an instantiated parser directly
 *     $foo_parser = Markdown::$instances['default'];
 *
 * @param string the name of the markdown parser to use [Optional]
 * @return Markdown
 * @throws Kohana_Markdown_Exception when the `markdown` config group has no
 *         entry for the requested parser name
 */
public static function instance($parser = NULL)
{
    /* Fall back to the default parser name when none was given */
    $name = ($parser === NULL) ? Markdown::$default : $parser;

    /* Only build the parser the first time this name is requested */
    if ( ! isset(Markdown::$instances[$name]))
    {
        $settings = Kohana::$config->load('markdown');
        if ( ! $settings->offsetExists($name))
        {
            throw new Kohana_Markdown_Exception('Failed to load Kohana Markdown parser: :parser', array(':parser' => $name));
        }
        Markdown::$instances[$name] = new Markdown($settings->get($name));
    }

    return Markdown::$instances[$name];
}
/**
 * Refuses to clone a parser: any attempt to clone throws.
 *
 * @return void
 * @throws Kohana_Markdown_Exception always
 */
public function __clone()
{
    $message = 'Cloning of Kohana_Markdown objects is forbidden';
    throw new Kohana_Markdown_Exception($message);
}
/**
 * Called before the transformation process starts to set up parser state:
 * the url/title tables are seeded from the predefined ones, the HTML hash
 * table is emptied and the anchor-nesting flag is cleared.
 *
 * @return void
 */
protected function before()
{
    /* Start from the predefined reference links and an empty hash table */
    $this->urls = $this->predef_urls;
    $this->titles = $this->predef_titles;
    $this->html_hashes = array();
    /* Reset the property itself; a bare `$in_anchor` would only create an
     * unused local variable and leave the flag untouched. */
    $this->in_anchor = false;
}
/**
 * Called after the transformation process: empties the url, title and HTML
 * hash tables so they do not keep holding memory between runs.
 *
 * @return void
 */
protected function after()
{
    $this->urls = $this->titles = $this->html_hashes = array();
}
/**
 * Main entry point. Normalises the input text and then passes it through
 * every method listed in the document gamut, in order.
 *
 * @param string The markdown content to be transformed into (x)html.
 * @return string The transformed (x)html, followed by one newline.
 */
public function transform($text)
{
    /* Reset per-run state */
    $this->before();

    /* Drop a leading UTF-8 BOM and every \x1A byte (that byte is reserved
     * for the hash tokens generated by hash_part) */
    $text = preg_replace('{^\xEF\xBB\xBF|\x1A}', '', $text);
    /* Normalise DOS and Mac line endings to Unix */
    $text = preg_replace('{\r\n?}', "\n", $text);
    /* Guarantee two trailing newlines, then convert tabs to spaces */
    $text = $this->detab($text . "\n\n");
    /* Replace block-level HTML with hash tokens */
    $text = $this->hash_HTML_blocks($text);
    /* Empty out lines made only of spaces, so later patterns can detect
     * blank lines with a plain /\n+/ */
    $text = preg_replace('/^[ ]+$/m', '', $text);

    /* Run each document gamut step by method name */
    foreach (array_keys($this->document_gamut) as $step) {
        $text = $this->$step($text);
    }

    /* Release per-run state */
    $this->after();

    return $text . "\n";
}
/**
 * Removes link definitions of the form `[id]: url "optional title"` from
 * the text. Each definition found is recorded by
 * _strip_link_definitions_callback() and replaced with an empty string.
 *
 * @param string The markdown getting transformed.
 * @return string The text without its link definitions.
 */
protected function strip_link_definitions($text)
{
    /* A definition may be indented by at most tab_width - 1 spaces */
    $max_indent = $this->tab_width - 1;
    $definition_re = '{
^[ ]{0,'.$max_indent.'}\[(.+)\][ ]?: # id = $1
[ ]*
\n? # maybe *one* newline
[ ]*
(?:
<(.+?)> # url = $2
|
(\S+?) # url = $3
)
[ ]*
\n? # maybe one newline
[ ]*
(?:
(?<=\s) # lookbehind for whitespace
["(]
(.*?) # title = $4
[")]
[ ]*
)? # title is optional
(?:\n+|\Z)
}xm';
    $recorder = array($this, '_strip_link_definitions_callback');
    return preg_replace_callback($definition_re, $recorder, $text);
}
/**
 * Records one link definition matched by strip_link_definitions().
 *
 * @param array Regex match: [1] id, [2] url in <angle brackets>,
 *              [3] bare url, [4] optional title.
 * @return string Always '', so the definition disappears from the text.
 */
protected function _strip_link_definitions_callback($matches)
{
    /* Ids are stored lower-cased */
    $id = strtolower($matches[1]);
    /* Prefer the <bracketed> url; fall back to the bare one */
    $this->urls[$id] = ($matches[2] == '') ? $matches[3] : $matches[2];
    /* The title group is absent from $matches when no title was given */
    $this->titles[$id] = isset($matches[4]) ? $matches[4] : NULL;
    return '';
}
/**
 * Hashify HTML blocks:
 * We only want to do this for block-level HTML tags, such as headers,
 * lists, and tables. That's because we still want to wrap <p>s around
 * "paragraphs" that are wrapped in non-block-level tags, such as anchors,
 * phrase emphasis, and spans. The list of tags we're looking for is
 * hard-coded:
 *
 * * List "a" is made of tags which can be both inline or block-level.
 *   These will be treated block-level when the start tag is alone on
 *   its line, otherwise they're not matched here and will be taken as
 *   inline later.
 * * List "b" is made of tags which are always block-level;
 *
 * Standalone <hr> tags, HTML comments and `<? ... ?>` / `<% ... %>`
 * processing instructions are matched by dedicated alternatives.
 *
 * Returns the text untouched when the `no_markup` option is on.
 *
 * @param string The markdown getting transformed.
 * @return string The text with every matched HTML block replaced by its
 *                hash token, surrounded by blank lines.
 */
protected function hash_HTML_blocks($text)
{
    if ($this->no_markup) return $text;
    $less_than_tab = $this->tab_width - 1;
    $block_tags_a_re = 'ins|del';
    $block_tags_b_re = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|'.
        'script|noscript|form|fieldset|iframe|math';
    /* Regular expression for the content of a block tag. */
    $nested_tags_level = 4;
    $attr = '
(?> # optional tag attributes
\s # starts with whitespace
(?>
[^>"/]+ # text outside quotes
|
/+(?!>) # slash not followed by ">"
|
"[^"]*" # text inside double quotes (tolerate ">")
|
\'[^\']*\' # text inside single quotes (tolerate ">")
)*
)?
';
    $content =
        str_repeat('
(?>
[^<]+ # content without tag
|
<\2 # nested opening tag
'.$attr.' # attributes
(?>
/>
|
>', $nested_tags_level). # end of opening tag
        '.*?'. # last level nested tag content
        str_repeat('
</\2\s*> # closing nested tag
)
|
<(?!/\2\s*> # other tags with a different name
)
)*',
        $nested_tags_level);
    /* Same content pattern, but referring to the group "a" tag name ($3) */
    $content2 = str_replace('\2', '\3', $content);
    /* First, look for nested blocks, e.g.:
     * <div>
     * <div>
     * tags for inner block must be indented.
     * </div>
     * </div>
     *
     * The outermost tags must start at the left margin for this to match, and
     * the inner nested divs must be indented.
     * We need to do this before the next, more liberal match, because the next
     * match will start at the first `<div>` and stop at the first `</div>`.
     *
     * Capture groups are numbered across the whole alternation: $2 = group
     * "b" tag, $3 = group "a" tag, $4 = hr, $5 = processing-instruction
     * delimiter. The processing-instruction branch therefore has to
     * back-reference \5; a back reference to a group that did not take part
     * in the match (such as \2 in that branch) always fails, which would
     * make the branch unmatchable.
     */
    $text = preg_replace_callback('{(?>
(?>
(?<=\n\n) # Starting after a blank line
| # or
\A\n? # the beginning of the doc
)
( # save in $1
# Match from `\n<tag>` to `</tag>\n`, handling nested tags
# in between.
[ ]{0,'.$less_than_tab.'}
<('.$block_tags_b_re.')# start tag = $2
'.$attr.'> # attributes followed by > and \n
'.$content.' # content, support nesting
</\2> # the matching end tag
[ ]* # trailing spaces/tabs
(?=\n+|\Z) # followed by a newline or end of document
| # Special version for tags of group a.
[ ]{0,'.$less_than_tab.'}
<('.$block_tags_a_re.')# start tag = $3
'.$attr.'>[ ]*\n # attributes followed by >
'.$content2.' # content, support nesting
</\3> # the matching end tag
[ ]* # trailing spaces/tabs
(?=\n+|\Z) # followed by a newline or end of document
| # Special case just for <hr />. It was easier to make a special
# case than to make the other regex more complicated.
[ ]{0,'.$less_than_tab.'}
<(hr) # start tag = $4
'.$attr.' # attributes
/?> # the matching end tag
[ ]*
(?=\n{2,}|\Z) # followed by a blank line or end of document
| # Special case for standalone HTML comments:
[ ]{0,'.$less_than_tab.'}
(?s:
<!-- .*? -->
)
[ ]*
(?=\n{2,}|\Z) # followed by a blank line or end of document
| # PHP and ASP-style processor instructions (<? and <%)
[ ]{0,'.$less_than_tab.'}
(?s:
<([?%]) # $5
.*?
\5>
)
[ ]*
(?=\n{2,}|\Z) # followed by a blank line or end of document
)
)}Sxmi',
        array($this, '_hash_HTML_blocks_callback'),
        $text);
    return $text;
}
/**
 * Replaces one HTML block matched by hash_HTML_blocks() with its block
 * hash token, padded with blank lines on both sides.
 *
 * @param array Regex match; [1] is the HTML block.
 * @return string The replacement text.
 */
protected function _hash_HTML_blocks_callback($matches)
{
    return "\n\n" . $this->hash_block($matches[1]) . "\n\n";
}
/**
 * Stores a piece of generated output in the hash table and returns the
 * unique token that stands in for it in the text stream. The token is
 * swapped back for the stored text when the text is unhashed.
 *
 * The $boundary argument specifies what character should surround the
 * token. By convention, "B" is used for block elements that need not be
 * wrapped into paragraph tags at the end, ":" is used for elements that
 * are word separators and "X" is used in the general case.
 *
 * @param string The text to store.
 * @param string The boundary character to surround the token with.
 * @return string The token that replaces the text.
 */
protected function hash_part($text, $boundary = 'X')
{
    /* Counter shared by every call, so each token gets a fresh number */
    static $counter = 0;

    /* Expand any tokens already inside the text first, so the final
     * unhash does not need to run multiple times */
    $expanded = $this->unhash($text);

    $counter++;
    $token = $boundary . "\x1A" . $counter . $boundary;
    $this->html_hashes[$token] = $expanded;

    return $token;
}
/**
 * Shortcut for hash_part() using the block-level boundary character "B".
 *
 * @param string The text to store.
 * @return string The token that replaces the text.
 */
protected function hash_block($text)
{
    $block_boundary = 'B';
    return $this->hash_part($text, $block_boundary);
}
/**
 * Runs the block gamut transformations on a piece of text, hashing its raw
 * HTML blocks first.
 *
 * The HTML hashing has to be repeated for every block rather than done once
 * for the whole document: blocks nested in list items may have been
 * indented, and an earlier hash_HTML_blocks pass would have seen such
 * indented HTML as a code block.
 *
 * @param string The markdown getting transformed.
 * @return string The transformed text.
 */
protected function run_block_gamut($text)
{
    return $this->run_basic_block_gamut($this->hash_HTML_blocks($text));
}
/**
 * Runs the block gamut transformations without hashing HTML blocks first.
 * Useful when the HTML blocks are already hashed, as in the first
 * whole-document pass.
 *
 * @param string The markdown getting transformed.
 * @return string The transformed text, with paragraphs formed.
 */
protected function run_basic_block_gamut($text)
{
    /* Apply each block-level step by method name */
    foreach (array_keys($this->block_gamut) as $step) {
        $text = $this->$step($text);
    }

    /* Whatever is left over becomes paragraphs */
    return $this->form_paragraphs($text);
}
/**
 * Replaces horizontal rule lines (three or more `-`, `*` or `_` markers of
 * the same kind, optionally separated by spaces) with a hashed <hr> tag.
 *
 * @param string The markdown getting transformed.
 * @return string The text with rules replaced by a block hash token.
 */
protected function do_horizontal_rules($text)
{
    $rule_re = '{
^[ ]{0,3} # Leading space
([-*_]) # $1: First marker
(?> # Repeated marker group
[ ]{0,2} # Zero, one, or two spaces.
\1 # Marker character
){2,} # Group repeated at least twice
[ ]* # Tailing spaces
$ # End of line.
}mx';
    /* One token is generated up front and reused for every rule found */
    $hr_token = $this->hash_block('<hr' . $this->suffix);
    return preg_replace($rule_re, "\n" . $hr_token . "\n", $text);
}
/**
 * Runs every span gamut transformation on a piece of text, in order.
 *
 * @param string The markdown getting transformed.
 * @return string The transformed text.
 */
protected function run_span_gamut($text)
{
    /* Apply each span-level step by method name */
    foreach (array_keys($this->span_gamut) as $step) {
        $text = $this->$step($text);
    }
    return $text;
}
/**
 * Turns two or more spaces at the end of a line into a hard line break.
 *
 * @param string The markdown getting transformed.
 * @return string The text with hard breaks replaced by hash tokens.
 */
protected function do_hard_breaks($text)
{
    $break_re = '/ {2,}\n/';
    $handler = array($this, '_do_hard_breaks_callback');
    return preg_replace_callback($break_re, $handler, $text);
}
/**
 * Produces the hashed <br> tag (plus newline) for one hard break.
 *
 * @param array Regex match (unused).
 * @return string The hash token for the break.
 */
protected function _do_hard_breaks_callback($matches)
{
    $break = '<br' . $this->suffix . "\n";
    return $this->hash_part($break);
}
/**
 * Turn Markdown link shortcuts into (x)html <a> tags. Handles, in this
 * order: reference links `[text] [id]`, inline links
 * `[text](url "title")` and shortcut references `[text]`.
 *
 * Does nothing while another do_anchors call is in progress (the link
 * callbacks run the span gamut on the link text, which would otherwise
 * nest anchors).
 *
 * @param string The markdown getting transformed.
 * @return string The text with links replaced by hash tokens.
 */
function do_anchors($text)
{
    if ($this->in_anchor) return $text;
    $this->in_anchor = true;

    /* Reference-style links: [link text] [id] */
    $reference_re = '{
( # wrap whole match in $1
\[
('.$this->nested_brackets_re.') # link text = $2
\]
[ ]? # one optional space
(?:\n[ ]*)? # one optional newline followed by spaces
\[
(.*?) # id = $3
\]
)
}xs';
    /* Inline-style links: [link text](url "optional title") */
    $inline_re = '{
( # wrap whole match in $1
\[
('.$this->nested_brackets_re.') # link text = $2
\]
\( # literal paren
[ \n]*
(?:
<(.+?)> # href = $3
|
('.$this->nested_url_parenthesis_re.') # href = $4
)
[ \n]*
( # $5
([\'"]) # quote char = $6
(.*?) # Title = $7
\6 # matching quote
[ \n]* # ignore any spaces/tabs between closing quote and )
)? # title is optional
\)
)
}xs';
    /* Reference-style shortcuts: [link text]. These must run last in case
     * the text also has [link text][1] or [link text](/foo). */
    $shortcut_re = '{
( # wrap whole match in $1
\[
([^\[\]]+) # link text = $2; can\'t contain [ or ]
\]
)
}xs';

    $text = preg_replace_callback($reference_re,
        array($this, '_do_anchors_reference_callback'), $text);
    $text = preg_replace_callback($inline_re,
        array($this, '_do_anchors_inline_callback'), $text);
    $text = preg_replace_callback($shortcut_re,
        array($this, '_do_anchors_reference_callback'), $text);

    $this->in_anchor = false;
    return $text;
}
/**
 * Builds the <a> tag for one reference-style link or shortcut.
 *
 * @param array Regex match: [1] whole match, [2] link text,
 *              [3] link id (may be missing or empty).
 * @return string The hash token for the link, or the original text when
 *                no url is stored under the link id.
 */
protected function _do_anchors_reference_callback($matches)
{
    $label = $matches[2];

    /* Shortcut forms like [this][] or [this] use the link text as the id */
    $id = (isset($matches[3]) && $matches[3] != "") ? $matches[3] : $label;
    /* Lower-case and turn embedded newlines into spaces */
    $id = preg_replace('{[ ]?\n}', ' ', strtolower($id));

    /* Unknown id: leave the source text as it was */
    if ( ! isset($this->urls[$id])) {
        return $matches[1];
    }

    $href = $this->encode_attribute($this->urls[$id]);
    $tag = "<a href=\"$href\"";
    if (isset($this->titles[$id])) {
        $tooltip = $this->encode_attribute($this->titles[$id]);
        $tag .= " title=\"$tooltip\"";
    }
    $label = $this->run_span_gamut($label);
    $tag .= ">$label</a>";

    return $this->hash_part($tag);
}
/**
 * Builds the <a> tag for one inline-style link.
 *
 * @param array Regex match: [2] link text, [3] url in <angle brackets>,
 *              [4] bare url, [7] optional title.
 * @return string The hash token for the link.
 */
protected function _do_anchors_inline_callback($matches)
{
    $label = $this->run_span_gamut($matches[2]);
    /* Prefer the <bracketed> url; fall back to the bare one */
    $href = ($matches[3] == '') ? $matches[4] : $matches[3];
    $href = $this->encode_attribute($href);

    $tag = "<a href=\"$href\"";
    /* The title group is absent from $matches when no title was given */
    if (isset($matches[7])) {
        $tooltip = $this->encode_attribute($matches[7]);
        $tag .= " title=\"$tooltip\"";
    }

    /* The link text is passed through the span gamut a second time here,
     * exactly as before.
     * NOTE(review): this second pass looks redundant -- confirm before
     * removing it. */
    $label = $this->run_span_gamut($label);
    $tag .= ">$label</a>";

    return $this->hash_part($tag);
}
/**
 * Turn Markdown image shortcuts into <img> tags. Handles reference-style
 * images `![alt text][id]` first, then inline images
 * `![alt text](url "optional title")`.
 *
 * @param string The markdown getting transformed.
 * @return string The text with images replaced by hash tokens.
 */
protected function do_images($text)
{
    /* Reference-style labeled images: ![alt text][id] */
    $reference_re = '{
( # wrap whole match in $1
!\[
('.$this->nested_brackets_re.') # alt text = $2
\]
[ ]? # one optional space
(?:\n[ ]*)? # one optional newline followed by spaces
\[
(.*?) # id = $3
\]
)
}xs';
    /* Inline images: ![alt text](url "optional title") */
    $inline_re = '{
( # wrap whole match in $1
!\[
('.$this->nested_brackets_re.') # alt text = $2
\]
\s? # One optional whitespace character
\( # literal paren
[ \n]*
(?:
<(\S*)> # src url = $3
|
('.$this->nested_url_parenthesis_re.') # src url = $4
)
[ \n]*
( # $5
([\'"]) # quote char = $6
(.*?) # title = $7
\6 # matching quote
[ \n]*
)? # title is optional
\)
)
}xs';

    $text = preg_replace_callback($reference_re,
        array($this, '_do_images_reference_callback'), $text);
    $text = preg_replace_callback($inline_re,
        array($this, '_do_images_inline_callback'), $text);

    return $text;
}
/**
 * Builds the <img> tag for one reference-style image.
 *
 * @param array Regex match: [1] whole match, [2] alt text, [3] link id.
 * @return string The hash token for the image, or the original text when
 *                no url is stored under the link id.
 */
protected function _do_images_reference_callback($matches)
{
    $alt = $matches[2];

    /* Shortcut form ![this][] uses the alt text as the id */
    $id = strtolower($matches[3]);
    if ($id == "") {
        $id = strtolower($alt);
    }

    $alt = $this->encode_attribute($alt);

    /* If there's no such link id, leave the source text intact */
    if ( ! isset($this->urls[$id])) {
        return $matches[1];
    }

    $src = $this->encode_attribute($this->urls[$id]);
    $tag = "<img src=\"$src\" alt=\"$alt\"";
    if (isset($this->titles[$id])) {
        $tooltip = $this->encode_attribute($this->titles[$id]);
        $tag .= " title=\"$tooltip\"";
    }
    $tag .= $this->suffix;

    return $this->hash_part($tag);
}
/**
 * Builds the <img> tag for one inline image.
 *
 * @param array Regex match: [2] alt text, [3] url in <angle brackets>,
 *              [4] bare url, [7] optional title.
 * @return string The hash token for the image.
 */
protected function _do_images_inline_callback($matches)
{
    /* Prefer the <bracketed> url; fall back to the bare one */
    $src = ($matches[3] == '') ? $matches[4] : $matches[3];

    $alt = $this->encode_attribute($matches[2]);
    $src = $this->encode_attribute($src);
    $tag = "<img src=\"$src\" alt=\"$alt\"";

    /* The title group is absent from $matches when no title was given */
    if (isset($matches[7])) {
        $tooltip = $this->encode_attribute($matches[7]);
        $tag .= " title=\"$tooltip\"";
    }
    $tag .= $this->suffix;

    return $this->hash_part($tag);
}
/**
 * Turn Markdown headers into (x)html h[1-6] tags.
 *
 * Setext-style headers are handled first:
 *     Header 1
 *     ========
 *
 *     Header 2
 *     --------
 *
 * followed by atx-style headers:
 *     # Header 1
 *     ## Header 2
 *     ## Header 2 with closing hashes ##
 *     ...
 *     ###### Header 6
 *
 * @param string The markdown getting transformed.
 * @return string The text with headers replaced by block hash tokens.
 */
protected function do_headers($text)
{
    $setext_re = '{ ^(.+?)[ ]*\n(=+|-+)[ ]*\n+ }mx';
    $atx_re = '{
^(\#{1,6}) # $1 = string of #\'s
[ ]*
(.+?) # $2 = Header text
[ ]*
\#* # optional closing #\'s (not counted)
\n+
}xm';

    $text = preg_replace_callback($setext_re,
        array($this, '_do_headers_callback_setext'), $text);
    $text = preg_replace_callback($atx_re,
        array($this, '_do_headers_callback_atx'), $text);

    return $text;
}
/**
 * Builds the header tag for one setext-style header: <h1> when the
 * underline is made of `=`, <h2> when it is made of `-`.
 *
 * @param array Regex match: [0] whole match, [1] header text,
 *              [2] underline.
 * @return string The block hash token for the header, or the untouched
 *                match when it is really an empty list item.
 */
protected function _do_headers_callback_setext($matches)
{
    /* Terrible hack to check we haven't found an empty list item: a line
     * starting with "- " (or a lone "-") followed by a single "-" line. */
    if ($matches[2] == '-' && preg_match('{^-(?: |$)}', $matches[1])) {
        return $matches[0];
    }

    /* Square-bracket string offset: the curly-brace form `$str{0}` is
     * deprecated in PHP 7.4 and a parse error from PHP 8.0. */
    $level = ($matches[2][0] == '=') ? 1 : 2;
    $block = "<h$level>".$this->run_span_gamut($matches[1])."</h$level>";

    return "\n" . $this->hash_block($block) . "\n\n";
}
/**
 * Builds the header tag for one atx-style header; the level equals the
 * number of leading `#` characters.
 *
 * @param array Regex match: [1] leading hashes, [2] header text.
 * @return string The block hash token for the header.
 */
protected function _do_headers_callback_atx($matches)
{
    $depth = strlen($matches[1]);
    $inner = $this->run_span_gamut($matches[2]);
    $token = $this->hash_block("<h$depth>" . $inner . "</h$depth>");

    return "\n" . $token . "\n\n";
}
/**
 * Form HTML ordered (numbered) and unordered (bulleted) lists. Unordered
 * lists are processed first, then ordered ones; each whole list found is
 * handed to _do_lists_callback().
 *
 * @param string The markdown getting transformed.
 * @return string The text with lists replaced by block hash tokens.
 */
protected function do_lists($text)
{
    $less_than_tab = $this->tab_width - 1;

    /* Re-usable patterns to match list item bullets and number markers */
    $marker_ul_re = '[*+-]';
    $marker_ol_re = '\d+[.]';

    /* Each list kind is paired with the "other" kind, whose marker ends
     * the list */
    $markers_relist = array(
        $marker_ul_re => $marker_ol_re,
        $marker_ol_re => $marker_ul_re,
    );
    $handler = array($this, '_do_lists_callback');

    foreach ($markers_relist as $marker_re => $other_marker_re) {
        /* Re-usable pattern to match any entire ul or ol list */
        $whole_list_re = '
( # $1 = whole list
( # $2
([ ]{0,'.$less_than_tab.'}) # $3 = number of spaces
('.$marker_re.') # $4 = first list item marker
[ ]+
)
(?s:.+?)
( # $5
\z
|
\n{2,}
(?=\S)
(?! # Negative lookahead for another list item marker
[ ]*
'.$marker_re.'[ ]+
)
|
(?= # Lookahead for another kind of list
\n
\3 # Must have the same indentation
'.$other_marker_re.'[ ]+
)
)
)
';
        /* Nested lists may start on any line; top-level lists must follow
         * a blank line or the start of the text. */
        if ($this->list_level) {
            $list_re = '{
^
'.$whole_list_re.'
}mx';
        } else {
            $list_re = '{
(?:(?<=\n)\n|\A\n?) # Must eat the newline
'.$whole_list_re.'
}mx';
        }
        $text = preg_replace_callback($list_re, $handler, $text);
    }

    return $text;
}
/**
 * Converts one whole list matched by do_lists() into a hashed <ul> or
 * <ol> block.
 *
 * @param array Regex match: [1] whole list, [4] first item marker.
 * @return string The block hash token for the list, padded with newlines.
 */
protected function _do_lists_callback($matches)
{
    /* Re-usable patterns to match list item bullets and number markers */
    $marker_ul_re = '[*+-]';
    $marker_ol_re = '\d+[.]';

    /* The first marker decides the list type and which markers split items */
    $is_unordered = preg_match("/$marker_ul_re/", $matches[4]);
    $list_type = $is_unordered ? "ul" : "ol";
    $item_marker_re = $is_unordered ? $marker_ul_re : $marker_ol_re;

    $items = $this->process_list_items($matches[1] . "\n", $item_marker_re);
    $token = $this->hash_block("<$list_type>\n" . $items . "</$list_type>");

    return "\n". $token ."\n\n";
}
/**
 * Process the contents of a single ordered or unordered list, splitting it
 * into individual list items.
 *
 * $this->list_level records how deeply nested in lists the parser is: it
 * is incremented on entry here and decremented on exit, so zero means
 * "not inside a list". do_lists() relies on it because, outside a list,
 * text such as
 *
 *     I recommend upgrading to version
 *     8. Oops, now this line is treated
 *     as a sub-list.
 *
 * must stay a single paragraph even though its second line starts with a
 * digit-period-space sequence, whereas inside a list the same line starts
 * a sub-list.
 *
 * @param string The list text.
 * @param string Regex matching the item markers of this list.
 * @return string The list text with every item wrapped in <li> tags.
 */
protected function process_list_items($list_str, $marker_any_re)
{
    $this->list_level++;

    /* Trim trailing blank lines */
    $list_str = preg_replace("/\n{2,}\\z/", "\n", $list_str);

    $item_re = '{
(\n)? # leading line = $1
(^[ ]*) # leading whitespace = $2
('.$marker_any_re.' # list marker and space = $3
(?:[ ]+|(?=\n)) # space only required if item is not empty
)
((?s:.*?)) # list item text = $4
(?:(\n+(?=\n))|\n) # tailing blank line = $5
(?= \n* (\z | \2 ('.$marker_any_re.') (?:[ ]+|(?=\n))))
}xm';
    $list_str = preg_replace_callback($item_re,
        array($this, '_process_list_items_callback'), $list_str);

    $this->list_level--;
    return $list_str;
}
/**
 * Converts one list item matched by process_list_items() into an <li>.
 *
 * Items separated from their neighbours by blank lines, or containing a
 * blank line, go through the block gamut; other items only get sub-list
 * and span processing.
 *
 * @param array Regex match: [1] leading blank line, [2] leading
 *              whitespace, [3] marker and space, [4] item text,
 *              [5] trailing blank line.
 * @return string The <li> element followed by a newline.
 */
protected function _process_list_items_callback($matches)
{
    $body = $matches[4];
    $is_loose = ! empty($matches[1])
        || ! empty($matches[5])
        || preg_match('/\n{2,}/', $body);

    if ($is_loose) {
        /* Replace the marker with spaces of the same width so the item
         * text keeps its indentation before being outdented */
        $padding = str_repeat(' ', strlen($matches[3]));
        $body = $matches[2] . $padding . $body;
        $body = $this->run_block_gamut($this->outdent($body)."\n");
    } else {
        /* Recursion for sub-lists */
        $body = $this->do_lists($this->outdent($body));
        $body = preg_replace('/\n+$/', '', $body);
        $body = $this->run_span_gamut($body);
    }

    return "<li>" . $body . "</li>\n";
}
/**
 * Process Markdown `<pre><code>` blocks: runs of lines indented by at
 * least tab_width spaces that follow a blank line or start the text.
 *
 * @param string The markdown getting transformed.
 * @return string The text with code blocks replaced by block hash tokens.
 */
protected function do_code_blocks($text)
{
    $code_block_re = '{
(?:\n\n|\A\n?)
( # $1 = the code block -- one or more lines, starting with a space/tab
(?>
[ ]{'.$this->tab_width.'} # Lines must start with a tab or a tab-width of spaces
.*\n+
)+
)
((?=^[ ]{0,'.$this->tab_width.'}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
}xm';
    $handler = array($this, '_do_code_blocks_callback');
    return preg_replace_callback($code_block_re, $handler, $text);
}
/**
 * Converts one indented code block matched by do_code_blocks() into a
 * hashed <pre><code> element.
 *
 * @param array Regex match; [1] is the indented code.
 * @return string The block hash token, padded with blank lines.
 */
protected function _do_code_blocks_callback($matches)
{
    /* Remove one indentation level, then escape HTML special characters
     * (quotes are left alone) */
    $code = htmlspecialchars($this->outdent($matches[1]), ENT_NOQUOTES);
    /* Trim leading and trailing newlines */
    $code = preg_replace('/\A\n+|\n+\z/', '', $code);

    $html = '<pre><code>' . $code . "\n</code></pre>";
    return "\n\n" . $this->hash_block($html) . "\n\n";
}
/**
 * Creates the hashed <code> element for an inline code span. The code is
 * trimmed and its HTML special characters (except quotes) are escaped.
 *
 * @param string The raw code.
 * @return string The hash token for the code span.
 */
protected function make_code_span($code)
{
    $escaped = htmlspecialchars(trim($code), ENT_NOQUOTES);
    return $this->hash_part('<code>' . $escaped . '</code>');
}
/**
 * Prepares the regular expressions used to search for emphasis tokens in
 * every context. One expression is stored in $em_strong_prepared_relist
 * for each combination of open em marker and open strong marker, keyed by
 * the two markers concatenated.
 *
 * @return void
 */
protected function prepare_italics_and_bold()
{
    foreach ($this->em_relist as $em => $em_re) {
        foreach ($this->strong_relist as $strong => $strong_re) {
            $context = "$em$strong";

            /* Allowed tokens; the three-character token, when one exists
             * for this context, has to come first in the alternation */
            $alternatives = array($em_re, $strong_re);
            if (isset($this->em_strong_relist[$context])) {
                array_unshift($alternatives, $this->em_strong_relist[$context]);
            }

            /* Master expression capturing whichever token matches */
            $this->em_strong_prepared_relist[$context] =
                '{('. implode('|', $alternatives) .')}';
        }
    }
}
/**
 * Converts `*em*`, `**strong**` and `***both***` markers (and their `_`
 * equivalents) into hashed <em> / <strong> elements.
 *
 * The text is split at one emphasis token at a time. Two parallel stacks
 * are kept: $token_stack holds the currently open markers and $text_stack
 * the text collected since each of them was opened (index 0 is the
 * innermost). $em and $strong hold the open marker of each kind and select
 * which prepared token regex applies next.
 *
 * @param string The markdown getting transformed.
 * @return string The text with emphasis replaced by hash tokens.
 */
protected function do_italics_and_bold($text)
{
    $token_stack = array('');
    $text_stack = array('');
    $em = '';
    $strong = '';
    $tree_char_em = false;
    while (1) {
        /* Get prepared regular expression for searching emphasis tokens in
           the current context. */
        $token_re = $this->em_strong_prepared_relist["$em$strong"];
        /* Each loop iteration searches for the next emphasis token:
           $parts[0] is the text before it, $parts[1] the token and
           $parts[2] the remaining text. */
        $parts = preg_split($token_re, $text, 2, PREG_SPLIT_DELIM_CAPTURE);
        $text_stack[0] .= $parts[0];
        $token =& $parts[1];
        $text =& $parts[2];
        if (empty($token)) {
            /* Reached end of text span: empty the stacks without emitting
               any more emphasis; unclosed markers go back in as text. */
            while ($token_stack[0]) {
                $text_stack[1] .= array_shift($token_stack);
                $text_stack[0] .= array_shift($text_stack);
            }
            break;
        }
        $token_len = strlen($token);
        if ($tree_char_em) {
            /* Reached closing marker while inside a three-char emphasis. */
            if ($token_len == 3) {
                /* Three-char closing marker, close em and strong. */
                array_shift($token_stack);
                $span = array_shift($text_stack);
                $span = $this->run_span_gamut($span);
                $span = "<strong><em>$span</em></strong>";
                $text_stack[0] .= $this->hash_part($span);
                $em = '';
                $strong = '';
            } else {
                /* Other closing marker: close one em or strong and change
                   current token state to match the other. Uses the
                   square-bracket string offset; the curly-brace form is a
                   parse error from PHP 8.0. */
                $token_stack[0] = str_repeat($token[0], 3-$token_len);
                $tag = $token_len == 2 ? "strong" : "em";
                $span = $text_stack[0];
                $span = $this->run_span_gamut($span);
                $span = "<$tag>$span</$tag>";
                $text_stack[0] = $this->hash_part($span);
                $$tag = ''; # $$tag stands for $em or $strong
            }
            $tree_char_em = false;
        } else if ($token_len == 3) {
            if ($em) {
                /* Reached closing marker for both em and strong: close
                   the two innermost open markers in turn. */
                for ($i = 0; $i < 2; ++$i) {
                    $shifted_token = array_shift($token_stack);
                    $tag = strlen($shifted_token) == 2 ? "strong" : "em";
                    $span = array_shift($text_stack);
                    $span = $this->run_span_gamut($span);
                    $span = "<$tag>$span</$tag>";
                    $text_stack[0] .= $this->hash_part($span);
                    $$tag = ''; # $$tag stands for $em or $strong
                }
            } else {
                /* Reached opening three-char emphasis marker. Push on token
                   stack; will be handled by the special condition above. */
                $em = $token[0];
                $strong = "$em$em";
                array_unshift($token_stack, $token);
                array_unshift($text_stack, '');
                $tree_char_em = true;
            }
        } else if ($token_len == 2) {
            if ($strong) {
                /* Unwind any dangling emphasis marker: */
                if (strlen($token_stack[0]) == 1) {
                    $text_stack[1] .= array_shift($token_stack);
                    $text_stack[0] .= array_shift($text_stack);
                }
                /* Closing strong marker: */
                array_shift($token_stack);
                $span = array_shift($text_stack);
                $span = $this->run_span_gamut($span);
                $span = "<strong>$span</strong>";
                $text_stack[0] .= $this->hash_part($span);
                $strong = '';
            } else {
                /* Opening strong marker */
                array_unshift($token_stack, $token);
                array_unshift($text_stack, '');
                $strong = $token;
            }
        } else {
            /* Here $token_len == 1 */
            if ($em) {
                if (strlen($token_stack[0]) == 1) {
                    /* Closing emphasis marker: */
                    array_shift($token_stack);
                    $span = array_shift($text_stack);
                    $span = $this->run_span_gamut($span);
                    $span = "<em>$span</em>";
                    $text_stack[0] .= $this->hash_part($span);
                    $em = '';
                } else {
                    /* Innermost open marker is not an em: keep as text */
                    $text_stack[0] .= $token;
                }
            } else {
                /* Opening emphasis marker */
                array_unshift($token_stack, $token);
                array_unshift($text_stack, '');
                $em = $token;
            }
        }
    }
    return $text_stack[0];
}
/**
 * Find blockquote regions in the text and replace each with its rendered
 * form.
 *
 * A region is one or more groups, each made of a line whose first
 * non-space character is ">", any directly following non-empty lines, and
 * the empty lines after them. Every region is handed to
 * _do_block_quotes_callback().
 *
 * Params:
 * $text - string to scan for blockquotes
 *
 * Returns the text with each region replaced by the callback's result.
 */
protected function do_block_quotes($text)
{
    $quote_re = '/
( # Wrap whole match in $1
(?>
^[ ]*>[ ]? # ">" at the start of a line
.+\n # rest of the first line
(.+\n)* # subsequent consecutive lines
\n* # blanks
)+
)
/xm';
    $handler = array($this, '_do_block_quotes_callback');

    return preg_replace_callback($quote_re, $handler, $text);
}
/**
 * Render a single blockquote region matched by do_block_quotes().
 *
 * Params:
 * $matches - regex match array; index 1 holds the raw quoted text
 *
 * Returns the result of hash_block() for the <blockquote> element, with
 * one newline in front and two behind.
 */
protected function _do_block_quotes_callback($matches)
{
    /* Remove one level of ">" markers and empty out space-only lines. */
    $inner = preg_replace('/^[ ]*>[ ]?|^[ ]+$/m', '', $matches[1]);

    /* Recurse: the unquoted text goes through the block-level pass. */
    $inner = $this->run_block_gamut($inner);

    /* Indent every line of the result. */
    $inner = preg_replace('/^/m', " ", $inner);

    /* That indent would show up inside <pre> content, so undo it there. */
    $inner = preg_replace_callback(
        '{(\s*<pre>.+?</pre>)}sx',
        array($this, '_do_block_quotes_callback2'),
        $inner
    );

    $hashed = $this->hash_block("<blockquote>\n$inner\n</blockquote>");

    return "\n" . $hashed . "\n\n";
}
/**
 * Undo the blockquote indent inside a <pre> section.
 *
 * Params:
 * $matches - regex match array; index 1 holds the <pre>...</pre> text
 *
 * Returns that text with one leading space removed from every line that
 * starts with a space.
 */
protected function _do_block_quotes_callback2($matches)
{
    return preg_replace('/^ /m', '', $matches[1]);
}
/**
 * Wrap plain text chunks in <p> tags and swap hashed blocks back in.
 *
 * Params:
 * $text - string to process with html <p> tags
 *
 * Returns the processed chunks joined by a blank line.
 */
protected function form_paragraphs($text)
{
    /* Drop the newlines at the very start and very end. */
    $trimmed = preg_replace('/\A\n+|\n+\z/', '', $text);

    /* Two or more consecutive newlines separate the chunks. */
    $chunks = preg_split('/\n{2,}/', $trimmed, -1, PREG_SPLIT_NO_EMPTY);

    foreach ($chunks as $idx => $chunk) {
        if (preg_match('/^B\x1A[0-9]+B$/', $chunk)) {
            /* The chunk is a block hash key: put the stored value in
               its place. */
            $chunks[$idx] = $this->html_hashes[$chunk];
            continue;
        }

        /* Anything else is a paragraph: run the span pass, replace the
           leading spaces with the opening tag, close it, then unhash. */
        $html = $this->run_span_gamut($chunk);
        $html = preg_replace('/^([ ]*)/', "<p>", $html) . "</p>";
        $chunks[$idx] = $this->unhash($html);
    }

    return implode("\n\n", $chunks);
}
/**
 * Prepare text for use inside a double-quoted HTML attribute.
 *
 * The text first goes through encode_amps_and_angles(), then every double
 * quote becomes &quot;. Single quotes are left as they are, so the result
 * is *not* safe for attributes enclosed in single quotes.
 *
 * Params:
 * $text - raw attribute value
 *
 * Returns the encoded value.
 */
protected function encode_attribute($text)
{
    return str_replace('"', '&quot;', $this->encode_amps_and_angles($text));
}
/**
 * Encode ampersands and opening angle brackets.
 *
 * When the no_entities property is set, every "&" is encoded. Otherwise
 * only an "&" that does not start something shaped like a character
 * entity (named, decimal or hex, ending in ";") is encoded. The
 * ampersand handling is based on Nat Irons's Amputator MT plugin:
 * <http://bumppo.net/projects/amputator/>
 *
 * Params:
 * $text - string to encode
 *
 * Returns the text with "&" handled as described and every "<" turned
 * into &lt;.
 */
protected function encode_amps_and_angles($text)
{
    $text = $this->no_entities
        ? str_replace('&', '&amp;', $text)
        : preg_replace('/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/', '&amp;', $text);

    /* Encode remaining <'s */
    return str_replace('<', '&lt;', $text);
}
/**
 * Convert <url> and <email> autolinks.
 *
 * URLs with an http, https, ftp or dict scheme written between angle
 * brackets are handed to _doAutoLinks_url_callback(). Email addresses
 * between angle brackets, with an optional "mailto:" prefix, are handed
 * to _doAutoLinks_email_callback(). URLs are replaced first.
 *
 * Params:
 * $text - string to scan for autolinks
 *
 * Returns the text with both kinds of autolink replaced.
 */
protected function doAutoLinks($text)
{
    $url_re = '{<((https?|ftp|dict):[^\'">\s]+)>}i';

    /* Email addresses: <address@domain.foo> */
    $email_re = '{
<
(?:mailto:)?
(
(?:
[-!#$%&\'*+/=?^_`.{|}~\w\x80-\xFF]+
|
".*?"
)
\@
(?:
[-a-z0-9\x80-\xFF]+(\.[-a-z0-9\x80-\xFF]+)*\.[a-z]+
|
\[[\d.a-fA-F:]+\] # IPv4 & IPv6
)
)
>
}xi';

    $with_urls = preg_replace_callback(
        $url_re,
        array($this, '_doAutoLinks_url_callback'),
        $text
    );

    return preg_replace_callback(
        $email_re,
        array($this, '_doAutoLinks_email_callback'),
        $with_urls
    );
}
/**
 * Build the anchor for a URL autolink.
 *
 * Params:
 * $matches - regex match array; index 1 holds the URL
 *
 * Returns the result of hash_part() for an <a> element whose href and
 * link text are both the attribute-encoded URL.
 */
protected function _doAutoLinks_url_callback($matches)
{
    $href = $this->encode_attribute($matches[1]);

    return $this->hash_part('<a href="' . $href . '">' . $href . '</a>');
}
/**
 * Build the anchor for an email autolink.
 *
 * Params:
 * $matches - regex match array; index 1 holds the address
 *
 * Returns the result of hash_part() for the link produced by
 * encodeEmailAddress().
 */
protected function _doAutoLinks_email_callback($matches)
{
    return $this->hash_part($this->encodeEmailAddress($matches[1]));
}
/**
 * Turn an email address into an obfuscated mailto link.
 *
 * Input: an email address, e.g. "foo@example.com"
 *
 * Output: an <a> element whose href is "mailto:" plus the address and
 * whose text is the address, with most ASCII characters written as a
 * decimal or hex character entity in the hope of foiling address
 * harvesting spam bots. E.g.:
 *
 * <a href="&#109;&#x61;&#105;&#x6c;&#116;&#x6f;&#58;&#x66;o&#111;
 * &#x40;&#101;&#x78;&#97;&#x6d;&#112;&#x6c;&#101;&#46;&#x63;&#111;
 * &#x6d;">&#x66;o&#111;&#x40;&#101;&#x78;&#97;&#x6d;&#112;&#x6c;
 * &#101;&#46;&#x63;&#111;&#x6d;</a>
 *
 * The choice of encoding per character is derived from a CRC of the
 * address, so the same input always gives the same output.
 *
 * Based on a filter by Matthew Wickline, posted to BBEdit-Talk.
 * With some optimizations by Milian Wolff.
 */
protected function encodeEmailAddress($addr)
{
    $full = "mailto:" . $addr;
    $pieces = preg_split('/(?<!^)(?!$)/', $full);

    /* Deterministic seed. */
    $seed = (int)abs(crc32($full) / strlen($full));

    foreach ($pieces as $pos => $piece) {
        $code = ord($piece);

        /* Ignore non-ascii chars. */
        if ($code >= 128) {
            continue;
        }

        /* Pseudo-random value in 0..99 for this position. */
        $roll = ($seed * (1 + $pos)) % 100;

        /* About 10% of characters stay raw, but '@' is always encoded. */
        if ($roll > 90 && $piece != '@') {
            continue;
        }

        /* The rest is split about evenly between hex and decimal. */
        $pieces[$pos] = ($roll < 45)
            ? '&#x' . dechex($code) . ';'
            : '&#' . $code . ';';
    }

    $href = implode('', $pieces);
    /* The link text is everything after the seven pieces of "mailto:". */
    $label = implode('', array_slice($pieces, 7));

    return "<a href=\"$href\">$label</a>";
}
/**
 * Walk through $str token by token and build the span-level output.
 *
 * A token is a backslash escape, a run of backticks opening a code span,
 * or (unless the no_markup property is set) an HTML comment, processing
 * instruction or tag. Text between tokens is copied through unchanged;
 * each token is replaced by whatever handleSpanToken() returns for it.
 *
 * Params:
 * $str - string to tokenize
 *
 * Returns the rebuilt string.
 */
protected function parseSpan($str)
{
    /* Alternatives matching raw markup; empty when markup is disabled. */
    $markup_re = $this->no_markup ? '' : '
|
<!-- .*? --> # comment
|
<\?.*?\?> | <%.*?%> # processing instruction
|
<[/!$]?[-a-zA-Z0-9:_]+ # regular tags
(?>
\s
(?>[^"\'>]+|"[^"]*"|\'[^\']*\')*
)?
>
';
    $span_re = '{
(
\\\\'.$this->escape_chars_re.'
|
(?<![`\\\\])
`+ # code span marker
'.$markup_re.'
)
}xs';

    $output = '';
    do {
        /* Split at the first token only: text before it, the token
           itself, and everything after it. */
        $pieces = preg_split($span_re, $str, 2, PREG_SPLIT_DELIM_CAPTURE);

        /* Text preceding the token is copied as it is. */
        if ($pieces[0] != "") {
            $output .= $pieces[0];
        }

        $found = isset($pieces[1]);
        if ($found) {
            /* The remainder is passed by reference: the handler may
               consume part of it, e.g. the body of a code span. */
            $output .= $this->handleSpanToken($pieces[1], $pieces[2]);
            $str = $pieces[2];
        }
    } while ($found);

    return $output;
}
/**
 * Handle $token provided by parseSpan by determining its nature and
 * returning the corresponding value that should replace it.
 *
 * Params:
 * $token - the token text; its first character decides the handling
 * $str   - text that follows the token, passed by reference. When a code
 *          span is completed, $str is replaced by what comes after the
 *          closing marker.
 *
 * Returns:
 * - backslash escape: hash_part() of the decimal character entity for the
 *   character after the backslash
 * - backtick run with a matching closing run in $str: hash_part() of
 *   make_code_span() applied to the text in between
 * - backtick run without a closing run: the token itself, unchanged
 * - anything else: hash_part() of the token
 */
protected function handleSpanToken($token, &$str)
{
    /* Square-bracket offsets are used here: the curly-brace form
       ($token{0}) is a parse error from PHP 8.0 on. */
    switch ($token[0]) {
        case "\\":
            return $this->hash_part("&#". ord($token[1]). ";");
        case "`":
            /* Search the remaining text for an end marker: the same run
               of backticks, preceded by a non-backtick character and not
               followed by another backtick. */
            if (preg_match('/^(.*?[^`])'.preg_quote($token).'(?!`)(.*)$/sm',
                $str, $matches))
            {
                $str = $matches[2];
                $codespan = $this->make_code_span($matches[1]);
                return $this->hash_part($codespan);
            }
            return $token; // return as text since no ending marker found.
        default:
            return $this->hash_part($token);
    }
}
/**
 * Remove one level of indentation from the start of every line.
 *
 * One level is either a single tab or between one and tab_width spaces.
 *
 * Params:
 * $text - string to outdent
 *
 * Returns the outdented text.
 */
protected function outdent($text)
{
    $indent_re = '/^(\t|[ ]{1,' . $this->tab_width . '})/m';

    return preg_replace($indent_re, '', $text);
}
/**
 * Replace tabs with the appropriate amount of space.
 *
 * Only lines that contain a tab are touched; each such line is rebuilt
 * by _detab_callback().
 *
 * Params:
 * $text - string that may contain tab characters
 *
 * Returns the text with those lines rebuilt.
 */
protected function detab($text)
{
    return preg_replace_callback(
        '/^.*\t.*$/m',
        array($this, '_detab_callback'),
        $text
    );
}
/**
 * Expand the tabs of one line to spaces.
 *
 * The line is cut at its tab characters and put back together piece by
 * piece; before each piece, enough spaces are added to bring the length
 * of the line so far up to the next multiple of tab_width. Length is
 * measured with the function held in the utf8_strlen property.
 *
 * Params:
 * $matches - regex match array; index 0 holds the whole line
 *
 * Returns the line with its tabs expanded.
 */
protected function _detab_callback($matches)
{
    $count_chars = $this->utf8_strlen; # strlen function for UTF-8.
    $rebuilt = '';

    foreach (explode("\t", $matches[0]) as $i => $segment) {
        if ($i === 0) {
            /* The first piece starts the line; nothing to pad yet. */
            $rebuilt = $segment;
            continue;
        }

        /* Number of spaces needed to reach the next tab stop. */
        $pad = $this->tab_width
            - ($count_chars($rebuilt, 'UTF-8') % $this->tab_width);
        $rebuilt .= str_repeat(" ", $pad) . $segment;
    }

    return $rebuilt;
}
/**
 * Make sure the `utf8_strlen` property holds something callable.
 *
 * If the property already refers to a callable (initially `mb_strlen`),
 * nothing is changed. Otherwise it is replaced with a closure that
 * loosely counts the number of UTF-8 characters with a regular
 * expression: every byte up to 0xBF counts as one character, and a byte
 * from 0xC0 up counts as one character together with the 0x80-0xBF bytes
 * that follow it.
 *
 * The fallback is a closure rather than create_function(), which no
 * longer exists as of PHP 8.0. The guard uses is_callable() instead of
 * function_exists() so that a repeated call, with the closure already in
 * place, is safe.
 */
protected function init_detab()
{
    if (is_callable($this->utf8_strlen)) return;

    $this->utf8_strlen = function ($text) {
        /* preg_match_all() returns the number of full matches. */
        return preg_match_all(
            '/[\x00-\xBF]|[\xC0-\xFF][\x80-\xBF]*/',
            $text, $m);
    };
}
/**
 * Swap the hashed values back into the text.
 *
 * A hash key is any character, the byte 0x1A, one or more digits, and
 * the same character again. Each key found is replaced by
 * _unhash_callback().
 *
 * Params:
 * $text - string that may contain hash keys
 *
 * Returns the text with the keys replaced.
 */
protected function unhash($text)
{
    $restore = array($this, '_unhash_callback');

    return preg_replace_callback('/(.)\x1A[0-9]+\1/', $restore, $text);
}
/**
 * Look up the value stored for one hash key.
 *
 * Params:
 * $matches - regex match array; index 0 holds the complete hash key
 *
 * Returns the html_hashes entry for that key.
 */
protected function _unhash_callback($matches)
{
    $key = $matches[0];

    return $this->html_hashes[$key];
}
}
?>