mirror of
https://github.com/getgrav/grav.git
synced 2025-02-20 19:56:53 +01:00
Revamped the Html Truncator code to address issues with invalid HTML #1019
This commit is contained in:
parent
10825d3f70
commit
a54f30b8ae
|
|
@ -28,7 +28,8 @@
|
|||
"ext-openssl": "*",
|
||||
"ext-curl": "*",
|
||||
"ext-zip": "*",
|
||||
"league/climate": "^3.2"
|
||||
"league/climate": "^3.2",
|
||||
"antoligy/dom-string-iterators": "^1.0"
|
||||
},
|
||||
"require-dev": {
|
||||
"codeception/codeception": "^2.1",
|
||||
|
|
|
|||
48
composer.lock
generated
48
composer.lock
generated
|
|
@ -4,9 +4,53 @@
|
|||
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
|
||||
"This file is @generated automatically"
|
||||
],
|
||||
"hash": "25e59d23a9af7f43dd9cd9d462057abd",
|
||||
"content-hash": "22973a67f2eae64610e739fa82a3d60b",
|
||||
"hash": "7a8caecbaedbf785d96b7437f296ca66",
|
||||
"content-hash": "2fec25b3b5d627c0896d5ee3030b6bed",
|
||||
"packages": [
|
||||
{
|
||||
"name": "antoligy/dom-string-iterators",
|
||||
"version": "v1.0.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/antoligy/dom-string-iterators.git",
|
||||
"reference": "9a624b082493fee9b972840dbd677494edb94cf7"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/antoligy/dom-string-iterators/zipball/9a624b082493fee9b972840dbd677494edb94cf7",
|
||||
"reference": "9a624b082493fee9b972840dbd677494edb94cf7",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
"php": ">=5.3.0"
|
||||
},
|
||||
"type": "library",
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"": "src/"
|
||||
}
|
||||
},
|
||||
"notification-url": "https://packagist.org/downloads/",
|
||||
"license": [
|
||||
"Public Domain"
|
||||
],
|
||||
"authors": [
|
||||
{
|
||||
"name": "Alex Wilson",
|
||||
"email": "a@ax.gy"
|
||||
},
|
||||
{
|
||||
"name": "Kornel Lesinski",
|
||||
"email": "pornel@pornel.net"
|
||||
},
|
||||
{
|
||||
"name": "Patrick Galbraith",
|
||||
"email": "patrick.j.galbraith@gmail.com"
|
||||
}
|
||||
],
|
||||
"description": "Composer package for DOMWordsIterator and DOMLettersIterator",
|
||||
"time": "2015-11-04 17:33:14"
|
||||
},
|
||||
{
|
||||
"name": "doctrine/cache",
|
||||
"version": "v1.6.0",
|
||||
|
|
|
|||
|
|
@ -8,12 +8,16 @@
|
|||
|
||||
namespace Grav\Common\Helpers;
|
||||
|
||||
use DOMText;
|
||||
use DOMDocument;
|
||||
use DOMWordsIterator;
|
||||
use DOMLettersIterator;
|
||||
|
||||
/**
|
||||
* This file is part of urodoz/truncateHTML.
|
||||
* This file is part of https://github.com/Bluetel-Solutions/twig-truncate-extension
|
||||
*
|
||||
* (c) Albert Lacarta <urodoz@gmail.com>
|
||||
* Copyright (c) 2015 Bluetel Solutions developers@bluetel.co.uk
|
||||
* Copyright (c) 2015 Alex Wilson ajw@bluetel.co.uk
|
||||
*
|
||||
* For the full copyright and license information, please view the LICENSE
|
||||
* file that was distributed with this source code.
|
||||
|
|
@ -21,181 +25,188 @@ use DOMDocument;
|
|||
|
||||
class Truncator {
|
||||
|
||||
public static $default_options = array(
|
||||
'ellipsis' => '…',
|
||||
'break' => ' ',
|
||||
'length_in_chars' => false,
|
||||
'word_safe' => false,
|
||||
);
|
||||
/**
|
||||
* Safely truncates HTML by a given number of words.
|
||||
* @param string $html Input HTML.
|
||||
* @param integer $limit Limit to how many words we preserve.
|
||||
* @param string $ellipsis String to use as ellipsis (if any).
|
||||
* @return string Safe truncated HTML.
|
||||
*/
|
||||
public static function truncateWords($html, $limit = 0, $ellipsis = "")
|
||||
{
|
||||
if ($limit <= 0) {
|
||||
return $html;
|
||||
}
|
||||
|
||||
// These tags are allowed to have an ellipsis inside
|
||||
public static $ellipsable_tags = array(
|
||||
'p', 'ol', 'ul', 'li',
|
||||
'div', 'header', 'article', 'nav',
|
||||
'section', 'footer', 'aside',
|
||||
'dd', 'dt', 'dl',
|
||||
);
|
||||
$dom = self::htmlToDomDocument($html);
|
||||
|
||||
public static $self_closing_tags = array(
|
||||
'br', 'hr', 'img',
|
||||
);
|
||||
// Grab the body of our DOM.
|
||||
$body = $dom->getElementsByTagName("body")->item(0);
|
||||
|
||||
// Iterate over words.
|
||||
$words = new DOMWordsIterator($body);
|
||||
foreach ($words as $word) {
|
||||
|
||||
// If we have exceeded the limit, we delete the remainder of the content.
|
||||
if ($words->key() >= $limit) {
|
||||
|
||||
// Grab current position.
|
||||
$currentWordPosition = $words->currentWordPosition();
|
||||
$curNode = $currentWordPosition[0];
|
||||
$offset = $currentWordPosition[1];
|
||||
$words = $currentWordPosition[2];
|
||||
|
||||
$curNode->nodeValue = substr(
|
||||
$curNode->nodeValue,
|
||||
0,
|
||||
$words[$offset][1] + strlen($words[$offset][0])
|
||||
);
|
||||
|
||||
self::removeProceedingNodes($curNode, $body);
|
||||
|
||||
if (!empty($ellipsis)) {
|
||||
self::insertEllipsis($curNode, $ellipsis);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return self::innerHTML($body);
|
||||
}
|
||||
|
||||
/**
|
||||
* Truncate given HTML string to specified length.
|
||||
* If length_in_chars is false it's trimmed by number
|
||||
* of words, otherwise by number of characters.
|
||||
*
|
||||
* @param string $html
|
||||
* @param integer $length
|
||||
* @param string|array $opts
|
||||
* @return string
|
||||
* Safely truncates HTML by a given number of letters.
|
||||
* @param string $html Input HTML.
|
||||
* @param integer $limit Limit to how many letters we preserve.
|
||||
* @param string $ellipsis String to use as ellipsis (if any).
|
||||
* @return string Safe truncated HTML.
|
||||
*/
|
||||
public static function truncate($html, $length, $opts=array())
|
||||
public static function truncateLetters($html, $limit = 0, $ellipsis = "")
|
||||
{
|
||||
if (is_string($opts)) $opts = array('ellipsis' => $opts);
|
||||
$opts = array_merge(static::$default_options, $opts);
|
||||
// wrap the html in case it consists of adjacent nodes like <p>foo</p><p>bar</p>
|
||||
$html = mb_convert_encoding("<div>".$html."</div>", 'HTML-ENTITIES', 'UTF-8');
|
||||
|
||||
$root_node = null;
|
||||
// Parse using HTML5Lib if it's available.
|
||||
if (class_exists('HTML5Lib\\Parser')) {
|
||||
try {
|
||||
$doc = \HTML5Lib\Parser::parse($html);
|
||||
$root_node = $doc->documentElement->lastChild->lastChild;
|
||||
}
|
||||
catch (\Exception $e) {
|
||||
;
|
||||
}
|
||||
if ($limit <= 0) {
|
||||
return $html;
|
||||
}
|
||||
if ($root_node === null) {
|
||||
// HTML5Lib not available so we'll have to use DOMDocument
|
||||
// We'll only be able to parse HTML5 if it's valid XML
|
||||
$doc = new DOMDocument('4.01', 'utf-8');
|
||||
$doc->formatOutput = false;
|
||||
$doc->preserveWhiteSpace = true;
|
||||
// loadHTML will fail with HTML5 tags (article, nav, etc)
|
||||
// so we need to suppress errors and if it fails to parse we
|
||||
// retry with the XML parser instead
|
||||
$prev_use_errors = libxml_use_internal_errors(true);
|
||||
if ($doc->loadHTML($html)) {
|
||||
$root_node = $doc->documentElement->lastChild->lastChild;
|
||||
}
|
||||
else if ($doc->loadXML($html)) {
|
||||
$root_node = $doc->documentElement;
|
||||
}
|
||||
else {
|
||||
libxml_use_internal_errors($prev_use_errors);
|
||||
throw new \RuntimeException;
|
||||
}
|
||||
libxml_use_internal_errors($prev_use_errors);
|
||||
}
|
||||
list($text, $_, $opts) = static::truncateNode($doc, $root_node, $length, $opts);
|
||||
|
||||
$text = mb_substr(mb_substr($text, 0, -6), 5);
|
||||
$dom = self::htmlToDomDocument($html);
|
||||
|
||||
return $text;
|
||||
}
|
||||
// Grab the body of our DOM.
|
||||
$body = $dom->getElementsByTagName("body")->item(0);
|
||||
|
||||
protected static function truncateNode($doc, $node, $length, $opts)
|
||||
{
|
||||
if ($length === 0 && !static::ellipsable($node)) {
|
||||
return array('', 1, $opts);
|
||||
}
|
||||
list($inner, $remaining, $opts) = static::innerTruncate($doc, $node, $length, $opts);
|
||||
if (0 === mb_strlen($inner)) {
|
||||
return array(in_array(mb_strtolower($node->nodeName), static::$self_closing_tags) ? $doc->saveXML($node) : "", $length - $remaining, $opts);
|
||||
}
|
||||
while($node->firstChild) {
|
||||
$node->removeChild($node->firstChild);
|
||||
}
|
||||
$newNode = $doc->createDocumentFragment();
|
||||
// handle the ampersand
|
||||
$newNode->appendXml(static::xmlEscape($inner));
|
||||
$node->appendChild($newNode);
|
||||
return array($doc->saveXML($node), $length - $remaining, $opts);
|
||||
}
|
||||
// Iterate over letters.
|
||||
$letters = new DOMLettersIterator($body);
|
||||
foreach ($letters as $letter) {
|
||||
|
||||
protected static function innerTruncate($doc, $node, $length, $opts)
|
||||
{
|
||||
$inner = '';
|
||||
$remaining = $length;
|
||||
foreach($node->childNodes as $childNode) {
|
||||
if ($childNode->nodeType === XML_ELEMENT_NODE) {
|
||||
list($txt, $nb, $opts) = static::truncateNode($doc, $childNode, $remaining, $opts);
|
||||
}
|
||||
else if ($childNode->nodeType === XML_TEXT_NODE) {
|
||||
list($txt, $nb, $opts) = static::truncateText($childNode, $remaining, $opts);
|
||||
} else {
|
||||
$txt = '';
|
||||
$nb = 0;
|
||||
}
|
||||
// If we have exceeded the limit, we want to delete the remainder of this document.
|
||||
if ($letters->key() >= $limit) {
|
||||
|
||||
// unhandle the ampersand
|
||||
$txt = static::xmlUnescape($txt);
|
||||
$currentText = $letters->currentTextPosition();
|
||||
$currentText[0]->nodeValue = substr($currentText[0]->nodeValue, 0, $currentText[1] + 1);
|
||||
self::removeProceedingNodes($currentText[0], $body);
|
||||
|
||||
$remaining -= $nb;
|
||||
$inner .= $txt;
|
||||
if ($remaining < 0) {
|
||||
if (static::ellipsable($node)) {
|
||||
$inner = preg_replace('/(?:[\s\pP]+|(?:&(?:[a-z]+|#[0-9]+);?))*$/u', '', $inner).$opts['ellipsis'];
|
||||
$opts['ellipsis'] = '';
|
||||
$opts['was_truncated'] = true;
|
||||
if (!empty($ellipsis)) {
|
||||
self::insertEllipsis($currentText[0], $ellipsis);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
return array($inner, $remaining, $opts);
|
||||
|
||||
return self::innerHTML($body);
|
||||
}
|
||||
|
||||
protected static function truncateText($node, $length, $opts)
|
||||
/**
|
||||
* Builds a DOMDocument object from a string containing HTML.
|
||||
* @param string $html HTML to load
|
||||
* @return DOMDocument Returns a DOMDocument object.
|
||||
*/
|
||||
public static function htmlToDomDocument($html)
|
||||
{
|
||||
$string = $node->textContent;
|
||||
// Transform multibyte entities which otherwise display incorrectly.
|
||||
$html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
|
||||
|
||||
if ($opts['length_in_chars']) {
|
||||
$count = mb_strlen($string);
|
||||
if ($count <= $length && $length > 0) {
|
||||
return array($string, $count, $opts);
|
||||
}
|
||||
if ($opts['word_safe']) {
|
||||
if (false !== ($breakpoint = mb_strpos($string, $opts['break'], $length))) {
|
||||
if ($breakpoint < mb_strlen($string) - 1) {
|
||||
$string = mb_substr($string, 0, $breakpoint) . $opts['break'];
|
||||
}
|
||||
// Internal errors enabled as HTML5 not fully supported.
|
||||
libxml_use_internal_errors(true);
|
||||
|
||||
// Instantiate new DOMDocument object, and then load in UTF-8 HTML.
|
||||
$dom = new DOMDocument();
|
||||
$dom->encoding = 'UTF-8';
|
||||
$dom->loadHTML($html);
|
||||
|
||||
return $dom;
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes all nodes after the current node.
|
||||
* @param DOMNode|DOMElement $domNode
|
||||
* @param DOMNode|DOMElement $topNode
|
||||
* @return void
|
||||
*/
|
||||
private static function removeProceedingNodes($domNode, $topNode)
|
||||
{
|
||||
$nextNode = $domNode->nextSibling;
|
||||
|
||||
if ($nextNode !== null) {
|
||||
self::removeProceedingNodes($nextNode, $topNode);
|
||||
$domNode->parentNode->removeChild($nextNode);
|
||||
} else {
|
||||
//scan upwards till we find a sibling
|
||||
$curNode = $domNode->parentNode;
|
||||
while ($curNode !== $topNode) {
|
||||
if ($curNode->nextSibling !== null) {
|
||||
$curNode = $curNode->nextSibling;
|
||||
self::removeProceedingNodes($curNode, $topNode);
|
||||
$curNode->parentNode->removeChild($curNode);
|
||||
break;
|
||||
}
|
||||
return array($string, $count, $opts);
|
||||
$curNode = $curNode->parentNode;
|
||||
}
|
||||
return array(mb_substr($node->textContent, 0, $length), $count, $opts);
|
||||
}
|
||||
else {
|
||||
preg_match_all('/\s*\S+/', $string, $words);
|
||||
$words = $words[0];
|
||||
$count = count($words);
|
||||
if ($count <= $length && $length > 0) {
|
||||
return array($string, $count, $opts);
|
||||
}
|
||||
return array(implode('', array_slice($words, 0, $length)), $count, $opts);
|
||||
}
|
||||
}
|
||||
|
||||
protected static function ellipsable($node)
|
||||
/**
|
||||
* Inserts an ellipsis
|
||||
* @param DOMNode|DOMElement $domNode Element to insert after.
|
||||
* @param string $ellipsis Text used to suffix our document.
|
||||
* @return void
|
||||
*/
|
||||
private static function insertEllipsis($domNode, $ellipsis)
|
||||
{
|
||||
return ($node instanceof DOMDocument)
|
||||
|| in_array(mb_strtolower($node->nodeName), static::$ellipsable_tags)
|
||||
;
|
||||
$avoid = array('a', 'strong', 'em', 'h1', 'h2', 'h3', 'h4', 'h5'); //html tags to avoid appending the ellipsis to
|
||||
|
||||
if (in_array($domNode->parentNode->nodeName, $avoid) && $domNode->parentNode->parentNode !== null) {
|
||||
// Append as text node to parent instead
|
||||
$textNode = new DOMText($ellipsis);
|
||||
|
||||
if ($domNode->parentNode->parentNode->nextSibling) {
|
||||
$domNode->parentNode->parentNode->insertBefore($textNode, $domNode->parentNode->parentNode->nextSibling);
|
||||
} else {
|
||||
$domNode->parentNode->parentNode->appendChild($textNode);
|
||||
}
|
||||
|
||||
} else {
|
||||
// Append to current node
|
||||
$domNode->nodeValue = rtrim($domNode->nodeValue) . $ellipsis;
|
||||
}
|
||||
}
|
||||
|
||||
protected static function xmlEscape($string)
|
||||
{
|
||||
$string = str_replace('&', '&', $string);
|
||||
$string = str_replace('<?', '<?', $string);
|
||||
return $string;
|
||||
/**
|
||||
* Returns the innerHTML of a particular DOMElement
|
||||
*
|
||||
* @param DOMNode|DOMElement $element
|
||||
* @return string
|
||||
*/
|
||||
private static function innerHTML($element) {
|
||||
$innerHTML = "";
|
||||
$children = $element->childNodes;
|
||||
foreach ($children as $child)
|
||||
{
|
||||
$tmp_dom = new DOMDocument();
|
||||
$tmp_dom->appendChild($tmp_dom->importNode($child, true));
|
||||
$innerHTML.=trim($tmp_dom->saveHTML());
|
||||
}
|
||||
return $innerHTML;
|
||||
}
|
||||
|
||||
protected static function xmlUnescape($string)
|
||||
{
|
||||
$string = str_replace('&', '&', $string);
|
||||
$string = str_replace('<?', '<?', $string);
|
||||
return $string;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -502,7 +502,8 @@ class Page
|
|||
$size = 300;
|
||||
}
|
||||
|
||||
return html_entity_decode(Utils::truncateHTML($content, $size));
|
||||
$summary = Utils::truncateHTML($content, $size);
|
||||
return html_entity_decode($summary);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -183,26 +183,28 @@ abstract class Utils
|
|||
* Truncate HTML by number of characters. Not "word-safe"!
|
||||
*
|
||||
* @param string $text
|
||||
* @param int $length
|
||||
* @param int $length in characters
|
||||
* @param string $ellipsis
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
public static function truncateHtml($text, $length = 100)
|
||||
public static function truncateHtml($text, $length = 100, $ellipsis = '...')
|
||||
{
|
||||
return Truncator::truncate($text, $length, ['length_in_chars' => true]);
|
||||
return Truncator::truncateLetters($text, $length, $ellipsis);
|
||||
}
|
||||
|
||||
/**
|
||||
* Truncate HTML by number of characters in a "word-safe" manner.
|
||||
*
|
||||
* @param string $text
|
||||
* @param int $length
|
||||
* @param int $length in words
|
||||
* @param string $ellipsis
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
public static function safeTruncateHtml($text, $length = 100)
|
||||
public static function safeTruncateHtml($text, $length = 25, $ellipsis = '...')
|
||||
{
|
||||
return Truncator::truncate($text, $length, ['length_in_chars' => true, 'word_safe' => true]);
|
||||
return Truncator::truncateWords($text, $length, $ellipsis);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -123,16 +123,22 @@ class UtilsTest extends \Codeception\TestCase\Test
|
|||
|
||||
public function testTruncateHtml()
|
||||
{
|
||||
$this->assertEquals('<p>T…</p>', Utils::truncateHtml('<p>This is a string to truncate</p>', 1));
|
||||
$this->assertEquals('<p>This…</p>', Utils::truncateHtml('<p>This is a string to truncate</p>', 4));
|
||||
$this->assertEquals('', Utils::truncateHtml('<input type="file" id="file" multiple />', 6, true));
|
||||
|
||||
$this->assertEquals('<p>T...</p>', Utils::truncateHtml('<p>This is a string to truncate</p>', 1));
|
||||
$this->assertEquals('<p>This...</p>', Utils::truncateHtml('<p>This is a string to truncate</p>', 4));
|
||||
$this->assertEquals('<p>This is a...</p>', Utils::truncateHtml('<p>This is a string to truncate</p>', 10));
|
||||
$this->assertEquals('<p>This is a string to truncate</p>', Utils::truncateHtml('<p>This is a string to truncate</p>', 100));
|
||||
$this->assertEquals('<input type="file" id="file" multiple>', Utils::truncateHtml('<input type="file" id="file" multiple />', 6));
|
||||
$this->assertEquals('<ol><li>item 1 <i>so...</i></li></ol>', Utils::truncateHtml('<ol><li>item 1 <i>something</i></li><li>item 2 <strong>bold</strong></li></ol>', 10));
|
||||
}
|
||||
|
||||
public function testSafeTruncateHtml()
|
||||
{
|
||||
$this->assertEquals('<p>This…</p>', Utils::safeTruncateHtml('<p>This is a string to truncate</p>', 1));
|
||||
$this->assertEquals('<p>This…</p>', Utils::safeTruncateHtml('<p>This is a string to truncate</p>', 4));
|
||||
$this->assertEquals('<p>This...</p>', Utils::safeTruncateHtml('<p>This is a string to truncate</p>', 1));
|
||||
$this->assertEquals('<p>This is...</p>', Utils::safeTruncateHtml('<p>This is a string to truncate</p>', 2));
|
||||
$this->assertEquals('<p>This is a string to...</p>', Utils::safeTruncateHtml('<p>This is a string to truncate</p>', 5));
|
||||
$this->assertEquals('<p>This is a string to truncate</p>', Utils::safeTruncateHtml('<p>This is a string to truncate</p>', 20));
|
||||
$this->assertEquals('<input type="file" id="file" multiple>', Utils::safeTruncateHtml('<input type="file" id="file" multiple />', 6));
|
||||
$this->assertEquals('<ol><li>item 1 <i>something</i></li><li>item 2...</li></ol>', Utils::safeTruncateHtml('<ol><li>item 1 <i>something</i></li><li>item 2 <strong>bold</strong></li></ol>', 5));
|
||||
}
|
||||
|
||||
public function testGenerateRandomString()
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user